diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,164934 @@ +{ + "best_global_step": 227000, + "best_metric": 0.36603018641471863, + "best_model_checkpoint": "path_to_checkpoint", + "epoch": 1.0, + "eval_steps": 1000, + "global_step": 232926, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 4.293209002000636e-05, + "grad_norm": 7.446129322052002, + "learning_rate": 9e-07, + "loss": 2.1290369033813477, + "step": 10 + }, + { + "epoch": 8.586418004001271e-05, + "grad_norm": 3.0518405437469482, + "learning_rate": 1.9e-06, + "loss": 2.087772560119629, + "step": 20 + }, + { + "epoch": 0.00012879627006001906, + "grad_norm": 7.433437824249268, + "learning_rate": 2.9e-06, + "loss": 2.093190383911133, + "step": 30 + }, + { + "epoch": 0.00017172836008002543, + "grad_norm": 9.070439338684082, + "learning_rate": 3.9e-06, + "loss": 2.0560066223144533, + "step": 40 + }, + { + "epoch": 0.00021466045010003177, + "grad_norm": 6.858762264251709, + "learning_rate": 4.9000000000000005e-06, + "loss": 2.1954065322875977, + "step": 50 + }, + { + "epoch": 0.0002575925401200381, + "grad_norm": 13.182660102844238, + "learning_rate": 5.9e-06, + "loss": 1.8956178665161132, + "step": 60 + }, + { + "epoch": 0.00030052463014004446, + "grad_norm": 2.741999387741089, + "learning_rate": 6.900000000000001e-06, + "loss": 1.8288171768188477, + "step": 70 + }, + { + "epoch": 0.00034345672016005086, + "grad_norm": 9.149059295654297, + "learning_rate": 7.9e-06, + "loss": 1.4354008674621581, + "step": 80 + }, + { + "epoch": 0.0003863888101800572, + "grad_norm": 1.0485624074935913, + "learning_rate": 8.9e-06, + "loss": 0.9927331924438476, + "step": 90 + }, + { + "epoch": 0.00042932090020006354, + "grad_norm": 4.739808082580566, + "learning_rate": 9.900000000000002e-06, + "loss": 0.8089075088500977, + "step": 100 + }, + { + "epoch": 0.0004722529902200699, + "grad_norm": 7.053487777709961, + "learning_rate": 1.09e-05, + "loss": 0.7783641815185547, + "step": 110 + }, + { + "epoch": 0.0005151850802400762, + "grad_norm": 3.860586404800415, + "learning_rate": 1.19e-05, + "loss": 0.5170607089996337, + "step": 120 + }, + { + "epoch": 0.0005581171702600826, + "grad_norm": 4.848254680633545, + "learning_rate": 1.29e-05, + "loss": 0.7371084213256835, + "step": 130 + }, + { + "epoch": 0.0006010492602800889, + "grad_norm": 1.8437591791152954, + "learning_rate": 1.3900000000000002e-05, + "loss": 0.8458614349365234, + "step": 140 + }, + { + "epoch": 0.0006439813503000953, + "grad_norm": 0.7110176682472229, + "learning_rate": 1.49e-05, + "loss": 0.7368865489959717, + "step": 150 + }, + { + "epoch": 0.0006869134403201017, + "grad_norm": 6.293887615203857, + "learning_rate": 1.59e-05, + "loss": 0.46790356636047364, + "step": 160 + }, + { + "epoch": 0.000729845530340108, + "grad_norm": 2.7968358993530273, + "learning_rate": 1.69e-05, + "loss": 0.37965471744537355, + "step": 170 + }, + { + "epoch": 0.0007727776203601144, + "grad_norm": 4.619741916656494, + "learning_rate": 1.79e-05, + "loss": 0.33707737922668457, + "step": 180 + }, + { + "epoch": 0.0008157097103801207, + "grad_norm": 13.043880462646484, + "learning_rate": 1.8900000000000002e-05, + "loss": 0.21272482872009277, + "step": 190 + }, + { + "epoch": 0.0008586418004001271, + "grad_norm": 2.6656389236450195, + "learning_rate": 1.9900000000000003e-05, + "loss": 0.5739723205566406, + "step": 200 + }, + { + "epoch": 0.0009015738904201335, + "grad_norm": 0.2941681146621704, + "learning_rate": 2.09e-05, + "loss": 0.3597684860229492, + "step": 210 + }, + { + "epoch": 0.0009445059804401398, + "grad_norm": 0.010510088875889778, + "learning_rate": 2.19e-05, + "loss": 0.4325720310211182, + "step": 220 + }, + { + "epoch": 0.0009874380704601462, + "grad_norm": 2.2361772060394287, + "learning_rate": 2.29e-05, + "loss": 0.4555656433105469, + "step": 230 + }, + { + "epoch": 0.0010303701604801525, + "grad_norm": 1.4009690284729004, + "learning_rate": 2.39e-05, + "loss": 0.535375452041626, + "step": 240 + }, + { + "epoch": 0.0010733022505001587, + "grad_norm": 3.090742826461792, + "learning_rate": 2.4900000000000002e-05, + "loss": 0.4737332820892334, + "step": 250 + }, + { + "epoch": 0.0011162343405201653, + "grad_norm": 1.8690292835235596, + "learning_rate": 2.5900000000000003e-05, + "loss": 0.3506669521331787, + "step": 260 + }, + { + "epoch": 0.0011591664305401715, + "grad_norm": 2.552485942840576, + "learning_rate": 2.6900000000000003e-05, + "loss": 0.3364131212234497, + "step": 270 + }, + { + "epoch": 0.0012020985205601778, + "grad_norm": 0.12322066724300385, + "learning_rate": 2.7900000000000004e-05, + "loss": 0.5483479976654053, + "step": 280 + }, + { + "epoch": 0.0012450306105801843, + "grad_norm": 0.7322730422019958, + "learning_rate": 2.8899999999999998e-05, + "loss": 0.48665218353271483, + "step": 290 + }, + { + "epoch": 0.0012879627006001906, + "grad_norm": 1.7751740217208862, + "learning_rate": 2.9900000000000002e-05, + "loss": 0.40238499641418457, + "step": 300 + }, + { + "epoch": 0.001330894790620197, + "grad_norm": 0.5181493759155273, + "learning_rate": 3.09e-05, + "loss": 0.35978949069976807, + "step": 310 + }, + { + "epoch": 0.0013738268806402034, + "grad_norm": 1.502162218093872, + "learning_rate": 3.19e-05, + "loss": 0.32965424060821535, + "step": 320 + }, + { + "epoch": 0.0014167589706602097, + "grad_norm": 2.794450283050537, + "learning_rate": 3.29e-05, + "loss": 0.3894787311553955, + "step": 330 + }, + { + "epoch": 0.001459691060680216, + "grad_norm": 2.374486207962036, + "learning_rate": 3.3900000000000004e-05, + "loss": 0.40500693321228026, + "step": 340 + }, + { + "epoch": 0.0015026231507002223, + "grad_norm": 0.6215456128120422, + "learning_rate": 3.49e-05, + "loss": 0.26663985252380373, + "step": 350 + }, + { + "epoch": 0.0015455552407202288, + "grad_norm": 18.548954010009766, + "learning_rate": 3.59e-05, + "loss": 0.4955165386199951, + "step": 360 + }, + { + "epoch": 0.001588487330740235, + "grad_norm": 0.000909608555957675, + "learning_rate": 3.69e-05, + "loss": 0.45831050872802737, + "step": 370 + }, + { + "epoch": 0.0016314194207602414, + "grad_norm": 0.026904495432972908, + "learning_rate": 3.79e-05, + "loss": 0.43085713386535646, + "step": 380 + }, + { + "epoch": 0.0016743515107802479, + "grad_norm": 0.03139101713895798, + "learning_rate": 3.8900000000000004e-05, + "loss": 0.2645646810531616, + "step": 390 + }, + { + "epoch": 0.0017172836008002542, + "grad_norm": 2.005051374435425, + "learning_rate": 3.99e-05, + "loss": 0.4847880363464355, + "step": 400 + }, + { + "epoch": 0.0017602156908202605, + "grad_norm": 2.000047445297241, + "learning_rate": 4.09e-05, + "loss": 0.4927666664123535, + "step": 410 + }, + { + "epoch": 0.001803147780840267, + "grad_norm": 2.906313896179199, + "learning_rate": 4.19e-05, + "loss": 0.4311686038970947, + "step": 420 + }, + { + "epoch": 0.0018460798708602733, + "grad_norm": 0.0084048667922616, + "learning_rate": 4.29e-05, + "loss": 0.3550480127334595, + "step": 430 + }, + { + "epoch": 0.0018890119608802795, + "grad_norm": 0.01702202670276165, + "learning_rate": 4.39e-05, + "loss": 0.20395941734313966, + "step": 440 + }, + { + "epoch": 0.0019319440509002858, + "grad_norm": 2.0575366020202637, + "learning_rate": 4.49e-05, + "loss": 0.44746909141540525, + "step": 450 + }, + { + "epoch": 0.0019748761409202923, + "grad_norm": 12.149165153503418, + "learning_rate": 4.5900000000000004e-05, + "loss": 0.3470444917678833, + "step": 460 + }, + { + "epoch": 0.002017808230940299, + "grad_norm": 6.969229698181152, + "learning_rate": 4.69e-05, + "loss": 0.15598635673522948, + "step": 470 + }, + { + "epoch": 0.002060740320960305, + "grad_norm": 0.15074250102043152, + "learning_rate": 4.79e-05, + "loss": 0.25433228015899656, + "step": 480 + }, + { + "epoch": 0.0021036724109803114, + "grad_norm": 0.2592933773994446, + "learning_rate": 4.89e-05, + "loss": 0.30282485485076904, + "step": 490 + }, + { + "epoch": 0.0021466045010003175, + "grad_norm": 8.743586540222168, + "learning_rate": 4.99e-05, + "loss": 0.31165275573730467, + "step": 500 + }, + { + "epoch": 0.002189536591020324, + "grad_norm": 0.11677571386098862, + "learning_rate": 5.0900000000000004e-05, + "loss": 0.5364396095275878, + "step": 510 + }, + { + "epoch": 0.0022324686810403305, + "grad_norm": 0.05715586617588997, + "learning_rate": 5.19e-05, + "loss": 0.3230873107910156, + "step": 520 + }, + { + "epoch": 0.0022754007710603366, + "grad_norm": 2.341602325439453, + "learning_rate": 5.2900000000000005e-05, + "loss": 0.3851492166519165, + "step": 530 + }, + { + "epoch": 0.002318332861080343, + "grad_norm": 8.533411979675293, + "learning_rate": 5.390000000000001e-05, + "loss": 0.8554889678955078, + "step": 540 + }, + { + "epoch": 0.0023612649511003496, + "grad_norm": 0.472126305103302, + "learning_rate": 5.4900000000000006e-05, + "loss": 0.24252398014068605, + "step": 550 + }, + { + "epoch": 0.0024041970411203557, + "grad_norm": 6.999728679656982, + "learning_rate": 5.590000000000001e-05, + "loss": 0.3368447065353394, + "step": 560 + }, + { + "epoch": 0.002447129131140362, + "grad_norm": 0.036653582006692886, + "learning_rate": 5.69e-05, + "loss": 0.2939608573913574, + "step": 570 + }, + { + "epoch": 0.0024900612211603687, + "grad_norm": 0.19750595092773438, + "learning_rate": 5.79e-05, + "loss": 0.2020362138748169, + "step": 580 + }, + { + "epoch": 0.0025329933111803747, + "grad_norm": 1.7139503955841064, + "learning_rate": 5.89e-05, + "loss": 0.2586077690124512, + "step": 590 + }, + { + "epoch": 0.0025759254012003813, + "grad_norm": 0.024881673976778984, + "learning_rate": 5.99e-05, + "loss": 0.1620743155479431, + "step": 600 + }, + { + "epoch": 0.0026188574912203878, + "grad_norm": 1.9613510370254517, + "learning_rate": 6.09e-05, + "loss": 0.4541365623474121, + "step": 610 + }, + { + "epoch": 0.002661789581240394, + "grad_norm": 0.010247381404042244, + "learning_rate": 6.19e-05, + "loss": 0.3532832384109497, + "step": 620 + }, + { + "epoch": 0.0027047216712604003, + "grad_norm": 1.9921550750732422, + "learning_rate": 6.29e-05, + "loss": 0.48864259719848635, + "step": 630 + }, + { + "epoch": 0.002747653761280407, + "grad_norm": 0.03684716299176216, + "learning_rate": 6.390000000000001e-05, + "loss": 0.2768329858779907, + "step": 640 + }, + { + "epoch": 0.002790585851300413, + "grad_norm": 0.04419621825218201, + "learning_rate": 6.49e-05, + "loss": 0.29871292114257814, + "step": 650 + }, + { + "epoch": 0.0028335179413204194, + "grad_norm": 0.4213247299194336, + "learning_rate": 6.59e-05, + "loss": 0.32069456577301025, + "step": 660 + }, + { + "epoch": 0.002876450031340426, + "grad_norm": 1.8658652305603027, + "learning_rate": 6.690000000000001e-05, + "loss": 0.49007110595703124, + "step": 670 + }, + { + "epoch": 0.002919382121360432, + "grad_norm": 0.02149110846221447, + "learning_rate": 6.790000000000001e-05, + "loss": 0.5062370300292969, + "step": 680 + }, + { + "epoch": 0.0029623142113804385, + "grad_norm": 1.4124505519866943, + "learning_rate": 6.89e-05, + "loss": 0.38771054744720457, + "step": 690 + }, + { + "epoch": 0.0030052463014004446, + "grad_norm": 0.020706655457615852, + "learning_rate": 6.99e-05, + "loss": 0.25068256855010984, + "step": 700 + }, + { + "epoch": 0.003048178391420451, + "grad_norm": 2.2183375358581543, + "learning_rate": 7.09e-05, + "loss": 0.49508442878723147, + "step": 710 + }, + { + "epoch": 0.0030911104814404576, + "grad_norm": 6.63649320602417, + "learning_rate": 7.19e-05, + "loss": 0.478483772277832, + "step": 720 + }, + { + "epoch": 0.0031340425714604637, + "grad_norm": 1.0420591831207275, + "learning_rate": 7.29e-05, + "loss": 0.497357177734375, + "step": 730 + }, + { + "epoch": 0.00317697466148047, + "grad_norm": 3.652012348175049, + "learning_rate": 7.390000000000001e-05, + "loss": 0.2723304033279419, + "step": 740 + }, + { + "epoch": 0.0032199067515004767, + "grad_norm": 2.3289570808410645, + "learning_rate": 7.49e-05, + "loss": 0.6125946521759034, + "step": 750 + }, + { + "epoch": 0.0032628388415204827, + "grad_norm": 1.3651496171951294, + "learning_rate": 7.59e-05, + "loss": 0.3251293659210205, + "step": 760 + }, + { + "epoch": 0.0033057709315404893, + "grad_norm": 1.72046959400177, + "learning_rate": 7.69e-05, + "loss": 0.32114553451538086, + "step": 770 + }, + { + "epoch": 0.0033487030215604958, + "grad_norm": 1.1918679475784302, + "learning_rate": 7.790000000000001e-05, + "loss": 0.41199021339416503, + "step": 780 + }, + { + "epoch": 0.003391635111580502, + "grad_norm": 0.03947390615940094, + "learning_rate": 7.890000000000001e-05, + "loss": 0.03135415315628052, + "step": 790 + }, + { + "epoch": 0.0034345672016005083, + "grad_norm": 1.5788650512695312, + "learning_rate": 7.99e-05, + "loss": 0.5670211315155029, + "step": 800 + }, + { + "epoch": 0.003477499291620515, + "grad_norm": 0.019160764291882515, + "learning_rate": 8.090000000000001e-05, + "loss": 0.2667243242263794, + "step": 810 + }, + { + "epoch": 0.003520431381640521, + "grad_norm": 5.5093231201171875, + "learning_rate": 8.19e-05, + "loss": 0.4863880157470703, + "step": 820 + }, + { + "epoch": 0.0035633634716605274, + "grad_norm": 1.7203266620635986, + "learning_rate": 8.29e-05, + "loss": 0.23645930290222167, + "step": 830 + }, + { + "epoch": 0.003606295561680534, + "grad_norm": 1.8550962209701538, + "learning_rate": 8.39e-05, + "loss": 0.32827005386352537, + "step": 840 + }, + { + "epoch": 0.00364922765170054, + "grad_norm": 0.002791638718917966, + "learning_rate": 8.49e-05, + "loss": 0.23109066486358643, + "step": 850 + }, + { + "epoch": 0.0036921597417205465, + "grad_norm": 0.0013374650152400136, + "learning_rate": 8.59e-05, + "loss": 0.310694146156311, + "step": 860 + }, + { + "epoch": 0.0037350918317405526, + "grad_norm": 2.3354532718658447, + "learning_rate": 8.69e-05, + "loss": 0.48372364044189453, + "step": 870 + }, + { + "epoch": 0.003778023921760559, + "grad_norm": 5.989701747894287, + "learning_rate": 8.790000000000001e-05, + "loss": 0.3159054756164551, + "step": 880 + }, + { + "epoch": 0.0038209560117805656, + "grad_norm": 1.735064148902893, + "learning_rate": 8.89e-05, + "loss": 0.34026777744293213, + "step": 890 + }, + { + "epoch": 0.0038638881018005717, + "grad_norm": 0.031852539628744125, + "learning_rate": 8.99e-05, + "loss": 0.08108786344528199, + "step": 900 + }, + { + "epoch": 0.003906820191820579, + "grad_norm": 0.03357387334108353, + "learning_rate": 9.090000000000001e-05, + "loss": 0.46551074981689455, + "step": 910 + }, + { + "epoch": 0.003949752281840585, + "grad_norm": 0.010642527602612972, + "learning_rate": 9.190000000000001e-05, + "loss": 0.4507905006408691, + "step": 920 + }, + { + "epoch": 0.003992684371860591, + "grad_norm": 0.9609713554382324, + "learning_rate": 9.290000000000001e-05, + "loss": 0.202797269821167, + "step": 930 + }, + { + "epoch": 0.004035616461880598, + "grad_norm": 2.6735310554504395, + "learning_rate": 9.39e-05, + "loss": 0.33949248790740966, + "step": 940 + }, + { + "epoch": 0.004078548551900604, + "grad_norm": 0.14202536642551422, + "learning_rate": 9.49e-05, + "loss": 0.4102811813354492, + "step": 950 + }, + { + "epoch": 0.00412148064192061, + "grad_norm": 0.8708382248878479, + "learning_rate": 9.59e-05, + "loss": 0.40193929672241213, + "step": 960 + }, + { + "epoch": 0.004164412731940616, + "grad_norm": 0.4100968539714813, + "learning_rate": 9.69e-05, + "loss": 0.6347273826599121, + "step": 970 + }, + { + "epoch": 0.004207344821960623, + "grad_norm": 4.046662330627441, + "learning_rate": 9.790000000000001e-05, + "loss": 0.3081040859222412, + "step": 980 + }, + { + "epoch": 0.004250276911980629, + "grad_norm": 0.8989287614822388, + "learning_rate": 9.89e-05, + "loss": 0.5247183799743652, + "step": 990 + }, + { + "epoch": 0.004293209002000635, + "grad_norm": 4.966592788696289, + "learning_rate": 9.99e-05, + "loss": 0.3147383213043213, + "step": 1000 + }, + { + "epoch": 0.004293209002000635, + "eval_loss": 0.6870803236961365, + "eval_runtime": 27.4913, + "eval_samples_per_second": 3.638, + "eval_steps_per_second": 3.638, + "step": 1000 + }, + { + "epoch": 0.004336141092020642, + "grad_norm": 0.0117591992020607, + "learning_rate": 9.999611945189415e-05, + "loss": 0.2696777582168579, + "step": 1010 + }, + { + "epoch": 0.004379073182040648, + "grad_norm": 3.5806615352630615, + "learning_rate": 9.999180773177652e-05, + "loss": 0.34646382331848147, + "step": 1020 + }, + { + "epoch": 0.004422005272060654, + "grad_norm": 0.02200966142117977, + "learning_rate": 9.99874960116589e-05, + "loss": 0.16257576942443847, + "step": 1030 + }, + { + "epoch": 0.004464937362080661, + "grad_norm": 0.028981667011976242, + "learning_rate": 9.998318429154128e-05, + "loss": 0.3506030559539795, + "step": 1040 + }, + { + "epoch": 0.004507869452100667, + "grad_norm": 0.016286378726363182, + "learning_rate": 9.997887257142366e-05, + "loss": 0.34904468059539795, + "step": 1050 + }, + { + "epoch": 0.004550801542120673, + "grad_norm": 0.004303790628910065, + "learning_rate": 9.997456085130602e-05, + "loss": 0.37080433368682864, + "step": 1060 + }, + { + "epoch": 0.00459373363214068, + "grad_norm": 1.1950221061706543, + "learning_rate": 9.99702491311884e-05, + "loss": 0.24393827915191652, + "step": 1070 + }, + { + "epoch": 0.004636665722160686, + "grad_norm": 0.008893678896129131, + "learning_rate": 9.996593741107077e-05, + "loss": 0.5550142288208008, + "step": 1080 + }, + { + "epoch": 0.004679597812180692, + "grad_norm": 3.304513454437256, + "learning_rate": 9.996162569095315e-05, + "loss": 0.2666121244430542, + "step": 1090 + }, + { + "epoch": 0.004722529902200699, + "grad_norm": 0.027765685692429543, + "learning_rate": 9.995731397083553e-05, + "loss": 0.16644638776779175, + "step": 1100 + }, + { + "epoch": 0.004765461992220705, + "grad_norm": 0.42218098044395447, + "learning_rate": 9.99530022507179e-05, + "loss": 0.3481473922729492, + "step": 1110 + }, + { + "epoch": 0.004808394082240711, + "grad_norm": 1.156235694885254, + "learning_rate": 9.994869053060028e-05, + "loss": 0.372220778465271, + "step": 1120 + }, + { + "epoch": 0.004851326172260718, + "grad_norm": 0.052121300250291824, + "learning_rate": 9.994437881048266e-05, + "loss": 0.3564959764480591, + "step": 1130 + }, + { + "epoch": 0.004894258262280724, + "grad_norm": 0.002628577407449484, + "learning_rate": 9.994006709036502e-05, + "loss": 0.3152308464050293, + "step": 1140 + }, + { + "epoch": 0.00493719035230073, + "grad_norm": 2.0352439880371094, + "learning_rate": 9.99357553702474e-05, + "loss": 0.3882049560546875, + "step": 1150 + }, + { + "epoch": 0.004980122442320737, + "grad_norm": 7.649345397949219, + "learning_rate": 9.993144365012979e-05, + "loss": 0.40246758460998533, + "step": 1160 + }, + { + "epoch": 0.005023054532340743, + "grad_norm": 2.1263649463653564, + "learning_rate": 9.992713193001217e-05, + "loss": 0.14638826847076417, + "step": 1170 + }, + { + "epoch": 0.0050659866223607495, + "grad_norm": 0.10847385972738266, + "learning_rate": 9.992282020989455e-05, + "loss": 0.3140259265899658, + "step": 1180 + }, + { + "epoch": 0.0051089187123807564, + "grad_norm": 0.08196664601564407, + "learning_rate": 9.991850848977692e-05, + "loss": 0.17124234437942504, + "step": 1190 + }, + { + "epoch": 0.0051518508024007625, + "grad_norm": 1.618770718574524, + "learning_rate": 9.99141967696593e-05, + "loss": 0.5490658283233643, + "step": 1200 + }, + { + "epoch": 0.005194782892420769, + "grad_norm": 0.06352327018976212, + "learning_rate": 9.990988504954168e-05, + "loss": 0.4400003910064697, + "step": 1210 + }, + { + "epoch": 0.0052377149824407755, + "grad_norm": 0.04779497906565666, + "learning_rate": 9.990557332942404e-05, + "loss": 0.19397462606430055, + "step": 1220 + }, + { + "epoch": 0.005280647072460782, + "grad_norm": 0.008033322170376778, + "learning_rate": 9.990126160930642e-05, + "loss": 0.5248307228088379, + "step": 1230 + }, + { + "epoch": 0.005323579162480788, + "grad_norm": 1.4066425561904907, + "learning_rate": 9.98969498891888e-05, + "loss": 0.4247109889984131, + "step": 1240 + }, + { + "epoch": 0.005366511252500795, + "grad_norm": 7.272195816040039, + "learning_rate": 9.989263816907117e-05, + "loss": 0.5679349899291992, + "step": 1250 + }, + { + "epoch": 0.005409443342520801, + "grad_norm": 0.02575252577662468, + "learning_rate": 9.988832644895355e-05, + "loss": 0.5577791213989258, + "step": 1260 + }, + { + "epoch": 0.005452375432540807, + "grad_norm": 2.0221657752990723, + "learning_rate": 9.988401472883593e-05, + "loss": 0.21536378860473632, + "step": 1270 + }, + { + "epoch": 0.005495307522560814, + "grad_norm": 2.7715706825256348, + "learning_rate": 9.987970300871831e-05, + "loss": 0.5313220024108887, + "step": 1280 + }, + { + "epoch": 0.00553823961258082, + "grad_norm": 0.05551741644740105, + "learning_rate": 9.987539128860068e-05, + "loss": 0.3612039089202881, + "step": 1290 + }, + { + "epoch": 0.005581171702600826, + "grad_norm": 0.2868395149707794, + "learning_rate": 9.987107956848305e-05, + "loss": 0.1773497700691223, + "step": 1300 + }, + { + "epoch": 0.005624103792620833, + "grad_norm": 0.028904501348733902, + "learning_rate": 9.986676784836543e-05, + "loss": 0.37716073989868165, + "step": 1310 + }, + { + "epoch": 0.005667035882640839, + "grad_norm": 5.851991176605225, + "learning_rate": 9.98624561282478e-05, + "loss": 0.28295397758483887, + "step": 1320 + }, + { + "epoch": 0.005709967972660845, + "grad_norm": 0.9384163618087769, + "learning_rate": 9.985814440813018e-05, + "loss": 0.6173577785491944, + "step": 1330 + }, + { + "epoch": 0.005752900062680852, + "grad_norm": 1.63738214969635, + "learning_rate": 9.985383268801256e-05, + "loss": 0.02494920641183853, + "step": 1340 + }, + { + "epoch": 0.005795832152700858, + "grad_norm": 0.03405584394931793, + "learning_rate": 9.984952096789493e-05, + "loss": 0.23200345039367676, + "step": 1350 + }, + { + "epoch": 0.005838764242720864, + "grad_norm": 10.848572731018066, + "learning_rate": 9.984520924777731e-05, + "loss": 0.3212714672088623, + "step": 1360 + }, + { + "epoch": 0.00588169633274087, + "grad_norm": 8.821533203125, + "learning_rate": 9.984089752765969e-05, + "loss": 0.2774125814437866, + "step": 1370 + }, + { + "epoch": 0.005924628422760877, + "grad_norm": 5.936229228973389, + "learning_rate": 9.983658580754207e-05, + "loss": 0.3172731399536133, + "step": 1380 + }, + { + "epoch": 0.005967560512780883, + "grad_norm": 0.028739016503095627, + "learning_rate": 9.983227408742444e-05, + "loss": 0.19269169569015504, + "step": 1390 + }, + { + "epoch": 0.006010492602800889, + "grad_norm": 0.06869284063577652, + "learning_rate": 9.982796236730682e-05, + "loss": 0.24538626670837402, + "step": 1400 + }, + { + "epoch": 0.006053424692820896, + "grad_norm": 0.08283459395170212, + "learning_rate": 9.98236506471892e-05, + "loss": 0.30588588714599607, + "step": 1410 + }, + { + "epoch": 0.006096356782840902, + "grad_norm": 0.07285956293344498, + "learning_rate": 9.981933892707158e-05, + "loss": 0.4131472587585449, + "step": 1420 + }, + { + "epoch": 0.006139288872860908, + "grad_norm": 0.0415986068546772, + "learning_rate": 9.981502720695395e-05, + "loss": 0.2583375692367554, + "step": 1430 + }, + { + "epoch": 0.006182220962880915, + "grad_norm": 0.42376863956451416, + "learning_rate": 9.981071548683633e-05, + "loss": 0.17857149839401246, + "step": 1440 + }, + { + "epoch": 0.006225153052900921, + "grad_norm": 0.17374563217163086, + "learning_rate": 9.980640376671871e-05, + "loss": 0.6782264709472656, + "step": 1450 + }, + { + "epoch": 0.006268085142920927, + "grad_norm": 0.39217206835746765, + "learning_rate": 9.980209204660109e-05, + "loss": 0.3628535270690918, + "step": 1460 + }, + { + "epoch": 0.006311017232940934, + "grad_norm": 0.2118167132139206, + "learning_rate": 9.979778032648345e-05, + "loss": 0.14380197525024413, + "step": 1470 + }, + { + "epoch": 0.00635394932296094, + "grad_norm": 2.725713014602661, + "learning_rate": 9.979346860636583e-05, + "loss": 0.2942746639251709, + "step": 1480 + }, + { + "epoch": 0.006396881412980946, + "grad_norm": 0.11453196406364441, + "learning_rate": 9.97891568862482e-05, + "loss": 0.28425648212432864, + "step": 1490 + }, + { + "epoch": 0.006439813503000953, + "grad_norm": 2.679959535598755, + "learning_rate": 9.978484516613058e-05, + "loss": 0.23248529434204102, + "step": 1500 + }, + { + "epoch": 0.006482745593020959, + "grad_norm": 3.3989484310150146, + "learning_rate": 9.978053344601296e-05, + "loss": 0.6053690433502197, + "step": 1510 + }, + { + "epoch": 0.0065256776830409655, + "grad_norm": 3.2708628177642822, + "learning_rate": 9.977622172589534e-05, + "loss": 0.08650413751602173, + "step": 1520 + }, + { + "epoch": 0.0065686097730609724, + "grad_norm": 2.154971122741699, + "learning_rate": 9.977191000577771e-05, + "loss": 0.49035396575927737, + "step": 1530 + }, + { + "epoch": 0.0066115418630809785, + "grad_norm": 0.013774173334240913, + "learning_rate": 9.976759828566009e-05, + "loss": 0.4272459983825684, + "step": 1540 + }, + { + "epoch": 0.006654473953100985, + "grad_norm": 1.9301711320877075, + "learning_rate": 9.976328656554245e-05, + "loss": 0.2707101821899414, + "step": 1550 + }, + { + "epoch": 0.0066974060431209915, + "grad_norm": 1.391924500465393, + "learning_rate": 9.975897484542483e-05, + "loss": 0.4632604122161865, + "step": 1560 + }, + { + "epoch": 0.006740338133140998, + "grad_norm": 4.163330078125, + "learning_rate": 9.975466312530721e-05, + "loss": 0.26462554931640625, + "step": 1570 + }, + { + "epoch": 0.006783270223161004, + "grad_norm": 0.18783286213874817, + "learning_rate": 9.975035140518959e-05, + "loss": 0.3839559078216553, + "step": 1580 + }, + { + "epoch": 0.006826202313181011, + "grad_norm": 0.347711980342865, + "learning_rate": 9.974603968507196e-05, + "loss": 0.19243018627166747, + "step": 1590 + }, + { + "epoch": 0.006869134403201017, + "grad_norm": 5.590195655822754, + "learning_rate": 9.974172796495434e-05, + "loss": 0.5789153099060058, + "step": 1600 + }, + { + "epoch": 0.006912066493221023, + "grad_norm": 1.6785224676132202, + "learning_rate": 9.973741624483672e-05, + "loss": 0.30777108669281006, + "step": 1610 + }, + { + "epoch": 0.00695499858324103, + "grad_norm": 3.9783945083618164, + "learning_rate": 9.97331045247191e-05, + "loss": 0.38220555782318116, + "step": 1620 + }, + { + "epoch": 0.006997930673261036, + "grad_norm": 3.0387625694274902, + "learning_rate": 9.972879280460147e-05, + "loss": 0.22734384536743163, + "step": 1630 + }, + { + "epoch": 0.007040862763281042, + "grad_norm": 5.566250801086426, + "learning_rate": 9.972448108448385e-05, + "loss": 0.49216341972351074, + "step": 1640 + }, + { + "epoch": 0.007083794853301049, + "grad_norm": 0.014136805199086666, + "learning_rate": 9.972016936436623e-05, + "loss": 0.3727003812789917, + "step": 1650 + }, + { + "epoch": 0.007126726943321055, + "grad_norm": 2.337724447250366, + "learning_rate": 9.97158576442486e-05, + "loss": 0.18289399147033691, + "step": 1660 + }, + { + "epoch": 0.007169659033341061, + "grad_norm": 0.00662636524066329, + "learning_rate": 9.971154592413098e-05, + "loss": 0.19337474107742308, + "step": 1670 + }, + { + "epoch": 0.007212591123361068, + "grad_norm": 0.12630727887153625, + "learning_rate": 9.970723420401336e-05, + "loss": 0.3763300657272339, + "step": 1680 + }, + { + "epoch": 0.007255523213381074, + "grad_norm": 0.0527946762740612, + "learning_rate": 9.970292248389574e-05, + "loss": 0.2878370761871338, + "step": 1690 + }, + { + "epoch": 0.00729845530340108, + "grad_norm": 2.522291421890259, + "learning_rate": 9.969861076377811e-05, + "loss": 0.29763169288635255, + "step": 1700 + }, + { + "epoch": 0.007341387393421087, + "grad_norm": 0.09618072956800461, + "learning_rate": 9.969429904366048e-05, + "loss": 0.27122509479522705, + "step": 1710 + }, + { + "epoch": 0.007384319483441093, + "grad_norm": 0.042337898164987564, + "learning_rate": 9.968998732354286e-05, + "loss": 0.1773484468460083, + "step": 1720 + }, + { + "epoch": 0.007427251573461099, + "grad_norm": 0.012770925648510456, + "learning_rate": 9.968567560342523e-05, + "loss": 0.25236375331878663, + "step": 1730 + }, + { + "epoch": 0.007470183663481105, + "grad_norm": 1.6886173486709595, + "learning_rate": 9.968136388330761e-05, + "loss": 0.3894350051879883, + "step": 1740 + }, + { + "epoch": 0.007513115753501112, + "grad_norm": 2.3420915603637695, + "learning_rate": 9.967705216318999e-05, + "loss": 0.21277191638946533, + "step": 1750 + }, + { + "epoch": 0.007556047843521118, + "grad_norm": 1.1795216798782349, + "learning_rate": 9.967274044307237e-05, + "loss": 0.3945819139480591, + "step": 1760 + }, + { + "epoch": 0.007598979933541124, + "grad_norm": 0.27975350618362427, + "learning_rate": 9.966842872295474e-05, + "loss": 0.29100914001464845, + "step": 1770 + }, + { + "epoch": 0.007641912023561131, + "grad_norm": 1.951181173324585, + "learning_rate": 9.966411700283712e-05, + "loss": 0.23559587001800536, + "step": 1780 + }, + { + "epoch": 0.007684844113581137, + "grad_norm": 1.1079846620559692, + "learning_rate": 9.96598052827195e-05, + "loss": 0.3525418758392334, + "step": 1790 + }, + { + "epoch": 0.007727776203601143, + "grad_norm": 0.04059452936053276, + "learning_rate": 9.965549356260186e-05, + "loss": 0.16814641952514647, + "step": 1800 + }, + { + "epoch": 0.00777070829362115, + "grad_norm": 0.5009684562683105, + "learning_rate": 9.965118184248424e-05, + "loss": 0.5324047088623047, + "step": 1810 + }, + { + "epoch": 0.007813640383641157, + "grad_norm": 1.176175832748413, + "learning_rate": 9.964687012236662e-05, + "loss": 0.38814170360565187, + "step": 1820 + }, + { + "epoch": 0.007856572473661163, + "grad_norm": 0.12649580836296082, + "learning_rate": 9.964255840224899e-05, + "loss": 0.34246914386749266, + "step": 1830 + }, + { + "epoch": 0.00789950456368117, + "grad_norm": 4.079379081726074, + "learning_rate": 9.963824668213137e-05, + "loss": 0.4702861785888672, + "step": 1840 + }, + { + "epoch": 0.007942436653701175, + "grad_norm": 0.17662937939167023, + "learning_rate": 9.963393496201375e-05, + "loss": 0.14358367919921874, + "step": 1850 + }, + { + "epoch": 0.007985368743721181, + "grad_norm": 0.11334045231342316, + "learning_rate": 9.962962324189613e-05, + "loss": 0.011836948990821838, + "step": 1860 + }, + { + "epoch": 0.008028300833741188, + "grad_norm": 1.63093101978302, + "learning_rate": 9.96253115217785e-05, + "loss": 0.3222597599029541, + "step": 1870 + }, + { + "epoch": 0.008071232923761195, + "grad_norm": 0.038519952446222305, + "learning_rate": 9.962099980166088e-05, + "loss": 0.21120266914367675, + "step": 1880 + }, + { + "epoch": 0.008114165013781201, + "grad_norm": 0.0032395331654697657, + "learning_rate": 9.961668808154326e-05, + "loss": 0.11140183210372925, + "step": 1890 + }, + { + "epoch": 0.008157097103801208, + "grad_norm": 0.12331575155258179, + "learning_rate": 9.961237636142563e-05, + "loss": 0.4628122329711914, + "step": 1900 + }, + { + "epoch": 0.008200029193821214, + "grad_norm": 0.2696305215358734, + "learning_rate": 9.960806464130801e-05, + "loss": 0.2922004461288452, + "step": 1910 + }, + { + "epoch": 0.00824296128384122, + "grad_norm": 0.04623638093471527, + "learning_rate": 9.960375292119039e-05, + "loss": 0.3664160013198853, + "step": 1920 + }, + { + "epoch": 0.008285893373861226, + "grad_norm": 1.505217432975769, + "learning_rate": 9.959944120107277e-05, + "loss": 0.32481415271759034, + "step": 1930 + }, + { + "epoch": 0.008328825463881232, + "grad_norm": 6.866296291351318, + "learning_rate": 9.959512948095514e-05, + "loss": 0.32975733280181885, + "step": 1940 + }, + { + "epoch": 0.00837175755390124, + "grad_norm": 0.13486604392528534, + "learning_rate": 9.959081776083752e-05, + "loss": 0.30583069324493406, + "step": 1950 + }, + { + "epoch": 0.008414689643921246, + "grad_norm": 1.8435735702514648, + "learning_rate": 9.958650604071988e-05, + "loss": 0.22341222763061525, + "step": 1960 + }, + { + "epoch": 0.008457621733941252, + "grad_norm": 6.405146598815918, + "learning_rate": 9.958219432060226e-05, + "loss": 0.24852497577667237, + "step": 1970 + }, + { + "epoch": 0.008500553823961258, + "grad_norm": 2.324030876159668, + "learning_rate": 9.957788260048464e-05, + "loss": 0.4419265270233154, + "step": 1980 + }, + { + "epoch": 0.008543485913981264, + "grad_norm": 0.13140447437763214, + "learning_rate": 9.957357088036702e-05, + "loss": 0.2700542688369751, + "step": 1990 + }, + { + "epoch": 0.00858641800400127, + "grad_norm": 0.13265693187713623, + "learning_rate": 9.95692591602494e-05, + "loss": 0.14963483810424805, + "step": 2000 + }, + { + "epoch": 0.00858641800400127, + "eval_loss": 0.5978419780731201, + "eval_runtime": 27.4428, + "eval_samples_per_second": 3.644, + "eval_steps_per_second": 3.644, + "step": 2000 + }, + { + "epoch": 0.008629350094021278, + "grad_norm": 2.834585666656494, + "learning_rate": 9.956494744013177e-05, + "loss": 0.3619921922683716, + "step": 2010 + }, + { + "epoch": 0.008672282184041284, + "grad_norm": 1.4783027172088623, + "learning_rate": 9.956063572001415e-05, + "loss": 0.3987175941467285, + "step": 2020 + }, + { + "epoch": 0.00871521427406129, + "grad_norm": 0.1267155259847641, + "learning_rate": 9.955632399989653e-05, + "loss": 0.031463423371315004, + "step": 2030 + }, + { + "epoch": 0.008758146364081296, + "grad_norm": 0.013138389214873314, + "learning_rate": 9.955201227977889e-05, + "loss": 0.33531138896942136, + "step": 2040 + }, + { + "epoch": 0.008801078454101302, + "grad_norm": 0.03606367111206055, + "learning_rate": 9.954770055966127e-05, + "loss": 0.3791128873825073, + "step": 2050 + }, + { + "epoch": 0.008844010544121308, + "grad_norm": 0.011809916235506535, + "learning_rate": 9.954338883954364e-05, + "loss": 0.22674219608306884, + "step": 2060 + }, + { + "epoch": 0.008886942634141316, + "grad_norm": 0.028264962136745453, + "learning_rate": 9.953907711942602e-05, + "loss": 0.4846948623657227, + "step": 2070 + }, + { + "epoch": 0.008929874724161322, + "grad_norm": 2.4816222190856934, + "learning_rate": 9.95347653993084e-05, + "loss": 0.28535277843475343, + "step": 2080 + }, + { + "epoch": 0.008972806814181328, + "grad_norm": 0.15443404018878937, + "learning_rate": 9.953045367919078e-05, + "loss": 0.3292974948883057, + "step": 2090 + }, + { + "epoch": 0.009015738904201334, + "grad_norm": 0.2226814329624176, + "learning_rate": 9.952614195907315e-05, + "loss": 0.1713486909866333, + "step": 2100 + }, + { + "epoch": 0.00905867099422134, + "grad_norm": 0.12499313801527023, + "learning_rate": 9.952183023895553e-05, + "loss": 0.27860250473022463, + "step": 2110 + }, + { + "epoch": 0.009101603084241346, + "grad_norm": 0.37484830617904663, + "learning_rate": 9.951751851883791e-05, + "loss": 0.16570254564285278, + "step": 2120 + }, + { + "epoch": 0.009144535174261354, + "grad_norm": 1.3329097032546997, + "learning_rate": 9.951320679872029e-05, + "loss": 0.4097251892089844, + "step": 2130 + }, + { + "epoch": 0.00918746726428136, + "grad_norm": 0.030536966398358345, + "learning_rate": 9.950889507860266e-05, + "loss": 0.254646372795105, + "step": 2140 + }, + { + "epoch": 0.009230399354301366, + "grad_norm": 2.485957622528076, + "learning_rate": 9.950458335848504e-05, + "loss": 0.36496973037719727, + "step": 2150 + }, + { + "epoch": 0.009273331444321372, + "grad_norm": 0.028866570442914963, + "learning_rate": 9.950027163836742e-05, + "loss": 0.2507104158401489, + "step": 2160 + }, + { + "epoch": 0.009316263534341378, + "grad_norm": 0.027181223034858704, + "learning_rate": 9.94959599182498e-05, + "loss": 0.2541772127151489, + "step": 2170 + }, + { + "epoch": 0.009359195624361384, + "grad_norm": 0.04736591503024101, + "learning_rate": 9.949164819813217e-05, + "loss": 0.3932987451553345, + "step": 2180 + }, + { + "epoch": 0.009402127714381392, + "grad_norm": 0.8770988583564758, + "learning_rate": 9.948733647801455e-05, + "loss": 0.3800723314285278, + "step": 2190 + }, + { + "epoch": 0.009445059804401398, + "grad_norm": 0.6233557462692261, + "learning_rate": 9.948302475789693e-05, + "loss": 0.27814540863037107, + "step": 2200 + }, + { + "epoch": 0.009487991894421404, + "grad_norm": 3.1134743690490723, + "learning_rate": 9.947871303777929e-05, + "loss": 0.2906489372253418, + "step": 2210 + }, + { + "epoch": 0.00953092398444141, + "grad_norm": 1.6282368898391724, + "learning_rate": 9.947440131766167e-05, + "loss": 0.40398077964782714, + "step": 2220 + }, + { + "epoch": 0.009573856074461417, + "grad_norm": 2.197567939758301, + "learning_rate": 9.947008959754405e-05, + "loss": 0.24783599376678467, + "step": 2230 + }, + { + "epoch": 0.009616788164481423, + "grad_norm": 0.3791491985321045, + "learning_rate": 9.946577787742642e-05, + "loss": 0.3287055253982544, + "step": 2240 + }, + { + "epoch": 0.00965972025450143, + "grad_norm": 1.3653607368469238, + "learning_rate": 9.94614661573088e-05, + "loss": 0.3267251253128052, + "step": 2250 + }, + { + "epoch": 0.009702652344521437, + "grad_norm": 0.005248530767858028, + "learning_rate": 9.945715443719118e-05, + "loss": 0.42192516326904295, + "step": 2260 + }, + { + "epoch": 0.009745584434541443, + "grad_norm": 1.4583594799041748, + "learning_rate": 9.945284271707356e-05, + "loss": 0.17951282262802123, + "step": 2270 + }, + { + "epoch": 0.009788516524561449, + "grad_norm": 1.393897533416748, + "learning_rate": 9.944853099695593e-05, + "loss": 0.38530232906341555, + "step": 2280 + }, + { + "epoch": 0.009831448614581455, + "grad_norm": 2.1110920906066895, + "learning_rate": 9.94442192768383e-05, + "loss": 0.23096773624420167, + "step": 2290 + }, + { + "epoch": 0.00987438070460146, + "grad_norm": 2.964541435241699, + "learning_rate": 9.943990755672067e-05, + "loss": 0.5022055625915527, + "step": 2300 + }, + { + "epoch": 0.009917312794621469, + "grad_norm": 0.00699465861544013, + "learning_rate": 9.943559583660305e-05, + "loss": 0.21364190578460693, + "step": 2310 + }, + { + "epoch": 0.009960244884641475, + "grad_norm": 7.6037092208862305, + "learning_rate": 9.943128411648543e-05, + "loss": 0.404204797744751, + "step": 2320 + }, + { + "epoch": 0.01000317697466148, + "grad_norm": 0.004018974490463734, + "learning_rate": 9.94269723963678e-05, + "loss": 0.3444742441177368, + "step": 2330 + }, + { + "epoch": 0.010046109064681487, + "grad_norm": 0.2844197154045105, + "learning_rate": 9.942266067625018e-05, + "loss": 0.28289504051208497, + "step": 2340 + }, + { + "epoch": 0.010089041154701493, + "grad_norm": 0.0012115357676520944, + "learning_rate": 9.941834895613257e-05, + "loss": 0.24548511505126952, + "step": 2350 + }, + { + "epoch": 0.010131973244721499, + "grad_norm": 2.089751958847046, + "learning_rate": 9.941403723601495e-05, + "loss": 0.44277057647705076, + "step": 2360 + }, + { + "epoch": 0.010174905334741505, + "grad_norm": 0.24774102866649628, + "learning_rate": 9.940972551589732e-05, + "loss": 0.3112922191619873, + "step": 2370 + }, + { + "epoch": 0.010217837424761513, + "grad_norm": 1.9152735471725464, + "learning_rate": 9.940541379577969e-05, + "loss": 0.3603362560272217, + "step": 2380 + }, + { + "epoch": 0.010260769514781519, + "grad_norm": 1.628706693649292, + "learning_rate": 9.940110207566207e-05, + "loss": 0.21727685928344725, + "step": 2390 + }, + { + "epoch": 0.010303701604801525, + "grad_norm": 2.09602689743042, + "learning_rate": 9.939679035554445e-05, + "loss": 0.4794466495513916, + "step": 2400 + }, + { + "epoch": 0.010346633694821531, + "grad_norm": 0.26855140924453735, + "learning_rate": 9.939247863542682e-05, + "loss": 0.13301979303359984, + "step": 2410 + }, + { + "epoch": 0.010389565784841537, + "grad_norm": 0.026117384433746338, + "learning_rate": 9.93881669153092e-05, + "loss": 0.2615302562713623, + "step": 2420 + }, + { + "epoch": 0.010432497874861543, + "grad_norm": 0.09771610796451569, + "learning_rate": 9.938385519519158e-05, + "loss": 0.28199918270111085, + "step": 2430 + }, + { + "epoch": 0.010475429964881551, + "grad_norm": 0.025545388460159302, + "learning_rate": 9.937954347507396e-05, + "loss": 0.10328556299209594, + "step": 2440 + }, + { + "epoch": 0.010518362054901557, + "grad_norm": 0.9250702261924744, + "learning_rate": 9.937523175495632e-05, + "loss": 0.42665858268737794, + "step": 2450 + }, + { + "epoch": 0.010561294144921563, + "grad_norm": 4.911713600158691, + "learning_rate": 9.93709200348387e-05, + "loss": 0.36646499633789065, + "step": 2460 + }, + { + "epoch": 0.01060422623494157, + "grad_norm": 0.04883352667093277, + "learning_rate": 9.936660831472108e-05, + "loss": 0.2213623046875, + "step": 2470 + }, + { + "epoch": 0.010647158324961575, + "grad_norm": 4.471435070037842, + "learning_rate": 9.936229659460345e-05, + "loss": 0.1262168049812317, + "step": 2480 + }, + { + "epoch": 0.010690090414981581, + "grad_norm": 4.575348377227783, + "learning_rate": 9.935798487448583e-05, + "loss": 0.346947979927063, + "step": 2490 + }, + { + "epoch": 0.01073302250500159, + "grad_norm": 0.4783248007297516, + "learning_rate": 9.935367315436821e-05, + "loss": 0.48537659645080566, + "step": 2500 + }, + { + "epoch": 0.010775954595021595, + "grad_norm": 0.6317282915115356, + "learning_rate": 9.934936143425058e-05, + "loss": 0.2175139904022217, + "step": 2510 + }, + { + "epoch": 0.010818886685041601, + "grad_norm": 0.1488918513059616, + "learning_rate": 9.934504971413296e-05, + "loss": 0.21612226963043213, + "step": 2520 + }, + { + "epoch": 0.010861818775061607, + "grad_norm": 0.04579382389783859, + "learning_rate": 9.934073799401534e-05, + "loss": 0.1966726303100586, + "step": 2530 + }, + { + "epoch": 0.010904750865081613, + "grad_norm": 2.0830743312835693, + "learning_rate": 9.93364262738977e-05, + "loss": 0.3072007417678833, + "step": 2540 + }, + { + "epoch": 0.01094768295510162, + "grad_norm": 0.0915113240480423, + "learning_rate": 9.933211455378008e-05, + "loss": 0.20240769386291504, + "step": 2550 + }, + { + "epoch": 0.010990615045121627, + "grad_norm": 7.322422981262207, + "learning_rate": 9.932780283366246e-05, + "loss": 0.3262624263763428, + "step": 2560 + }, + { + "epoch": 0.011033547135141633, + "grad_norm": 0.7302671670913696, + "learning_rate": 9.932349111354485e-05, + "loss": 0.3114789962768555, + "step": 2570 + }, + { + "epoch": 0.01107647922516164, + "grad_norm": 0.4549922049045563, + "learning_rate": 9.931917939342723e-05, + "loss": 0.2467747926712036, + "step": 2580 + }, + { + "epoch": 0.011119411315181646, + "grad_norm": 6.378932476043701, + "learning_rate": 9.93148676733096e-05, + "loss": 0.24566287994384767, + "step": 2590 + }, + { + "epoch": 0.011162343405201652, + "grad_norm": 5.444248199462891, + "learning_rate": 9.931055595319198e-05, + "loss": 0.3524611473083496, + "step": 2600 + }, + { + "epoch": 0.011205275495221658, + "grad_norm": 0.018262803554534912, + "learning_rate": 9.930624423307436e-05, + "loss": 0.35527334213256834, + "step": 2610 + }, + { + "epoch": 0.011248207585241666, + "grad_norm": 0.04499860107898712, + "learning_rate": 9.930193251295672e-05, + "loss": 0.2583042621612549, + "step": 2620 + }, + { + "epoch": 0.011291139675261672, + "grad_norm": 1.3830759525299072, + "learning_rate": 9.92976207928391e-05, + "loss": 0.44364104270935056, + "step": 2630 + }, + { + "epoch": 0.011334071765281678, + "grad_norm": 0.05449075996875763, + "learning_rate": 9.929330907272148e-05, + "loss": 0.1979648232460022, + "step": 2640 + }, + { + "epoch": 0.011377003855301684, + "grad_norm": 0.08771967887878418, + "learning_rate": 9.928899735260385e-05, + "loss": 0.3086463451385498, + "step": 2650 + }, + { + "epoch": 0.01141993594532169, + "grad_norm": 2.2221310138702393, + "learning_rate": 9.928468563248623e-05, + "loss": 0.34687089920043945, + "step": 2660 + }, + { + "epoch": 0.011462868035341696, + "grad_norm": 0.020478779450058937, + "learning_rate": 9.928037391236861e-05, + "loss": 0.17319800853729247, + "step": 2670 + }, + { + "epoch": 0.011505800125361704, + "grad_norm": 0.24883264303207397, + "learning_rate": 9.927606219225099e-05, + "loss": 0.3666259288787842, + "step": 2680 + }, + { + "epoch": 0.01154873221538171, + "grad_norm": 0.15961961448192596, + "learning_rate": 9.927175047213336e-05, + "loss": 0.2250300645828247, + "step": 2690 + }, + { + "epoch": 0.011591664305401716, + "grad_norm": 0.4793941080570221, + "learning_rate": 9.926743875201573e-05, + "loss": 0.15139001607894897, + "step": 2700 + }, + { + "epoch": 0.011634596395421722, + "grad_norm": 4.141111373901367, + "learning_rate": 9.92631270318981e-05, + "loss": 0.3816200256347656, + "step": 2710 + }, + { + "epoch": 0.011677528485441728, + "grad_norm": 0.0375005304813385, + "learning_rate": 9.925881531178048e-05, + "loss": 0.1840839147567749, + "step": 2720 + }, + { + "epoch": 0.011720460575461734, + "grad_norm": 1.4425864219665527, + "learning_rate": 9.925450359166286e-05, + "loss": 0.1900520920753479, + "step": 2730 + }, + { + "epoch": 0.01176339266548174, + "grad_norm": 2.4802801609039307, + "learning_rate": 9.925019187154524e-05, + "loss": 0.2789858341217041, + "step": 2740 + }, + { + "epoch": 0.011806324755501748, + "grad_norm": 0.07315325736999512, + "learning_rate": 9.924588015142761e-05, + "loss": 0.5190474510192871, + "step": 2750 + }, + { + "epoch": 0.011849256845521754, + "grad_norm": 0.02303297631442547, + "learning_rate": 9.924156843130999e-05, + "loss": 0.3321236610412598, + "step": 2760 + }, + { + "epoch": 0.01189218893554176, + "grad_norm": 0.8053700923919678, + "learning_rate": 9.923725671119237e-05, + "loss": 0.2221919059753418, + "step": 2770 + }, + { + "epoch": 0.011935121025561766, + "grad_norm": 3.6358132362365723, + "learning_rate": 9.923294499107473e-05, + "loss": 0.18539355993270873, + "step": 2780 + }, + { + "epoch": 0.011978053115581772, + "grad_norm": 0.11861757934093475, + "learning_rate": 9.922863327095712e-05, + "loss": 0.38046112060546877, + "step": 2790 + }, + { + "epoch": 0.012020985205601778, + "grad_norm": 0.29847654700279236, + "learning_rate": 9.92243215508395e-05, + "loss": 0.14161444902420045, + "step": 2800 + }, + { + "epoch": 0.012063917295621786, + "grad_norm": 0.0315970703959465, + "learning_rate": 9.922000983072188e-05, + "loss": 0.2748135805130005, + "step": 2810 + }, + { + "epoch": 0.012106849385641792, + "grad_norm": 2.0554513931274414, + "learning_rate": 9.921569811060426e-05, + "loss": 0.11751353740692139, + "step": 2820 + }, + { + "epoch": 0.012149781475661798, + "grad_norm": 7.321534633636475, + "learning_rate": 9.921138639048663e-05, + "loss": 0.5105506896972656, + "step": 2830 + }, + { + "epoch": 0.012192713565681804, + "grad_norm": 0.8315069079399109, + "learning_rate": 9.920707467036901e-05, + "loss": 0.2693990707397461, + "step": 2840 + }, + { + "epoch": 0.01223564565570181, + "grad_norm": 0.07569185644388199, + "learning_rate": 9.920276295025139e-05, + "loss": 0.2765660285949707, + "step": 2850 + }, + { + "epoch": 0.012278577745721816, + "grad_norm": 1.9169636964797974, + "learning_rate": 9.919845123013376e-05, + "loss": 0.33867225646972654, + "step": 2860 + }, + { + "epoch": 0.012321509835741824, + "grad_norm": 0.7454385161399841, + "learning_rate": 9.919413951001613e-05, + "loss": 0.23863754272460938, + "step": 2870 + }, + { + "epoch": 0.01236444192576183, + "grad_norm": 0.4546082615852356, + "learning_rate": 9.91898277898985e-05, + "loss": 0.182702374458313, + "step": 2880 + }, + { + "epoch": 0.012407374015781836, + "grad_norm": 5.7228169441223145, + "learning_rate": 9.918551606978088e-05, + "loss": 0.2921399354934692, + "step": 2890 + }, + { + "epoch": 0.012450306105801843, + "grad_norm": 0.23378795385360718, + "learning_rate": 9.918120434966326e-05, + "loss": 0.37920374870300294, + "step": 2900 + }, + { + "epoch": 0.012493238195821849, + "grad_norm": 0.06861526519060135, + "learning_rate": 9.917689262954564e-05, + "loss": 0.07848119139671325, + "step": 2910 + }, + { + "epoch": 0.012536170285841855, + "grad_norm": 1.5398024320602417, + "learning_rate": 9.917258090942801e-05, + "loss": 0.34091403484344485, + "step": 2920 + }, + { + "epoch": 0.012579102375861862, + "grad_norm": 0.05262840539216995, + "learning_rate": 9.916826918931039e-05, + "loss": 0.12453473806381225, + "step": 2930 + }, + { + "epoch": 0.012622034465881869, + "grad_norm": 3.1975438594818115, + "learning_rate": 9.916395746919277e-05, + "loss": 0.3661008834838867, + "step": 2940 + }, + { + "epoch": 0.012664966555901875, + "grad_norm": 2.7964837551116943, + "learning_rate": 9.915964574907513e-05, + "loss": 0.19229893684387206, + "step": 2950 + }, + { + "epoch": 0.01270789864592188, + "grad_norm": 0.1857406049966812, + "learning_rate": 9.915533402895751e-05, + "loss": 0.25612127780914307, + "step": 2960 + }, + { + "epoch": 0.012750830735941887, + "grad_norm": 0.023666221648454666, + "learning_rate": 9.915102230883989e-05, + "loss": 0.5594746112823487, + "step": 2970 + }, + { + "epoch": 0.012793762825961893, + "grad_norm": 1.8557242155075073, + "learning_rate": 9.914671058872227e-05, + "loss": 0.1518352746963501, + "step": 2980 + }, + { + "epoch": 0.0128366949159819, + "grad_norm": 1.4806312322616577, + "learning_rate": 9.914239886860464e-05, + "loss": 0.431013822555542, + "step": 2990 + }, + { + "epoch": 0.012879627006001907, + "grad_norm": 0.06178463250398636, + "learning_rate": 9.913808714848702e-05, + "loss": 0.27926638126373293, + "step": 3000 + }, + { + "epoch": 0.012879627006001907, + "eval_loss": 0.5856176614761353, + "eval_runtime": 27.4381, + "eval_samples_per_second": 3.645, + "eval_steps_per_second": 3.645, + "step": 3000 + }, + { + "epoch": 0.012922559096021913, + "grad_norm": 1.1055095195770264, + "learning_rate": 9.91337754283694e-05, + "loss": 0.24324522018432618, + "step": 3010 + }, + { + "epoch": 0.012965491186041919, + "grad_norm": 0.41986164450645447, + "learning_rate": 9.912946370825177e-05, + "loss": 0.5372348308563233, + "step": 3020 + }, + { + "epoch": 0.013008423276061925, + "grad_norm": 8.023941040039062, + "learning_rate": 9.912515198813415e-05, + "loss": 0.48131527900695803, + "step": 3030 + }, + { + "epoch": 0.013051355366081931, + "grad_norm": 0.047052349895238876, + "learning_rate": 9.912084026801653e-05, + "loss": 0.17677942514419556, + "step": 3040 + }, + { + "epoch": 0.013094287456101939, + "grad_norm": 0.6833871603012085, + "learning_rate": 9.911652854789891e-05, + "loss": 0.09258891344070434, + "step": 3050 + }, + { + "epoch": 0.013137219546121945, + "grad_norm": 0.14966531097888947, + "learning_rate": 9.911221682778128e-05, + "loss": 0.18587244749069215, + "step": 3060 + }, + { + "epoch": 0.013180151636141951, + "grad_norm": 0.06690580397844315, + "learning_rate": 9.910790510766366e-05, + "loss": 0.22195734977722167, + "step": 3070 + }, + { + "epoch": 0.013223083726161957, + "grad_norm": 2.9302849769592285, + "learning_rate": 9.910359338754604e-05, + "loss": 0.3047139644622803, + "step": 3080 + }, + { + "epoch": 0.013266015816181963, + "grad_norm": 1.5165706872940063, + "learning_rate": 9.909928166742842e-05, + "loss": 0.4636370182037354, + "step": 3090 + }, + { + "epoch": 0.01330894790620197, + "grad_norm": 2.706740617752075, + "learning_rate": 9.90949699473108e-05, + "loss": 0.3813605546951294, + "step": 3100 + }, + { + "epoch": 0.013351879996221975, + "grad_norm": 0.03528573364019394, + "learning_rate": 9.909065822719316e-05, + "loss": 0.15370768308639526, + "step": 3110 + }, + { + "epoch": 0.013394812086241983, + "grad_norm": 0.06504935771226883, + "learning_rate": 9.908634650707553e-05, + "loss": 0.32649986743927, + "step": 3120 + }, + { + "epoch": 0.013437744176261989, + "grad_norm": 0.01829485222697258, + "learning_rate": 9.908203478695791e-05, + "loss": 0.20302400588989258, + "step": 3130 + }, + { + "epoch": 0.013480676266281995, + "grad_norm": 0.7548179626464844, + "learning_rate": 9.907772306684029e-05, + "loss": 0.39861307144165037, + "step": 3140 + }, + { + "epoch": 0.013523608356302001, + "grad_norm": 0.1664927899837494, + "learning_rate": 9.907341134672267e-05, + "loss": 0.3844759464263916, + "step": 3150 + }, + { + "epoch": 0.013566540446322007, + "grad_norm": 1.1451348066329956, + "learning_rate": 9.906909962660504e-05, + "loss": 0.24213221073150634, + "step": 3160 + }, + { + "epoch": 0.013609472536342013, + "grad_norm": 4.82208776473999, + "learning_rate": 9.906478790648742e-05, + "loss": 0.4157630443572998, + "step": 3170 + }, + { + "epoch": 0.013652404626362021, + "grad_norm": 3.070974588394165, + "learning_rate": 9.90604761863698e-05, + "loss": 0.29048967361450195, + "step": 3180 + }, + { + "epoch": 0.013695336716382027, + "grad_norm": 5.502252101898193, + "learning_rate": 9.905616446625216e-05, + "loss": 0.45506629943847654, + "step": 3190 + }, + { + "epoch": 0.013738268806402033, + "grad_norm": 4.653741836547852, + "learning_rate": 9.905185274613454e-05, + "loss": 0.58369460105896, + "step": 3200 + }, + { + "epoch": 0.01378120089642204, + "grad_norm": 0.18484069406986237, + "learning_rate": 9.904754102601692e-05, + "loss": 0.3674613952636719, + "step": 3210 + }, + { + "epoch": 0.013824132986442045, + "grad_norm": 0.15102382004261017, + "learning_rate": 9.90432293058993e-05, + "loss": 0.3542445659637451, + "step": 3220 + }, + { + "epoch": 0.013867065076462052, + "grad_norm": 0.25356340408325195, + "learning_rate": 9.903891758578167e-05, + "loss": 0.1704845428466797, + "step": 3230 + }, + { + "epoch": 0.01390999716648206, + "grad_norm": 5.379843711853027, + "learning_rate": 9.903460586566405e-05, + "loss": 0.4127946853637695, + "step": 3240 + }, + { + "epoch": 0.013952929256502065, + "grad_norm": 0.08451557159423828, + "learning_rate": 9.903029414554643e-05, + "loss": 0.38672802448272703, + "step": 3250 + }, + { + "epoch": 0.013995861346522072, + "grad_norm": 0.12616507709026337, + "learning_rate": 9.90259824254288e-05, + "loss": 0.23725695610046388, + "step": 3260 + }, + { + "epoch": 0.014038793436542078, + "grad_norm": 0.07903803884983063, + "learning_rate": 9.902167070531118e-05, + "loss": 0.14826754331588746, + "step": 3270 + }, + { + "epoch": 0.014081725526562084, + "grad_norm": 1.9865450859069824, + "learning_rate": 9.901735898519356e-05, + "loss": 0.4523752212524414, + "step": 3280 + }, + { + "epoch": 0.01412465761658209, + "grad_norm": 0.05560386925935745, + "learning_rate": 9.901304726507594e-05, + "loss": 0.2219390869140625, + "step": 3290 + }, + { + "epoch": 0.014167589706602098, + "grad_norm": 0.18131622672080994, + "learning_rate": 9.900873554495831e-05, + "loss": 0.42689967155456543, + "step": 3300 + }, + { + "epoch": 0.014210521796622104, + "grad_norm": 0.03924006223678589, + "learning_rate": 9.900442382484069e-05, + "loss": 0.19628711938858032, + "step": 3310 + }, + { + "epoch": 0.01425345388664211, + "grad_norm": 0.02912324294447899, + "learning_rate": 9.900011210472307e-05, + "loss": 0.33134822845458983, + "step": 3320 + }, + { + "epoch": 0.014296385976662116, + "grad_norm": 1.1800485849380493, + "learning_rate": 9.899580038460545e-05, + "loss": 0.23386192321777344, + "step": 3330 + }, + { + "epoch": 0.014339318066682122, + "grad_norm": 0.015988456085324287, + "learning_rate": 9.899148866448782e-05, + "loss": 0.20085477828979492, + "step": 3340 + }, + { + "epoch": 0.014382250156702128, + "grad_norm": 0.9935466051101685, + "learning_rate": 9.89871769443702e-05, + "loss": 0.45007762908935545, + "step": 3350 + }, + { + "epoch": 0.014425182246722136, + "grad_norm": 0.023739760741591454, + "learning_rate": 9.898286522425256e-05, + "loss": 0.3121100902557373, + "step": 3360 + }, + { + "epoch": 0.014468114336742142, + "grad_norm": 0.0075697763822972775, + "learning_rate": 9.897855350413494e-05, + "loss": 0.26780340671539304, + "step": 3370 + }, + { + "epoch": 0.014511046426762148, + "grad_norm": 0.8707277774810791, + "learning_rate": 9.897424178401732e-05, + "loss": 0.40902557373046877, + "step": 3380 + }, + { + "epoch": 0.014553978516782154, + "grad_norm": 0.09020499140024185, + "learning_rate": 9.89699300638997e-05, + "loss": 0.09538947939872741, + "step": 3390 + }, + { + "epoch": 0.01459691060680216, + "grad_norm": 15.655129432678223, + "learning_rate": 9.896561834378207e-05, + "loss": 0.27243244647979736, + "step": 3400 + }, + { + "epoch": 0.014639842696822166, + "grad_norm": 0.09583932906389236, + "learning_rate": 9.896130662366445e-05, + "loss": 0.27807865142822263, + "step": 3410 + }, + { + "epoch": 0.014682774786842174, + "grad_norm": 0.3359926640987396, + "learning_rate": 9.895699490354683e-05, + "loss": 0.5408889293670655, + "step": 3420 + }, + { + "epoch": 0.01472570687686218, + "grad_norm": 1.483046293258667, + "learning_rate": 9.89526831834292e-05, + "loss": 0.470717191696167, + "step": 3430 + }, + { + "epoch": 0.014768638966882186, + "grad_norm": 0.0958387553691864, + "learning_rate": 9.894837146331157e-05, + "loss": 0.20184338092803955, + "step": 3440 + }, + { + "epoch": 0.014811571056902192, + "grad_norm": 2.537693738937378, + "learning_rate": 9.894405974319395e-05, + "loss": 0.5307780265808105, + "step": 3450 + }, + { + "epoch": 0.014854503146922198, + "grad_norm": 1.3038251399993896, + "learning_rate": 9.893974802307632e-05, + "loss": 0.2795207977294922, + "step": 3460 + }, + { + "epoch": 0.014897435236942204, + "grad_norm": 0.29755839705467224, + "learning_rate": 9.89354363029587e-05, + "loss": 0.16141926050186156, + "step": 3470 + }, + { + "epoch": 0.01494036732696221, + "grad_norm": 0.008788419887423515, + "learning_rate": 9.893112458284108e-05, + "loss": 0.37876596450805666, + "step": 3480 + }, + { + "epoch": 0.014983299416982218, + "grad_norm": 0.2352016270160675, + "learning_rate": 9.892681286272346e-05, + "loss": 0.31247496604919434, + "step": 3490 + }, + { + "epoch": 0.015026231507002224, + "grad_norm": 2.5700554847717285, + "learning_rate": 9.892250114260583e-05, + "loss": 0.21215453147888183, + "step": 3500 + }, + { + "epoch": 0.01506916359702223, + "grad_norm": 2.7226498126983643, + "learning_rate": 9.891818942248821e-05, + "loss": 0.251985502243042, + "step": 3510 + }, + { + "epoch": 0.015112095687042236, + "grad_norm": 1.7493064403533936, + "learning_rate": 9.891387770237059e-05, + "loss": 0.28089330196380613, + "step": 3520 + }, + { + "epoch": 0.015155027777062242, + "grad_norm": 0.0026909897569566965, + "learning_rate": 9.890956598225297e-05, + "loss": 0.1568443775177002, + "step": 3530 + }, + { + "epoch": 0.015197959867082248, + "grad_norm": 3.8230791091918945, + "learning_rate": 9.890525426213534e-05, + "loss": 0.19587650299072265, + "step": 3540 + }, + { + "epoch": 0.015240891957102256, + "grad_norm": 0.0006467084749601781, + "learning_rate": 9.890094254201772e-05, + "loss": 0.07268471121788025, + "step": 3550 + }, + { + "epoch": 0.015283824047122262, + "grad_norm": 0.02135465107858181, + "learning_rate": 9.88966308219001e-05, + "loss": 0.3168449401855469, + "step": 3560 + }, + { + "epoch": 0.015326756137142268, + "grad_norm": 0.6610631346702576, + "learning_rate": 9.889231910178247e-05, + "loss": 0.26426920890808103, + "step": 3570 + }, + { + "epoch": 0.015369688227162275, + "grad_norm": 0.004358239006251097, + "learning_rate": 9.888800738166485e-05, + "loss": 0.42137608528137205, + "step": 3580 + }, + { + "epoch": 0.01541262031718228, + "grad_norm": 0.0010839806636795402, + "learning_rate": 9.888369566154723e-05, + "loss": 0.040946352481842044, + "step": 3590 + }, + { + "epoch": 0.015455552407202287, + "grad_norm": 1.6554967164993286, + "learning_rate": 9.88793839414296e-05, + "loss": 0.24365475177764892, + "step": 3600 + }, + { + "epoch": 0.015498484497222294, + "grad_norm": 2.889256715774536, + "learning_rate": 9.887507222131197e-05, + "loss": 0.7947826862335206, + "step": 3610 + }, + { + "epoch": 0.0155414165872423, + "grad_norm": 16.558940887451172, + "learning_rate": 9.887076050119435e-05, + "loss": 0.40970120429992674, + "step": 3620 + }, + { + "epoch": 0.015584348677262307, + "grad_norm": 10.725292205810547, + "learning_rate": 9.886644878107673e-05, + "loss": 0.477548885345459, + "step": 3630 + }, + { + "epoch": 0.015627280767282314, + "grad_norm": 0.03413480520248413, + "learning_rate": 9.88621370609591e-05, + "loss": 0.15648822784423827, + "step": 3640 + }, + { + "epoch": 0.01567021285730232, + "grad_norm": 3.0283219814300537, + "learning_rate": 9.885782534084148e-05, + "loss": 0.2519017934799194, + "step": 3650 + }, + { + "epoch": 0.015713144947322327, + "grad_norm": 0.1699555665254593, + "learning_rate": 9.885351362072386e-05, + "loss": 0.28411762714385985, + "step": 3660 + }, + { + "epoch": 0.01575607703734233, + "grad_norm": 0.4008488059043884, + "learning_rate": 9.884920190060623e-05, + "loss": 0.16037325859069823, + "step": 3670 + }, + { + "epoch": 0.01579900912736234, + "grad_norm": 0.027547165751457214, + "learning_rate": 9.884489018048861e-05, + "loss": 0.16036679744720458, + "step": 3680 + }, + { + "epoch": 0.015841941217382343, + "grad_norm": 1.5933310985565186, + "learning_rate": 9.884057846037098e-05, + "loss": 0.39816043376922605, + "step": 3690 + }, + { + "epoch": 0.01588487330740235, + "grad_norm": 4.830496788024902, + "learning_rate": 9.883626674025335e-05, + "loss": 0.21261231899261473, + "step": 3700 + }, + { + "epoch": 0.01592780539742236, + "grad_norm": 1.8774749040603638, + "learning_rate": 9.883195502013573e-05, + "loss": 0.2600428581237793, + "step": 3710 + }, + { + "epoch": 0.015970737487442363, + "grad_norm": 3.542771816253662, + "learning_rate": 9.882764330001811e-05, + "loss": 0.29696452617645264, + "step": 3720 + }, + { + "epoch": 0.01601366957746237, + "grad_norm": 0.059213295578956604, + "learning_rate": 9.882333157990048e-05, + "loss": 0.4307241916656494, + "step": 3730 + }, + { + "epoch": 0.016056601667482375, + "grad_norm": 1.7407482862472534, + "learning_rate": 9.881901985978286e-05, + "loss": 0.4724702835083008, + "step": 3740 + }, + { + "epoch": 0.016099533757502383, + "grad_norm": 0.2899302840232849, + "learning_rate": 9.881470813966524e-05, + "loss": 0.13045870065689086, + "step": 3750 + }, + { + "epoch": 0.01614246584752239, + "grad_norm": 2.5515809059143066, + "learning_rate": 9.881039641954763e-05, + "loss": 0.3518421411514282, + "step": 3760 + }, + { + "epoch": 0.016185397937542395, + "grad_norm": 0.06440158188343048, + "learning_rate": 9.880608469943e-05, + "loss": 0.28357388973236086, + "step": 3770 + }, + { + "epoch": 0.016228330027562403, + "grad_norm": 1.2010902166366577, + "learning_rate": 9.880177297931237e-05, + "loss": 0.17321739196777344, + "step": 3780 + }, + { + "epoch": 0.016271262117582407, + "grad_norm": 0.006291663274168968, + "learning_rate": 9.879746125919475e-05, + "loss": 0.08664214015007018, + "step": 3790 + }, + { + "epoch": 0.016314194207602415, + "grad_norm": 0.4199025630950928, + "learning_rate": 9.879314953907713e-05, + "loss": 0.43680973052978517, + "step": 3800 + }, + { + "epoch": 0.01635712629762242, + "grad_norm": 1.7818711996078491, + "learning_rate": 9.87888378189595e-05, + "loss": 0.1816726803779602, + "step": 3810 + }, + { + "epoch": 0.016400058387642427, + "grad_norm": 9.08803939819336, + "learning_rate": 9.878452609884188e-05, + "loss": 0.4264963626861572, + "step": 3820 + }, + { + "epoch": 0.016442990477662435, + "grad_norm": 0.12058204412460327, + "learning_rate": 9.878021437872426e-05, + "loss": 0.3006183624267578, + "step": 3830 + }, + { + "epoch": 0.01648592256768244, + "grad_norm": 0.006680858321487904, + "learning_rate": 9.877590265860664e-05, + "loss": 0.2540662050247192, + "step": 3840 + }, + { + "epoch": 0.016528854657702447, + "grad_norm": 6.741055011749268, + "learning_rate": 9.8771590938489e-05, + "loss": 0.20651772022247314, + "step": 3850 + }, + { + "epoch": 0.01657178674772245, + "grad_norm": 3.3949806690216064, + "learning_rate": 9.876727921837138e-05, + "loss": 0.3159367561340332, + "step": 3860 + }, + { + "epoch": 0.01661471883774246, + "grad_norm": 1.8325541019439697, + "learning_rate": 9.876296749825375e-05, + "loss": 0.4747579097747803, + "step": 3870 + }, + { + "epoch": 0.016657650927762464, + "grad_norm": 0.1203535795211792, + "learning_rate": 9.875865577813613e-05, + "loss": 0.2717746257781982, + "step": 3880 + }, + { + "epoch": 0.01670058301778247, + "grad_norm": 0.09284611791372299, + "learning_rate": 9.875434405801851e-05, + "loss": 0.3075088977813721, + "step": 3890 + }, + { + "epoch": 0.01674351510780248, + "grad_norm": 0.04041628912091255, + "learning_rate": 9.875003233790089e-05, + "loss": 0.4742414951324463, + "step": 3900 + }, + { + "epoch": 0.016786447197822484, + "grad_norm": 2.080106735229492, + "learning_rate": 9.874572061778326e-05, + "loss": 0.4726258277893066, + "step": 3910 + }, + { + "epoch": 0.01682937928784249, + "grad_norm": 0.04380369931459427, + "learning_rate": 9.874140889766564e-05, + "loss": 0.14977000951766967, + "step": 3920 + }, + { + "epoch": 0.016872311377862496, + "grad_norm": 1.6222591400146484, + "learning_rate": 9.8737097177548e-05, + "loss": 0.18420748710632323, + "step": 3930 + }, + { + "epoch": 0.016915243467882504, + "grad_norm": 0.45909276604652405, + "learning_rate": 9.873278545743038e-05, + "loss": 0.1671479821205139, + "step": 3940 + }, + { + "epoch": 0.01695817555790251, + "grad_norm": 0.0356089286506176, + "learning_rate": 9.872847373731276e-05, + "loss": 0.13457057476043702, + "step": 3950 + }, + { + "epoch": 0.017001107647922516, + "grad_norm": 1.6384329795837402, + "learning_rate": 9.872416201719514e-05, + "loss": 0.12244757413864135, + "step": 3960 + }, + { + "epoch": 0.017044039737942523, + "grad_norm": 0.01140339020639658, + "learning_rate": 9.871985029707751e-05, + "loss": 0.20679588317871095, + "step": 3970 + }, + { + "epoch": 0.017086971827962528, + "grad_norm": 0.006532014813274145, + "learning_rate": 9.87155385769599e-05, + "loss": 0.2710320234298706, + "step": 3980 + }, + { + "epoch": 0.017129903917982536, + "grad_norm": 0.8082075715065002, + "learning_rate": 9.871122685684228e-05, + "loss": 0.5106721878051758, + "step": 3990 + }, + { + "epoch": 0.01717283600800254, + "grad_norm": 1.0562989711761475, + "learning_rate": 9.870691513672466e-05, + "loss": 0.5093242168426514, + "step": 4000 + }, + { + "epoch": 0.01717283600800254, + "eval_loss": 0.5767123103141785, + "eval_runtime": 27.4361, + "eval_samples_per_second": 3.645, + "eval_steps_per_second": 3.645, + "step": 4000 + }, + { + "epoch": 0.017215768098022548, + "grad_norm": 0.004736120812594891, + "learning_rate": 9.870260341660704e-05, + "loss": 0.15739403963088988, + "step": 4010 + }, + { + "epoch": 0.017258700188042556, + "grad_norm": 0.04196614772081375, + "learning_rate": 9.86982916964894e-05, + "loss": 0.31789140701293944, + "step": 4020 + }, + { + "epoch": 0.01730163227806256, + "grad_norm": 1.7766358852386475, + "learning_rate": 9.869397997637178e-05, + "loss": 0.27827773094177244, + "step": 4030 + }, + { + "epoch": 0.017344564368082568, + "grad_norm": 1.262876272201538, + "learning_rate": 9.868966825625416e-05, + "loss": 0.298844575881958, + "step": 4040 + }, + { + "epoch": 0.017387496458102572, + "grad_norm": 0.02867383137345314, + "learning_rate": 9.868535653613653e-05, + "loss": 0.353701114654541, + "step": 4050 + }, + { + "epoch": 0.01743042854812258, + "grad_norm": 0.49855148792266846, + "learning_rate": 9.868104481601891e-05, + "loss": 0.13109270334243775, + "step": 4060 + }, + { + "epoch": 0.017473360638142588, + "grad_norm": 1.795688509941101, + "learning_rate": 9.867673309590129e-05, + "loss": 0.5057300090789795, + "step": 4070 + }, + { + "epoch": 0.017516292728162592, + "grad_norm": 3.00192928314209, + "learning_rate": 9.867242137578366e-05, + "loss": 0.216597580909729, + "step": 4080 + }, + { + "epoch": 0.0175592248181826, + "grad_norm": 0.8749079704284668, + "learning_rate": 9.866810965566604e-05, + "loss": 0.28190293312072756, + "step": 4090 + }, + { + "epoch": 0.017602156908202604, + "grad_norm": 2.5471808910369873, + "learning_rate": 9.86637979355484e-05, + "loss": 0.2924044609069824, + "step": 4100 + }, + { + "epoch": 0.017645088998222612, + "grad_norm": 0.0811334103345871, + "learning_rate": 9.865948621543078e-05, + "loss": 0.4297049045562744, + "step": 4110 + }, + { + "epoch": 0.017688021088242616, + "grad_norm": 4.748598098754883, + "learning_rate": 9.865517449531316e-05, + "loss": 0.27084424495697024, + "step": 4120 + }, + { + "epoch": 0.017730953178262624, + "grad_norm": 0.09125273674726486, + "learning_rate": 9.865086277519554e-05, + "loss": 0.27569165229797366, + "step": 4130 + }, + { + "epoch": 0.017773885268282632, + "grad_norm": 0.1283591389656067, + "learning_rate": 9.864655105507792e-05, + "loss": 0.2747859239578247, + "step": 4140 + }, + { + "epoch": 0.017816817358302636, + "grad_norm": 1.7652469873428345, + "learning_rate": 9.864223933496029e-05, + "loss": 0.4440613269805908, + "step": 4150 + }, + { + "epoch": 0.017859749448322644, + "grad_norm": 0.5249306559562683, + "learning_rate": 9.863792761484267e-05, + "loss": 0.1520497679710388, + "step": 4160 + }, + { + "epoch": 0.01790268153834265, + "grad_norm": 0.9873408079147339, + "learning_rate": 9.863361589472505e-05, + "loss": 0.3086827516555786, + "step": 4170 + }, + { + "epoch": 0.017945613628362656, + "grad_norm": 1.5772066116333008, + "learning_rate": 9.862930417460741e-05, + "loss": 0.3315431594848633, + "step": 4180 + }, + { + "epoch": 0.017988545718382664, + "grad_norm": 4.330613136291504, + "learning_rate": 9.862499245448979e-05, + "loss": 0.3991940259933472, + "step": 4190 + }, + { + "epoch": 0.01803147780840267, + "grad_norm": 0.22321546077728271, + "learning_rate": 9.862068073437218e-05, + "loss": 0.3414437294006348, + "step": 4200 + }, + { + "epoch": 0.018074409898422676, + "grad_norm": 1.90219247341156, + "learning_rate": 9.861636901425456e-05, + "loss": 0.3252838611602783, + "step": 4210 + }, + { + "epoch": 0.01811734198844268, + "grad_norm": 0.08938544988632202, + "learning_rate": 9.861205729413693e-05, + "loss": 0.3285228729248047, + "step": 4220 + }, + { + "epoch": 0.01816027407846269, + "grad_norm": 1.1169179677963257, + "learning_rate": 9.860774557401931e-05, + "loss": 0.41843581199645996, + "step": 4230 + }, + { + "epoch": 0.018203206168482693, + "grad_norm": 3.2259140014648438, + "learning_rate": 9.860343385390169e-05, + "loss": 0.4892786979675293, + "step": 4240 + }, + { + "epoch": 0.0182461382585027, + "grad_norm": 2.497501850128174, + "learning_rate": 9.859912213378407e-05, + "loss": 0.25899789333343504, + "step": 4250 + }, + { + "epoch": 0.018289070348522708, + "grad_norm": 2.090446949005127, + "learning_rate": 9.859481041366643e-05, + "loss": 0.34546072483062745, + "step": 4260 + }, + { + "epoch": 0.018332002438542713, + "grad_norm": 0.05979840084910393, + "learning_rate": 9.859049869354881e-05, + "loss": 0.3825571537017822, + "step": 4270 + }, + { + "epoch": 0.01837493452856272, + "grad_norm": 3.4556007385253906, + "learning_rate": 9.858618697343118e-05, + "loss": 0.4224254131317139, + "step": 4280 + }, + { + "epoch": 0.018417866618582725, + "grad_norm": 0.04259246960282326, + "learning_rate": 9.858187525331356e-05, + "loss": 0.19651782512664795, + "step": 4290 + }, + { + "epoch": 0.018460798708602733, + "grad_norm": 0.057309672236442566, + "learning_rate": 9.857756353319594e-05, + "loss": 0.22271180152893066, + "step": 4300 + }, + { + "epoch": 0.018503730798622737, + "grad_norm": 0.8981066942214966, + "learning_rate": 9.857325181307832e-05, + "loss": 0.4255647659301758, + "step": 4310 + }, + { + "epoch": 0.018546662888642745, + "grad_norm": 0.04550351947546005, + "learning_rate": 9.85689400929607e-05, + "loss": 0.2925698280334473, + "step": 4320 + }, + { + "epoch": 0.018589594978662752, + "grad_norm": 3.6106345653533936, + "learning_rate": 9.856462837284307e-05, + "loss": 0.2803173303604126, + "step": 4330 + }, + { + "epoch": 0.018632527068682757, + "grad_norm": 4.29874849319458, + "learning_rate": 9.856031665272545e-05, + "loss": 0.2041841506958008, + "step": 4340 + }, + { + "epoch": 0.018675459158702765, + "grad_norm": 0.028424430638551712, + "learning_rate": 9.855600493260781e-05, + "loss": 0.20737464427948, + "step": 4350 + }, + { + "epoch": 0.01871839124872277, + "grad_norm": 0.07163364440202713, + "learning_rate": 9.855169321249019e-05, + "loss": 0.12331546545028686, + "step": 4360 + }, + { + "epoch": 0.018761323338742777, + "grad_norm": 2.0091068744659424, + "learning_rate": 9.854738149237257e-05, + "loss": 0.1653214693069458, + "step": 4370 + }, + { + "epoch": 0.018804255428762785, + "grad_norm": 0.13968168199062347, + "learning_rate": 9.854306977225494e-05, + "loss": 0.13523595333099364, + "step": 4380 + }, + { + "epoch": 0.01884718751878279, + "grad_norm": 0.2080664187669754, + "learning_rate": 9.853875805213732e-05, + "loss": 0.38797965049743655, + "step": 4390 + }, + { + "epoch": 0.018890119608802797, + "grad_norm": 0.4882734417915344, + "learning_rate": 9.85344463320197e-05, + "loss": 0.16562176942825318, + "step": 4400 + }, + { + "epoch": 0.0189330516988228, + "grad_norm": 0.009212512522935867, + "learning_rate": 9.853013461190208e-05, + "loss": 0.12350815534591675, + "step": 4410 + }, + { + "epoch": 0.01897598378884281, + "grad_norm": 0.014012163504958153, + "learning_rate": 9.852582289178445e-05, + "loss": 0.3011958122253418, + "step": 4420 + }, + { + "epoch": 0.019018915878862813, + "grad_norm": 0.004959054756909609, + "learning_rate": 9.852151117166683e-05, + "loss": 0.160746431350708, + "step": 4430 + }, + { + "epoch": 0.01906184796888282, + "grad_norm": 2.8610429763793945, + "learning_rate": 9.851719945154921e-05, + "loss": 0.33746287822723386, + "step": 4440 + }, + { + "epoch": 0.01910478005890283, + "grad_norm": 1.2206952571868896, + "learning_rate": 9.851288773143159e-05, + "loss": 0.39143075942993166, + "step": 4450 + }, + { + "epoch": 0.019147712148922833, + "grad_norm": 0.013409411534667015, + "learning_rate": 9.850857601131396e-05, + "loss": 0.3400929689407349, + "step": 4460 + }, + { + "epoch": 0.01919064423894284, + "grad_norm": 16.17251968383789, + "learning_rate": 9.850426429119634e-05, + "loss": 0.22110345363616943, + "step": 4470 + }, + { + "epoch": 0.019233576328962845, + "grad_norm": 0.009083151817321777, + "learning_rate": 9.849995257107872e-05, + "loss": 0.29708778858184814, + "step": 4480 + }, + { + "epoch": 0.019276508418982853, + "grad_norm": 1.265600562095642, + "learning_rate": 9.84956408509611e-05, + "loss": 0.34580533504486083, + "step": 4490 + }, + { + "epoch": 0.01931944050900286, + "grad_norm": 0.9040108919143677, + "learning_rate": 9.849132913084347e-05, + "loss": 0.4236451148986816, + "step": 4500 + }, + { + "epoch": 0.019362372599022865, + "grad_norm": 1.5615957975387573, + "learning_rate": 9.848701741072584e-05, + "loss": 0.5036997318267822, + "step": 4510 + }, + { + "epoch": 0.019405304689042873, + "grad_norm": 1.2325830459594727, + "learning_rate": 9.848270569060821e-05, + "loss": 0.5182486057281495, + "step": 4520 + }, + { + "epoch": 0.019448236779062877, + "grad_norm": 0.7714042663574219, + "learning_rate": 9.847839397049059e-05, + "loss": 0.4611947536468506, + "step": 4530 + }, + { + "epoch": 0.019491168869082885, + "grad_norm": 0.06427251547574997, + "learning_rate": 9.847408225037297e-05, + "loss": 0.4581316947937012, + "step": 4540 + }, + { + "epoch": 0.01953410095910289, + "grad_norm": 0.04174116253852844, + "learning_rate": 9.846977053025535e-05, + "loss": 0.052045691013336184, + "step": 4550 + }, + { + "epoch": 0.019577033049122897, + "grad_norm": 0.059649962931871414, + "learning_rate": 9.846545881013772e-05, + "loss": 0.24959084987640381, + "step": 4560 + }, + { + "epoch": 0.019619965139142905, + "grad_norm": 0.06549002230167389, + "learning_rate": 9.84611470900201e-05, + "loss": 0.3865674018859863, + "step": 4570 + }, + { + "epoch": 0.01966289722916291, + "grad_norm": 2.8574886322021484, + "learning_rate": 9.845683536990248e-05, + "loss": 0.39156625270843504, + "step": 4580 + }, + { + "epoch": 0.019705829319182917, + "grad_norm": 1.1553655862808228, + "learning_rate": 9.845252364978484e-05, + "loss": 0.3730152606964111, + "step": 4590 + }, + { + "epoch": 0.01974876140920292, + "grad_norm": 0.13113917410373688, + "learning_rate": 9.844821192966722e-05, + "loss": 0.2739130735397339, + "step": 4600 + }, + { + "epoch": 0.01979169349922293, + "grad_norm": 1.5252152681350708, + "learning_rate": 9.84439002095496e-05, + "loss": 0.30946390628814696, + "step": 4610 + }, + { + "epoch": 0.019834625589242937, + "grad_norm": 1.3319313526153564, + "learning_rate": 9.843958848943197e-05, + "loss": 0.4056663513183594, + "step": 4620 + }, + { + "epoch": 0.01987755767926294, + "grad_norm": 2.8948614597320557, + "learning_rate": 9.843527676931435e-05, + "loss": 0.4438942909240723, + "step": 4630 + }, + { + "epoch": 0.01992048976928295, + "grad_norm": 1.7223323583602905, + "learning_rate": 9.843096504919673e-05, + "loss": 0.2097313404083252, + "step": 4640 + }, + { + "epoch": 0.019963421859302954, + "grad_norm": 0.14699095487594604, + "learning_rate": 9.84266533290791e-05, + "loss": 0.3881974458694458, + "step": 4650 + }, + { + "epoch": 0.02000635394932296, + "grad_norm": 5.661250114440918, + "learning_rate": 9.842234160896148e-05, + "loss": 0.2718313694000244, + "step": 4660 + }, + { + "epoch": 0.020049286039342966, + "grad_norm": 0.37304461002349854, + "learning_rate": 9.841802988884386e-05, + "loss": 0.1976300835609436, + "step": 4670 + }, + { + "epoch": 0.020092218129362974, + "grad_norm": 0.10088256746530533, + "learning_rate": 9.841371816872624e-05, + "loss": 0.24343338012695312, + "step": 4680 + }, + { + "epoch": 0.02013515021938298, + "grad_norm": 3.5855891704559326, + "learning_rate": 9.840940644860861e-05, + "loss": 0.5209373950958252, + "step": 4690 + }, + { + "epoch": 0.020178082309402986, + "grad_norm": 0.08754919469356537, + "learning_rate": 9.840509472849099e-05, + "loss": 0.3627974271774292, + "step": 4700 + }, + { + "epoch": 0.020221014399422994, + "grad_norm": 0.05948247015476227, + "learning_rate": 9.840078300837337e-05, + "loss": 0.2495173692703247, + "step": 4710 + }, + { + "epoch": 0.020263946489442998, + "grad_norm": 0.13405494391918182, + "learning_rate": 9.839647128825575e-05, + "loss": 0.1566326141357422, + "step": 4720 + }, + { + "epoch": 0.020306878579463006, + "grad_norm": 0.04851258918642998, + "learning_rate": 9.839215956813812e-05, + "loss": 0.09986941814422608, + "step": 4730 + }, + { + "epoch": 0.02034981066948301, + "grad_norm": 1.3744168281555176, + "learning_rate": 9.83878478480205e-05, + "loss": 0.2830467462539673, + "step": 4740 + }, + { + "epoch": 0.020392742759503018, + "grad_norm": 0.04340994358062744, + "learning_rate": 9.838353612790288e-05, + "loss": 0.18021624088287352, + "step": 4750 + }, + { + "epoch": 0.020435674849523026, + "grad_norm": 2.098802089691162, + "learning_rate": 9.837922440778524e-05, + "loss": 0.5201596736907959, + "step": 4760 + }, + { + "epoch": 0.02047860693954303, + "grad_norm": 0.12326524406671524, + "learning_rate": 9.837491268766762e-05, + "loss": 0.30697102546691896, + "step": 4770 + }, + { + "epoch": 0.020521539029563038, + "grad_norm": 1.1608352661132812, + "learning_rate": 9.837060096755e-05, + "loss": 0.385021448135376, + "step": 4780 + }, + { + "epoch": 0.020564471119583042, + "grad_norm": 14.848011016845703, + "learning_rate": 9.836628924743237e-05, + "loss": 0.28464808464050295, + "step": 4790 + }, + { + "epoch": 0.02060740320960305, + "grad_norm": 0.4765234887599945, + "learning_rate": 9.836197752731475e-05, + "loss": 0.27699947357177734, + "step": 4800 + }, + { + "epoch": 0.020650335299623058, + "grad_norm": 14.751072883605957, + "learning_rate": 9.835766580719713e-05, + "loss": 0.13528286218643187, + "step": 4810 + }, + { + "epoch": 0.020693267389643062, + "grad_norm": 0.006392804905772209, + "learning_rate": 9.835335408707951e-05, + "loss": 0.31837561130523684, + "step": 4820 + }, + { + "epoch": 0.02073619947966307, + "grad_norm": 6.483633041381836, + "learning_rate": 9.834904236696188e-05, + "loss": 0.25327458381652834, + "step": 4830 + }, + { + "epoch": 0.020779131569683074, + "grad_norm": 0.027963057160377502, + "learning_rate": 9.834473064684425e-05, + "loss": 0.2266265869140625, + "step": 4840 + }, + { + "epoch": 0.020822063659703082, + "grad_norm": 0.0027327449060976505, + "learning_rate": 9.834041892672663e-05, + "loss": 0.17874222993850708, + "step": 4850 + }, + { + "epoch": 0.020864995749723086, + "grad_norm": 1.8083893060684204, + "learning_rate": 9.8336107206609e-05, + "loss": 0.27457170486450194, + "step": 4860 + }, + { + "epoch": 0.020907927839743094, + "grad_norm": 0.019822167232632637, + "learning_rate": 9.833179548649138e-05, + "loss": 0.3151801109313965, + "step": 4870 + }, + { + "epoch": 0.020950859929763102, + "grad_norm": 1.336995244026184, + "learning_rate": 9.832748376637376e-05, + "loss": 0.3638888359069824, + "step": 4880 + }, + { + "epoch": 0.020993792019783106, + "grad_norm": 8.536113739013672, + "learning_rate": 9.832317204625613e-05, + "loss": 0.19324772357940673, + "step": 4890 + }, + { + "epoch": 0.021036724109803114, + "grad_norm": 0.46870413422584534, + "learning_rate": 9.831886032613851e-05, + "loss": 0.4881044864654541, + "step": 4900 + }, + { + "epoch": 0.02107965619982312, + "grad_norm": 3.869471788406372, + "learning_rate": 9.831454860602089e-05, + "loss": 0.1654451608657837, + "step": 4910 + }, + { + "epoch": 0.021122588289843126, + "grad_norm": 0.8998738527297974, + "learning_rate": 9.831023688590327e-05, + "loss": 0.23226535320281982, + "step": 4920 + }, + { + "epoch": 0.021165520379863134, + "grad_norm": 0.10120627284049988, + "learning_rate": 9.830592516578564e-05, + "loss": 0.215118145942688, + "step": 4930 + }, + { + "epoch": 0.02120845246988314, + "grad_norm": 0.005523098167032003, + "learning_rate": 9.830161344566802e-05, + "loss": 0.24013869762420653, + "step": 4940 + }, + { + "epoch": 0.021251384559903146, + "grad_norm": 0.03459906205534935, + "learning_rate": 9.82973017255504e-05, + "loss": 0.10165373086929322, + "step": 4950 + }, + { + "epoch": 0.02129431664992315, + "grad_norm": 2.4921891689300537, + "learning_rate": 9.829299000543278e-05, + "loss": 0.6230008602142334, + "step": 4960 + }, + { + "epoch": 0.02133724873994316, + "grad_norm": 1.8752906322479248, + "learning_rate": 9.828867828531515e-05, + "loss": 0.22859764099121094, + "step": 4970 + }, + { + "epoch": 0.021380180829963163, + "grad_norm": 0.13080959022045135, + "learning_rate": 9.828436656519753e-05, + "loss": 0.22646758556365967, + "step": 4980 + }, + { + "epoch": 0.02142311291998317, + "grad_norm": 2.735193967819214, + "learning_rate": 9.828005484507991e-05, + "loss": 0.15832440853118895, + "step": 4990 + }, + { + "epoch": 0.02146604501000318, + "grad_norm": 7.5099310874938965, + "learning_rate": 9.827574312496227e-05, + "loss": 0.4262217998504639, + "step": 5000 + }, + { + "epoch": 0.02146604501000318, + "eval_loss": 0.5495235323905945, + "eval_runtime": 27.4385, + "eval_samples_per_second": 3.645, + "eval_steps_per_second": 3.645, + "step": 5000 + }, + { + "epoch": 0.021508977100023183, + "grad_norm": 0.7837478518486023, + "learning_rate": 9.827143140484465e-05, + "loss": 0.18935658931732177, + "step": 5010 + }, + { + "epoch": 0.02155190919004319, + "grad_norm": 0.01325380802154541, + "learning_rate": 9.826711968472703e-05, + "loss": 0.1605769395828247, + "step": 5020 + }, + { + "epoch": 0.021594841280063195, + "grad_norm": 1.1370841264724731, + "learning_rate": 9.82628079646094e-05, + "loss": 0.296016526222229, + "step": 5030 + }, + { + "epoch": 0.021637773370083203, + "grad_norm": 0.03137364238500595, + "learning_rate": 9.825849624449178e-05, + "loss": 0.339589524269104, + "step": 5040 + }, + { + "epoch": 0.021680705460103207, + "grad_norm": 0.05219261720776558, + "learning_rate": 9.825418452437416e-05, + "loss": 0.3410472869873047, + "step": 5050 + }, + { + "epoch": 0.021723637550123215, + "grad_norm": 0.011812934651970863, + "learning_rate": 9.824987280425654e-05, + "loss": 0.13874655961990356, + "step": 5060 + }, + { + "epoch": 0.021766569640143223, + "grad_norm": 0.027524782344698906, + "learning_rate": 9.824556108413891e-05, + "loss": 0.23152687549591064, + "step": 5070 + }, + { + "epoch": 0.021809501730163227, + "grad_norm": 0.09816437214612961, + "learning_rate": 9.824124936402129e-05, + "loss": 0.33842058181762696, + "step": 5080 + }, + { + "epoch": 0.021852433820183235, + "grad_norm": 6.345348358154297, + "learning_rate": 9.823693764390365e-05, + "loss": 0.4534308433532715, + "step": 5090 + }, + { + "epoch": 0.02189536591020324, + "grad_norm": 2.9625370502471924, + "learning_rate": 9.823262592378603e-05, + "loss": 0.2510680198669434, + "step": 5100 + }, + { + "epoch": 0.021938298000223247, + "grad_norm": 1.2221940755844116, + "learning_rate": 9.822831420366841e-05, + "loss": 0.28204386234283446, + "step": 5110 + }, + { + "epoch": 0.021981230090243255, + "grad_norm": 2.2707417011260986, + "learning_rate": 9.822400248355079e-05, + "loss": 0.335250997543335, + "step": 5120 + }, + { + "epoch": 0.02202416218026326, + "grad_norm": 3.409400701522827, + "learning_rate": 9.821969076343316e-05, + "loss": 0.3914987087249756, + "step": 5130 + }, + { + "epoch": 0.022067094270283267, + "grad_norm": 1.3195754289627075, + "learning_rate": 9.821537904331554e-05, + "loss": 0.49390902519226076, + "step": 5140 + }, + { + "epoch": 0.02211002636030327, + "grad_norm": 0.27802133560180664, + "learning_rate": 9.821106732319792e-05, + "loss": 0.18718713521957397, + "step": 5150 + }, + { + "epoch": 0.02215295845032328, + "grad_norm": 3.635807514190674, + "learning_rate": 9.820675560308031e-05, + "loss": 0.3083020210266113, + "step": 5160 + }, + { + "epoch": 0.022195890540343283, + "grad_norm": 0.11027365177869797, + "learning_rate": 9.820244388296267e-05, + "loss": 0.3606464385986328, + "step": 5170 + }, + { + "epoch": 0.02223882263036329, + "grad_norm": 0.9344707727432251, + "learning_rate": 9.819813216284505e-05, + "loss": 0.2589896202087402, + "step": 5180 + }, + { + "epoch": 0.0222817547203833, + "grad_norm": 0.24270321428775787, + "learning_rate": 9.819382044272743e-05, + "loss": 0.3288132667541504, + "step": 5190 + }, + { + "epoch": 0.022324686810403303, + "grad_norm": 0.06861063838005066, + "learning_rate": 9.81895087226098e-05, + "loss": 0.3274968147277832, + "step": 5200 + }, + { + "epoch": 0.02236761890042331, + "grad_norm": 2.494279146194458, + "learning_rate": 9.818519700249218e-05, + "loss": 0.12895745038986206, + "step": 5210 + }, + { + "epoch": 0.022410550990443315, + "grad_norm": 10.221489906311035, + "learning_rate": 9.818088528237456e-05, + "loss": 0.15850175619125367, + "step": 5220 + }, + { + "epoch": 0.022453483080463323, + "grad_norm": 0.10302328318357468, + "learning_rate": 9.817657356225694e-05, + "loss": 0.26316812038421633, + "step": 5230 + }, + { + "epoch": 0.02249641517048333, + "grad_norm": 4.582814693450928, + "learning_rate": 9.817226184213931e-05, + "loss": 0.2203810691833496, + "step": 5240 + }, + { + "epoch": 0.022539347260503335, + "grad_norm": 0.011359277181327343, + "learning_rate": 9.816795012202168e-05, + "loss": 0.3015336036682129, + "step": 5250 + }, + { + "epoch": 0.022582279350523343, + "grad_norm": 0.09045564383268356, + "learning_rate": 9.816363840190406e-05, + "loss": 0.32758321762084963, + "step": 5260 + }, + { + "epoch": 0.022625211440543348, + "grad_norm": 0.036633238196372986, + "learning_rate": 9.815932668178643e-05, + "loss": 0.2647268533706665, + "step": 5270 + }, + { + "epoch": 0.022668143530563355, + "grad_norm": 0.004347283858805895, + "learning_rate": 9.815501496166881e-05, + "loss": 0.2765114545822144, + "step": 5280 + }, + { + "epoch": 0.02271107562058336, + "grad_norm": 0.19933733344078064, + "learning_rate": 9.815070324155119e-05, + "loss": 0.37896840572357177, + "step": 5290 + }, + { + "epoch": 0.022754007710603368, + "grad_norm": 0.04528508707880974, + "learning_rate": 9.814639152143357e-05, + "loss": 0.3411917448043823, + "step": 5300 + }, + { + "epoch": 0.022796939800623375, + "grad_norm": 1.2522482872009277, + "learning_rate": 9.814207980131594e-05, + "loss": 0.21302356719970703, + "step": 5310 + }, + { + "epoch": 0.02283987189064338, + "grad_norm": 4.6196746826171875, + "learning_rate": 9.813776808119832e-05, + "loss": 0.3238266706466675, + "step": 5320 + }, + { + "epoch": 0.022882803980663387, + "grad_norm": 1.3250828981399536, + "learning_rate": 9.813345636108068e-05, + "loss": 0.30894622802734373, + "step": 5330 + }, + { + "epoch": 0.022925736070683392, + "grad_norm": 0.2489081472158432, + "learning_rate": 9.812914464096306e-05, + "loss": 0.20912477970123292, + "step": 5340 + }, + { + "epoch": 0.0229686681607034, + "grad_norm": 4.305191993713379, + "learning_rate": 9.812483292084544e-05, + "loss": 0.22660810947418214, + "step": 5350 + }, + { + "epoch": 0.023011600250723407, + "grad_norm": 5.360749244689941, + "learning_rate": 9.812052120072782e-05, + "loss": 0.46240839958190916, + "step": 5360 + }, + { + "epoch": 0.023054532340743412, + "grad_norm": 0.011522599495947361, + "learning_rate": 9.811620948061019e-05, + "loss": 0.17703713178634645, + "step": 5370 + }, + { + "epoch": 0.02309746443076342, + "grad_norm": 4.123495578765869, + "learning_rate": 9.811189776049258e-05, + "loss": 0.4162435531616211, + "step": 5380 + }, + { + "epoch": 0.023140396520783424, + "grad_norm": 0.1706576645374298, + "learning_rate": 9.810758604037496e-05, + "loss": 0.2197955846786499, + "step": 5390 + }, + { + "epoch": 0.02318332861080343, + "grad_norm": 1.6054201126098633, + "learning_rate": 9.810327432025734e-05, + "loss": 0.5216985225677491, + "step": 5400 + }, + { + "epoch": 0.023226260700823436, + "grad_norm": 2.200653553009033, + "learning_rate": 9.809896260013972e-05, + "loss": 0.30880715847015383, + "step": 5410 + }, + { + "epoch": 0.023269192790843444, + "grad_norm": 6.275848865509033, + "learning_rate": 9.809465088002208e-05, + "loss": 0.4033853054046631, + "step": 5420 + }, + { + "epoch": 0.02331212488086345, + "grad_norm": 0.5680272579193115, + "learning_rate": 9.809033915990446e-05, + "loss": 0.18146917819976807, + "step": 5430 + }, + { + "epoch": 0.023355056970883456, + "grad_norm": 0.37831783294677734, + "learning_rate": 9.808602743978683e-05, + "loss": 0.2999399185180664, + "step": 5440 + }, + { + "epoch": 0.023397989060903464, + "grad_norm": 3.798578977584839, + "learning_rate": 9.808171571966921e-05, + "loss": 0.28671579360961913, + "step": 5450 + }, + { + "epoch": 0.023440921150923468, + "grad_norm": 0.01146702840924263, + "learning_rate": 9.807740399955159e-05, + "loss": 0.3553640365600586, + "step": 5460 + }, + { + "epoch": 0.023483853240943476, + "grad_norm": 0.03364414721727371, + "learning_rate": 9.807309227943397e-05, + "loss": 0.28850817680358887, + "step": 5470 + }, + { + "epoch": 0.02352678533096348, + "grad_norm": 0.45987215638160706, + "learning_rate": 9.806878055931634e-05, + "loss": 0.22811505794525147, + "step": 5480 + }, + { + "epoch": 0.023569717420983488, + "grad_norm": 0.08059200644493103, + "learning_rate": 9.806446883919872e-05, + "loss": 0.19109236001968383, + "step": 5490 + }, + { + "epoch": 0.023612649511003496, + "grad_norm": 0.6227476596832275, + "learning_rate": 9.806015711908108e-05, + "loss": 0.27296125888824463, + "step": 5500 + }, + { + "epoch": 0.0236555816010235, + "grad_norm": 0.02348339930176735, + "learning_rate": 9.805584539896346e-05, + "loss": 0.30638632774353025, + "step": 5510 + }, + { + "epoch": 0.023698513691043508, + "grad_norm": 0.5054450631141663, + "learning_rate": 9.805153367884584e-05, + "loss": 0.31245372295379636, + "step": 5520 + }, + { + "epoch": 0.023741445781063512, + "grad_norm": 9.5831937789917, + "learning_rate": 9.804722195872822e-05, + "loss": 0.2308908462524414, + "step": 5530 + }, + { + "epoch": 0.02378437787108352, + "grad_norm": 0.024453381076455116, + "learning_rate": 9.80429102386106e-05, + "loss": 0.20383148193359374, + "step": 5540 + }, + { + "epoch": 0.023827309961103528, + "grad_norm": 0.0007577983778901398, + "learning_rate": 9.803859851849297e-05, + "loss": 0.20267860889434813, + "step": 5550 + }, + { + "epoch": 0.023870242051123532, + "grad_norm": 0.002279347274452448, + "learning_rate": 9.803428679837535e-05, + "loss": 0.1286637783050537, + "step": 5560 + }, + { + "epoch": 0.02391317414114354, + "grad_norm": 1.7347980737686157, + "learning_rate": 9.802997507825773e-05, + "loss": 0.3176673412322998, + "step": 5570 + }, + { + "epoch": 0.023956106231163544, + "grad_norm": 0.002336501609534025, + "learning_rate": 9.802566335814009e-05, + "loss": 0.2759399890899658, + "step": 5580 + }, + { + "epoch": 0.023999038321183552, + "grad_norm": 2.0233569145202637, + "learning_rate": 9.802135163802247e-05, + "loss": 0.2526408195495605, + "step": 5590 + }, + { + "epoch": 0.024041970411203557, + "grad_norm": 0.0020779764745384455, + "learning_rate": 9.801703991790486e-05, + "loss": 0.15542728900909425, + "step": 5600 + }, + { + "epoch": 0.024084902501223564, + "grad_norm": 4.984875679016113, + "learning_rate": 9.801272819778724e-05, + "loss": 0.3817164182662964, + "step": 5610 + }, + { + "epoch": 0.024127834591243572, + "grad_norm": 0.11502945423126221, + "learning_rate": 9.800841647766961e-05, + "loss": 0.2196964979171753, + "step": 5620 + }, + { + "epoch": 0.024170766681263577, + "grad_norm": 0.002261434681713581, + "learning_rate": 9.800410475755199e-05, + "loss": 0.05837162733078003, + "step": 5630 + }, + { + "epoch": 0.024213698771283584, + "grad_norm": 0.00039357831701636314, + "learning_rate": 9.799979303743437e-05, + "loss": 0.07530305981636047, + "step": 5640 + }, + { + "epoch": 0.02425663086130359, + "grad_norm": 0.0011043796548619866, + "learning_rate": 9.799548131731674e-05, + "loss": 0.46832637786865233, + "step": 5650 + }, + { + "epoch": 0.024299562951323597, + "grad_norm": 1.8432406187057495, + "learning_rate": 9.799116959719911e-05, + "loss": 0.20423316955566406, + "step": 5660 + }, + { + "epoch": 0.024342495041343604, + "grad_norm": 1.1161885261535645, + "learning_rate": 9.798685787708149e-05, + "loss": 0.3098815679550171, + "step": 5670 + }, + { + "epoch": 0.02438542713136361, + "grad_norm": 0.015093029476702213, + "learning_rate": 9.798254615696386e-05, + "loss": 0.37404484748840333, + "step": 5680 + }, + { + "epoch": 0.024428359221383616, + "grad_norm": 0.007660869043320417, + "learning_rate": 9.797823443684624e-05, + "loss": 0.3126490354537964, + "step": 5690 + }, + { + "epoch": 0.02447129131140362, + "grad_norm": 0.1040581539273262, + "learning_rate": 9.797392271672862e-05, + "loss": 0.5040182113647461, + "step": 5700 + }, + { + "epoch": 0.02451422340142363, + "grad_norm": 0.7959339022636414, + "learning_rate": 9.7969610996611e-05, + "loss": 0.15577397346496583, + "step": 5710 + }, + { + "epoch": 0.024557155491443633, + "grad_norm": 0.005332805681973696, + "learning_rate": 9.796529927649337e-05, + "loss": 0.1662292718887329, + "step": 5720 + }, + { + "epoch": 0.02460008758146364, + "grad_norm": 1.613714575767517, + "learning_rate": 9.796098755637575e-05, + "loss": 0.3151638269424438, + "step": 5730 + }, + { + "epoch": 0.02464301967148365, + "grad_norm": 0.009166977368295193, + "learning_rate": 9.795667583625811e-05, + "loss": 0.22575023174285888, + "step": 5740 + }, + { + "epoch": 0.024685951761503653, + "grad_norm": 1.1688400506973267, + "learning_rate": 9.795236411614049e-05, + "loss": 0.230286169052124, + "step": 5750 + }, + { + "epoch": 0.02472888385152366, + "grad_norm": 0.04237162321805954, + "learning_rate": 9.794805239602287e-05, + "loss": 0.3470681428909302, + "step": 5760 + }, + { + "epoch": 0.024771815941543665, + "grad_norm": 0.054464444518089294, + "learning_rate": 9.794374067590525e-05, + "loss": 0.14838558435440063, + "step": 5770 + }, + { + "epoch": 0.024814748031563673, + "grad_norm": 1.083876132965088, + "learning_rate": 9.793942895578762e-05, + "loss": 0.3141624450683594, + "step": 5780 + }, + { + "epoch": 0.024857680121583677, + "grad_norm": 0.02843441627919674, + "learning_rate": 9.793511723567e-05, + "loss": 0.11999982595443726, + "step": 5790 + }, + { + "epoch": 0.024900612211603685, + "grad_norm": 0.0133874686434865, + "learning_rate": 9.793080551555238e-05, + "loss": 0.19130725860595704, + "step": 5800 + }, + { + "epoch": 0.024943544301623693, + "grad_norm": 9.813251495361328, + "learning_rate": 9.792649379543476e-05, + "loss": 0.3944342613220215, + "step": 5810 + }, + { + "epoch": 0.024986476391643697, + "grad_norm": 1.1463874578475952, + "learning_rate": 9.792218207531713e-05, + "loss": 0.40923099517822265, + "step": 5820 + }, + { + "epoch": 0.025029408481663705, + "grad_norm": 0.8478333950042725, + "learning_rate": 9.791787035519951e-05, + "loss": 0.15334019660949708, + "step": 5830 + }, + { + "epoch": 0.02507234057168371, + "grad_norm": 2.906216621398926, + "learning_rate": 9.791355863508189e-05, + "loss": 0.4113955020904541, + "step": 5840 + }, + { + "epoch": 0.025115272661703717, + "grad_norm": 31.99359130859375, + "learning_rate": 9.790924691496426e-05, + "loss": 0.2590095281600952, + "step": 5850 + }, + { + "epoch": 0.025158204751723725, + "grad_norm": 0.6950059533119202, + "learning_rate": 9.790493519484664e-05, + "loss": 0.5012688636779785, + "step": 5860 + }, + { + "epoch": 0.02520113684174373, + "grad_norm": 0.020123276859521866, + "learning_rate": 9.790062347472902e-05, + "loss": 0.2581526517868042, + "step": 5870 + }, + { + "epoch": 0.025244068931763737, + "grad_norm": 3.909289598464966, + "learning_rate": 9.78963117546114e-05, + "loss": 0.21584734916687012, + "step": 5880 + }, + { + "epoch": 0.02528700102178374, + "grad_norm": 1.5961849689483643, + "learning_rate": 9.789200003449377e-05, + "loss": 0.16069986820220947, + "step": 5890 + }, + { + "epoch": 0.02532993311180375, + "grad_norm": 0.1236676275730133, + "learning_rate": 9.788768831437615e-05, + "loss": 0.09747138023376464, + "step": 5900 + }, + { + "epoch": 0.025372865201823754, + "grad_norm": 1.4221770763397217, + "learning_rate": 9.788337659425852e-05, + "loss": 0.25554373264312746, + "step": 5910 + }, + { + "epoch": 0.02541579729184376, + "grad_norm": 0.007482404820621014, + "learning_rate": 9.787906487414089e-05, + "loss": 0.42077908515930174, + "step": 5920 + }, + { + "epoch": 0.02545872938186377, + "grad_norm": 0.753391683101654, + "learning_rate": 9.787475315402327e-05, + "loss": 0.24806034564971924, + "step": 5930 + }, + { + "epoch": 0.025501661471883773, + "grad_norm": 0.677297055721283, + "learning_rate": 9.787044143390565e-05, + "loss": 0.30080883502960204, + "step": 5940 + }, + { + "epoch": 0.02554459356190378, + "grad_norm": 0.007284692022949457, + "learning_rate": 9.786612971378802e-05, + "loss": 0.2560673713684082, + "step": 5950 + }, + { + "epoch": 0.025587525651923786, + "grad_norm": 0.06195899099111557, + "learning_rate": 9.78618179936704e-05, + "loss": 0.2860154390335083, + "step": 5960 + }, + { + "epoch": 0.025630457741943793, + "grad_norm": 0.029079634696245193, + "learning_rate": 9.785750627355278e-05, + "loss": 0.30883972644805907, + "step": 5970 + }, + { + "epoch": 0.0256733898319638, + "grad_norm": 0.1602584421634674, + "learning_rate": 9.785319455343516e-05, + "loss": 0.2173325777053833, + "step": 5980 + }, + { + "epoch": 0.025716321921983806, + "grad_norm": 1.476770281791687, + "learning_rate": 9.784888283331752e-05, + "loss": 0.42102804183959963, + "step": 5990 + }, + { + "epoch": 0.025759254012003813, + "grad_norm": 0.05325939878821373, + "learning_rate": 9.78445711131999e-05, + "loss": 0.31200518608093264, + "step": 6000 + }, + { + "epoch": 0.025759254012003813, + "eval_loss": 0.5343221426010132, + "eval_runtime": 27.4848, + "eval_samples_per_second": 3.638, + "eval_steps_per_second": 3.638, + "step": 6000 + }, + { + "epoch": 0.025802186102023818, + "grad_norm": 2.2895023822784424, + "learning_rate": 9.784025939308228e-05, + "loss": 0.22626352310180664, + "step": 6010 + }, + { + "epoch": 0.025845118192043826, + "grad_norm": 0.3198728561401367, + "learning_rate": 9.783594767296465e-05, + "loss": 0.44563970565795896, + "step": 6020 + }, + { + "epoch": 0.02588805028206383, + "grad_norm": 2.3215620517730713, + "learning_rate": 9.783163595284703e-05, + "loss": 0.3011783123016357, + "step": 6030 + }, + { + "epoch": 0.025930982372083838, + "grad_norm": 0.813892126083374, + "learning_rate": 9.782732423272941e-05, + "loss": 0.4721865177154541, + "step": 6040 + }, + { + "epoch": 0.025973914462103845, + "grad_norm": 0.5148369073867798, + "learning_rate": 9.782301251261178e-05, + "loss": 0.4094189167022705, + "step": 6050 + }, + { + "epoch": 0.02601684655212385, + "grad_norm": 0.04373027756810188, + "learning_rate": 9.781870079249416e-05, + "loss": 0.2998760223388672, + "step": 6060 + }, + { + "epoch": 0.026059778642143858, + "grad_norm": 0.05778109282255173, + "learning_rate": 9.781438907237654e-05, + "loss": 0.17186734676361085, + "step": 6070 + }, + { + "epoch": 0.026102710732163862, + "grad_norm": 1.2136205434799194, + "learning_rate": 9.781007735225892e-05, + "loss": 0.26401219367980955, + "step": 6080 + }, + { + "epoch": 0.02614564282218387, + "grad_norm": 0.2967321276664734, + "learning_rate": 9.78057656321413e-05, + "loss": 0.09281305074691773, + "step": 6090 + }, + { + "epoch": 0.026188574912203878, + "grad_norm": 1.52336847782135, + "learning_rate": 9.780145391202367e-05, + "loss": 0.4042607307434082, + "step": 6100 + }, + { + "epoch": 0.026231507002223882, + "grad_norm": 1.7487553358078003, + "learning_rate": 9.779714219190605e-05, + "loss": 0.38055739402770994, + "step": 6110 + }, + { + "epoch": 0.02627443909224389, + "grad_norm": 0.022000228986144066, + "learning_rate": 9.779283047178843e-05, + "loss": 0.28518545627593994, + "step": 6120 + }, + { + "epoch": 0.026317371182263894, + "grad_norm": 0.007792261429131031, + "learning_rate": 9.77885187516708e-05, + "loss": 0.16593751907348633, + "step": 6130 + }, + { + "epoch": 0.026360303272283902, + "grad_norm": 0.04965021833777428, + "learning_rate": 9.778420703155318e-05, + "loss": 0.26540796756744384, + "step": 6140 + }, + { + "epoch": 0.026403235362303906, + "grad_norm": 0.5761759877204895, + "learning_rate": 9.777989531143556e-05, + "loss": 0.39497976303100585, + "step": 6150 + }, + { + "epoch": 0.026446167452323914, + "grad_norm": 1.5157667398452759, + "learning_rate": 9.777558359131792e-05, + "loss": 0.4709909915924072, + "step": 6160 + }, + { + "epoch": 0.026489099542343922, + "grad_norm": 19.292585372924805, + "learning_rate": 9.77712718712003e-05, + "loss": 0.4004258632659912, + "step": 6170 + }, + { + "epoch": 0.026532031632363926, + "grad_norm": 1.9996172189712524, + "learning_rate": 9.776696015108268e-05, + "loss": 0.28473002910614015, + "step": 6180 + }, + { + "epoch": 0.026574963722383934, + "grad_norm": 0.23382000625133514, + "learning_rate": 9.776264843096505e-05, + "loss": 0.49767556190490725, + "step": 6190 + }, + { + "epoch": 0.02661789581240394, + "grad_norm": 0.38020217418670654, + "learning_rate": 9.775833671084743e-05, + "loss": 0.18720144033432007, + "step": 6200 + }, + { + "epoch": 0.026660827902423946, + "grad_norm": 2.16418194770813, + "learning_rate": 9.775402499072981e-05, + "loss": 0.21355876922607422, + "step": 6210 + }, + { + "epoch": 0.02670375999244395, + "grad_norm": 0.32309773564338684, + "learning_rate": 9.774971327061219e-05, + "loss": 0.18259401321411134, + "step": 6220 + }, + { + "epoch": 0.02674669208246396, + "grad_norm": 0.1124923974275589, + "learning_rate": 9.774540155049456e-05, + "loss": 0.31576013565063477, + "step": 6230 + }, + { + "epoch": 0.026789624172483966, + "grad_norm": 0.061951570212841034, + "learning_rate": 9.774108983037693e-05, + "loss": 0.021699841320514678, + "step": 6240 + }, + { + "epoch": 0.02683255626250397, + "grad_norm": 0.1531129628419876, + "learning_rate": 9.77367781102593e-05, + "loss": 0.3296054840087891, + "step": 6250 + }, + { + "epoch": 0.026875488352523978, + "grad_norm": 0.040223345160484314, + "learning_rate": 9.773246639014168e-05, + "loss": 0.06907052397727967, + "step": 6260 + }, + { + "epoch": 0.026918420442543983, + "grad_norm": 4.550394058227539, + "learning_rate": 9.772815467002406e-05, + "loss": 0.31173703670501707, + "step": 6270 + }, + { + "epoch": 0.02696135253256399, + "grad_norm": 0.2127457857131958, + "learning_rate": 9.772384294990644e-05, + "loss": 0.20917901992797852, + "step": 6280 + }, + { + "epoch": 0.027004284622583998, + "grad_norm": 1.6591854095458984, + "learning_rate": 9.771953122978881e-05, + "loss": 0.4044227600097656, + "step": 6290 + }, + { + "epoch": 0.027047216712604003, + "grad_norm": 0.010656671598553658, + "learning_rate": 9.771521950967119e-05, + "loss": 0.2549355268478394, + "step": 6300 + }, + { + "epoch": 0.02709014880262401, + "grad_norm": 2.4075212478637695, + "learning_rate": 9.771090778955357e-05, + "loss": 0.22155897617340087, + "step": 6310 + }, + { + "epoch": 0.027133080892644015, + "grad_norm": 0.12818868458271027, + "learning_rate": 9.770659606943595e-05, + "loss": 0.28032028675079346, + "step": 6320 + }, + { + "epoch": 0.027176012982664022, + "grad_norm": 0.14577949047088623, + "learning_rate": 9.770228434931832e-05, + "loss": 0.04989034235477448, + "step": 6330 + }, + { + "epoch": 0.027218945072684027, + "grad_norm": 0.07681692391633987, + "learning_rate": 9.76979726292007e-05, + "loss": 0.26616883277893066, + "step": 6340 + }, + { + "epoch": 0.027261877162704035, + "grad_norm": 3.748929738998413, + "learning_rate": 9.769366090908308e-05, + "loss": 0.19273843765258789, + "step": 6350 + }, + { + "epoch": 0.027304809252724042, + "grad_norm": 0.9309549331665039, + "learning_rate": 9.768934918896546e-05, + "loss": 0.24083504676818848, + "step": 6360 + }, + { + "epoch": 0.027347741342744047, + "grad_norm": 0.014099097810685635, + "learning_rate": 9.768503746884783e-05, + "loss": 0.2923043489456177, + "step": 6370 + }, + { + "epoch": 0.027390673432764055, + "grad_norm": 0.12325286120176315, + "learning_rate": 9.768072574873021e-05, + "loss": 0.1641521692276001, + "step": 6380 + }, + { + "epoch": 0.02743360552278406, + "grad_norm": 1.3514981269836426, + "learning_rate": 9.767641402861259e-05, + "loss": 0.21309914588928222, + "step": 6390 + }, + { + "epoch": 0.027476537612804067, + "grad_norm": 0.001690853270702064, + "learning_rate": 9.767210230849495e-05, + "loss": 0.21486878395080566, + "step": 6400 + }, + { + "epoch": 0.027519469702824075, + "grad_norm": 0.00460071163251996, + "learning_rate": 9.766779058837733e-05, + "loss": 0.49279141426086426, + "step": 6410 + }, + { + "epoch": 0.02756240179284408, + "grad_norm": 0.0049337283708155155, + "learning_rate": 9.76634788682597e-05, + "loss": 0.4525193691253662, + "step": 6420 + }, + { + "epoch": 0.027605333882864087, + "grad_norm": 1.9769833087921143, + "learning_rate": 9.765916714814208e-05, + "loss": 0.5393567085266113, + "step": 6430 + }, + { + "epoch": 0.02764826597288409, + "grad_norm": 9.51612377166748, + "learning_rate": 9.765485542802446e-05, + "loss": 0.25011520385742186, + "step": 6440 + }, + { + "epoch": 0.0276911980629041, + "grad_norm": 0.08957642316818237, + "learning_rate": 9.765054370790684e-05, + "loss": 0.2099766731262207, + "step": 6450 + }, + { + "epoch": 0.027734130152924103, + "grad_norm": 0.20063121616840363, + "learning_rate": 9.764623198778921e-05, + "loss": 0.34571573734283445, + "step": 6460 + }, + { + "epoch": 0.02777706224294411, + "grad_norm": 4.328144073486328, + "learning_rate": 9.764192026767159e-05, + "loss": 0.18804190158843995, + "step": 6470 + }, + { + "epoch": 0.02781999433296412, + "grad_norm": 0.9789665341377258, + "learning_rate": 9.763760854755396e-05, + "loss": 0.3236433506011963, + "step": 6480 + }, + { + "epoch": 0.027862926422984123, + "grad_norm": 0.06257350742816925, + "learning_rate": 9.763329682743633e-05, + "loss": 0.2720318794250488, + "step": 6490 + }, + { + "epoch": 0.02790585851300413, + "grad_norm": 0.021415017545223236, + "learning_rate": 9.762898510731871e-05, + "loss": 0.14671536684036254, + "step": 6500 + }, + { + "epoch": 0.027948790603024135, + "grad_norm": 0.07387561351060867, + "learning_rate": 9.762467338720109e-05, + "loss": 0.34039785861968996, + "step": 6510 + }, + { + "epoch": 0.027991722693044143, + "grad_norm": 0.34364739060401917, + "learning_rate": 9.762036166708347e-05, + "loss": 0.38445446491241453, + "step": 6520 + }, + { + "epoch": 0.02803465478306415, + "grad_norm": 0.009217793121933937, + "learning_rate": 9.761604994696584e-05, + "loss": 0.48569574356079104, + "step": 6530 + }, + { + "epoch": 0.028077586873084155, + "grad_norm": 0.05632294341921806, + "learning_rate": 9.761173822684822e-05, + "loss": 0.42383580207824706, + "step": 6540 + }, + { + "epoch": 0.028120518963104163, + "grad_norm": 0.15093988180160522, + "learning_rate": 9.76074265067306e-05, + "loss": 0.16558367013931274, + "step": 6550 + }, + { + "epoch": 0.028163451053124167, + "grad_norm": 0.7927748560905457, + "learning_rate": 9.760311478661297e-05, + "loss": 0.3002749443054199, + "step": 6560 + }, + { + "epoch": 0.028206383143144175, + "grad_norm": 0.18128614127635956, + "learning_rate": 9.759880306649535e-05, + "loss": 0.1237905740737915, + "step": 6570 + }, + { + "epoch": 0.02824931523316418, + "grad_norm": 1.25431489944458, + "learning_rate": 9.759449134637773e-05, + "loss": 0.176636004447937, + "step": 6580 + }, + { + "epoch": 0.028292247323184187, + "grad_norm": 0.11274126172065735, + "learning_rate": 9.759017962626011e-05, + "loss": 0.2627155065536499, + "step": 6590 + }, + { + "epoch": 0.028335179413204195, + "grad_norm": 0.9054426550865173, + "learning_rate": 9.758586790614248e-05, + "loss": 0.3994295120239258, + "step": 6600 + }, + { + "epoch": 0.0283781115032242, + "grad_norm": 0.3764042258262634, + "learning_rate": 9.758155618602486e-05, + "loss": 0.32515180110931396, + "step": 6610 + }, + { + "epoch": 0.028421043593244207, + "grad_norm": 3.711716651916504, + "learning_rate": 9.757724446590724e-05, + "loss": 0.14573302268981933, + "step": 6620 + }, + { + "epoch": 0.02846397568326421, + "grad_norm": 0.09025450050830841, + "learning_rate": 9.757293274578962e-05, + "loss": 0.31324641704559325, + "step": 6630 + }, + { + "epoch": 0.02850690777328422, + "grad_norm": 2.1718194484710693, + "learning_rate": 9.7568621025672e-05, + "loss": 0.2684445858001709, + "step": 6640 + }, + { + "epoch": 0.028549839863304224, + "grad_norm": 0.7220675945281982, + "learning_rate": 9.756430930555436e-05, + "loss": 0.2534363269805908, + "step": 6650 + }, + { + "epoch": 0.02859277195332423, + "grad_norm": 0.6206772327423096, + "learning_rate": 9.755999758543673e-05, + "loss": 0.2221672773361206, + "step": 6660 + }, + { + "epoch": 0.02863570404334424, + "grad_norm": 0.008323497138917446, + "learning_rate": 9.755568586531911e-05, + "loss": 0.599291467666626, + "step": 6670 + }, + { + "epoch": 0.028678636133364244, + "grad_norm": 1.3727705478668213, + "learning_rate": 9.755137414520149e-05, + "loss": 0.35412213802337644, + "step": 6680 + }, + { + "epoch": 0.02872156822338425, + "grad_norm": 0.03529423102736473, + "learning_rate": 9.754706242508387e-05, + "loss": 0.21443359851837157, + "step": 6690 + }, + { + "epoch": 0.028764500313404256, + "grad_norm": 1.579532504081726, + "learning_rate": 9.754275070496624e-05, + "loss": 0.24339027404785157, + "step": 6700 + }, + { + "epoch": 0.028807432403424264, + "grad_norm": 1.398242712020874, + "learning_rate": 9.753843898484862e-05, + "loss": 0.31939172744750977, + "step": 6710 + }, + { + "epoch": 0.02885036449344427, + "grad_norm": 0.07951159030199051, + "learning_rate": 9.7534127264731e-05, + "loss": 0.33771500587463377, + "step": 6720 + }, + { + "epoch": 0.028893296583464276, + "grad_norm": 6.611891746520996, + "learning_rate": 9.752981554461336e-05, + "loss": 0.18760627508163452, + "step": 6730 + }, + { + "epoch": 0.028936228673484284, + "grad_norm": 0.1047222837805748, + "learning_rate": 9.752550382449574e-05, + "loss": 0.2564627408981323, + "step": 6740 + }, + { + "epoch": 0.028979160763504288, + "grad_norm": 0.5849888920783997, + "learning_rate": 9.752119210437812e-05, + "loss": 0.4227277755737305, + "step": 6750 + }, + { + "epoch": 0.029022092853524296, + "grad_norm": 0.0048598735593259335, + "learning_rate": 9.75168803842605e-05, + "loss": 0.11246702671051026, + "step": 6760 + }, + { + "epoch": 0.0290650249435443, + "grad_norm": 0.09654171764850616, + "learning_rate": 9.751256866414287e-05, + "loss": 0.4050751686096191, + "step": 6770 + }, + { + "epoch": 0.029107957033564308, + "grad_norm": 0.21590472757816315, + "learning_rate": 9.750825694402525e-05, + "loss": 0.16975542306900024, + "step": 6780 + }, + { + "epoch": 0.029150889123584316, + "grad_norm": 0.24438561499118805, + "learning_rate": 9.750394522390764e-05, + "loss": 0.018460248410701752, + "step": 6790 + }, + { + "epoch": 0.02919382121360432, + "grad_norm": 0.05073995888233185, + "learning_rate": 9.749963350379002e-05, + "loss": 0.15996166467666625, + "step": 6800 + }, + { + "epoch": 0.029236753303624328, + "grad_norm": 2.4738333225250244, + "learning_rate": 9.749532178367238e-05, + "loss": 0.1290574073791504, + "step": 6810 + }, + { + "epoch": 0.029279685393644332, + "grad_norm": 2.667616844177246, + "learning_rate": 9.749101006355476e-05, + "loss": 0.19219365119934081, + "step": 6820 + }, + { + "epoch": 0.02932261748366434, + "grad_norm": 1.5775728225708008, + "learning_rate": 9.748669834343714e-05, + "loss": 0.21368303298950195, + "step": 6830 + }, + { + "epoch": 0.029365549573684348, + "grad_norm": 0.9113060832023621, + "learning_rate": 9.748238662331951e-05, + "loss": 0.41691412925720217, + "step": 6840 + }, + { + "epoch": 0.029408481663704352, + "grad_norm": 0.0026496616192162037, + "learning_rate": 9.747807490320189e-05, + "loss": 0.32430739402770997, + "step": 6850 + }, + { + "epoch": 0.02945141375372436, + "grad_norm": 0.016415616497397423, + "learning_rate": 9.747376318308427e-05, + "loss": 0.25125372409820557, + "step": 6860 + }, + { + "epoch": 0.029494345843744364, + "grad_norm": 1.4902602434158325, + "learning_rate": 9.746945146296665e-05, + "loss": 0.35351219177246096, + "step": 6870 + }, + { + "epoch": 0.029537277933764372, + "grad_norm": 0.036443907767534256, + "learning_rate": 9.746513974284902e-05, + "loss": 0.38958556652069093, + "step": 6880 + }, + { + "epoch": 0.029580210023784376, + "grad_norm": 0.03572531044483185, + "learning_rate": 9.74608280227314e-05, + "loss": 0.09846047163009644, + "step": 6890 + }, + { + "epoch": 0.029623142113804384, + "grad_norm": 2.647747278213501, + "learning_rate": 9.745651630261376e-05, + "loss": 0.3600107192993164, + "step": 6900 + }, + { + "epoch": 0.029666074203824392, + "grad_norm": 2.517021894454956, + "learning_rate": 9.745220458249614e-05, + "loss": 0.15333187580108643, + "step": 6910 + }, + { + "epoch": 0.029709006293844396, + "grad_norm": 12.12330436706543, + "learning_rate": 9.744789286237852e-05, + "loss": 0.3379157543182373, + "step": 6920 + }, + { + "epoch": 0.029751938383864404, + "grad_norm": 0.801120400428772, + "learning_rate": 9.74435811422609e-05, + "loss": 0.414486026763916, + "step": 6930 + }, + { + "epoch": 0.02979487047388441, + "grad_norm": 0.80088871717453, + "learning_rate": 9.743926942214327e-05, + "loss": 0.2384410858154297, + "step": 6940 + }, + { + "epoch": 0.029837802563904416, + "grad_norm": 0.22154009342193604, + "learning_rate": 9.743495770202565e-05, + "loss": 0.2071291208267212, + "step": 6950 + }, + { + "epoch": 0.02988073465392442, + "grad_norm": 2.093872308731079, + "learning_rate": 9.743064598190803e-05, + "loss": 0.2516919136047363, + "step": 6960 + }, + { + "epoch": 0.02992366674394443, + "grad_norm": 18.139604568481445, + "learning_rate": 9.74263342617904e-05, + "loss": 0.442185115814209, + "step": 6970 + }, + { + "epoch": 0.029966598833964436, + "grad_norm": 0.8278317451477051, + "learning_rate": 9.742202254167277e-05, + "loss": 0.11277594566345214, + "step": 6980 + }, + { + "epoch": 0.03000953092398444, + "grad_norm": 0.6661980152130127, + "learning_rate": 9.741771082155515e-05, + "loss": 0.5284313678741455, + "step": 6990 + }, + { + "epoch": 0.03005246301400445, + "grad_norm": 0.14535479247570038, + "learning_rate": 9.741339910143752e-05, + "loss": 0.19856830835342407, + "step": 7000 + }, + { + "epoch": 0.03005246301400445, + "eval_loss": 0.5455799698829651, + "eval_runtime": 27.5675, + "eval_samples_per_second": 3.627, + "eval_steps_per_second": 3.627, + "step": 7000 + }, + { + "epoch": 0.030095395104024453, + "grad_norm": 1.4745711088180542, + "learning_rate": 9.740908738131991e-05, + "loss": 0.322112512588501, + "step": 7010 + }, + { + "epoch": 0.03013832719404446, + "grad_norm": 0.3533012270927429, + "learning_rate": 9.740477566120229e-05, + "loss": 0.1881626844406128, + "step": 7020 + }, + { + "epoch": 0.03018125928406447, + "grad_norm": 0.1096927747130394, + "learning_rate": 9.740046394108467e-05, + "loss": 0.12380988597869873, + "step": 7030 + }, + { + "epoch": 0.030224191374084473, + "grad_norm": 0.020163699984550476, + "learning_rate": 9.739615222096705e-05, + "loss": 0.4424473285675049, + "step": 7040 + }, + { + "epoch": 0.03026712346410448, + "grad_norm": 6.096765518188477, + "learning_rate": 9.739184050084942e-05, + "loss": 0.3141198635101318, + "step": 7050 + }, + { + "epoch": 0.030310055554124485, + "grad_norm": 1.6224082708358765, + "learning_rate": 9.738752878073179e-05, + "loss": 0.3304691553115845, + "step": 7060 + }, + { + "epoch": 0.030352987644144493, + "grad_norm": 3.0215723514556885, + "learning_rate": 9.738321706061417e-05, + "loss": 0.2341548442840576, + "step": 7070 + }, + { + "epoch": 0.030395919734164497, + "grad_norm": 0.29276198148727417, + "learning_rate": 9.737890534049654e-05, + "loss": 0.22664909362792968, + "step": 7080 + }, + { + "epoch": 0.030438851824184505, + "grad_norm": 0.08035605400800705, + "learning_rate": 9.737459362037892e-05, + "loss": 0.30606210231781006, + "step": 7090 + }, + { + "epoch": 0.030481783914204513, + "grad_norm": 0.0922674909234047, + "learning_rate": 9.73702819002613e-05, + "loss": 0.15513947010040283, + "step": 7100 + }, + { + "epoch": 0.030524716004224517, + "grad_norm": 0.02588404156267643, + "learning_rate": 9.736597018014367e-05, + "loss": 0.27363131046295164, + "step": 7110 + }, + { + "epoch": 0.030567648094244525, + "grad_norm": 0.42078208923339844, + "learning_rate": 9.736165846002605e-05, + "loss": 0.3170907497406006, + "step": 7120 + }, + { + "epoch": 0.03061058018426453, + "grad_norm": 4.322187900543213, + "learning_rate": 9.735734673990843e-05, + "loss": 0.3747772216796875, + "step": 7130 + }, + { + "epoch": 0.030653512274284537, + "grad_norm": 1.444366693496704, + "learning_rate": 9.735303501979079e-05, + "loss": 0.3339802026748657, + "step": 7140 + }, + { + "epoch": 0.030696444364304545, + "grad_norm": 0.04174191132187843, + "learning_rate": 9.734872329967317e-05, + "loss": 0.2466111421585083, + "step": 7150 + }, + { + "epoch": 0.03073937645432455, + "grad_norm": 0.26613515615463257, + "learning_rate": 9.734441157955555e-05, + "loss": 0.19390870332717897, + "step": 7160 + }, + { + "epoch": 0.030782308544344557, + "grad_norm": 7.866360187530518, + "learning_rate": 9.734009985943792e-05, + "loss": 0.22820355892181396, + "step": 7170 + }, + { + "epoch": 0.03082524063436456, + "grad_norm": 2.5541608333587646, + "learning_rate": 9.73357881393203e-05, + "loss": 0.274747633934021, + "step": 7180 + }, + { + "epoch": 0.03086817272438457, + "grad_norm": 0.15199995040893555, + "learning_rate": 9.733147641920268e-05, + "loss": 0.10083954334259033, + "step": 7190 + }, + { + "epoch": 0.030911104814404573, + "grad_norm": 16.545623779296875, + "learning_rate": 9.732716469908506e-05, + "loss": 0.33107154369354247, + "step": 7200 + }, + { + "epoch": 0.03095403690442458, + "grad_norm": 6.0723371505737305, + "learning_rate": 9.732285297896743e-05, + "loss": 0.27182056903839114, + "step": 7210 + }, + { + "epoch": 0.03099696899444459, + "grad_norm": 3.1646676063537598, + "learning_rate": 9.73185412588498e-05, + "loss": 0.2546936750411987, + "step": 7220 + }, + { + "epoch": 0.031039901084464593, + "grad_norm": 0.054179687052965164, + "learning_rate": 9.731422953873219e-05, + "loss": 0.6185836315155029, + "step": 7230 + }, + { + "epoch": 0.0310828331744846, + "grad_norm": 0.5206108093261719, + "learning_rate": 9.730991781861457e-05, + "loss": 0.37745678424835205, + "step": 7240 + }, + { + "epoch": 0.031125765264504605, + "grad_norm": 0.08467987179756165, + "learning_rate": 9.730560609849694e-05, + "loss": 0.31608712673187256, + "step": 7250 + }, + { + "epoch": 0.031168697354524613, + "grad_norm": 1.9469053745269775, + "learning_rate": 9.730129437837932e-05, + "loss": 0.32785539627075194, + "step": 7260 + }, + { + "epoch": 0.03121162944454462, + "grad_norm": 0.5517263412475586, + "learning_rate": 9.72969826582617e-05, + "loss": 0.14303758144378662, + "step": 7270 + }, + { + "epoch": 0.03125456153456463, + "grad_norm": 2.292470693588257, + "learning_rate": 9.729267093814408e-05, + "loss": 0.40169806480407716, + "step": 7280 + }, + { + "epoch": 0.03129749362458463, + "grad_norm": 0.21029160916805267, + "learning_rate": 9.728835921802645e-05, + "loss": 0.1665691018104553, + "step": 7290 + }, + { + "epoch": 0.03134042571460464, + "grad_norm": 1.327445387840271, + "learning_rate": 9.728404749790883e-05, + "loss": 0.2855032682418823, + "step": 7300 + }, + { + "epoch": 0.031383357804624645, + "grad_norm": 0.7883787155151367, + "learning_rate": 9.72797357777912e-05, + "loss": 0.18686634302139282, + "step": 7310 + }, + { + "epoch": 0.03142628989464465, + "grad_norm": 1.578082799911499, + "learning_rate": 9.727542405767357e-05, + "loss": 0.6148167610168457, + "step": 7320 + }, + { + "epoch": 0.031469221984664654, + "grad_norm": 3.069875478744507, + "learning_rate": 9.727111233755595e-05, + "loss": 0.41655569076538085, + "step": 7330 + }, + { + "epoch": 0.03151215407468466, + "grad_norm": 0.0648500993847847, + "learning_rate": 9.726680061743833e-05, + "loss": 0.13719666004180908, + "step": 7340 + }, + { + "epoch": 0.03155508616470467, + "grad_norm": 0.69016033411026, + "learning_rate": 9.72624888973207e-05, + "loss": 0.12963091135025023, + "step": 7350 + }, + { + "epoch": 0.03159801825472468, + "grad_norm": 6.941385746002197, + "learning_rate": 9.725817717720308e-05, + "loss": 0.16945242881774902, + "step": 7360 + }, + { + "epoch": 0.031640950344744685, + "grad_norm": 0.0247952863574028, + "learning_rate": 9.725386545708546e-05, + "loss": 0.19187296628952027, + "step": 7370 + }, + { + "epoch": 0.031683882434764686, + "grad_norm": 14.04268741607666, + "learning_rate": 9.724955373696784e-05, + "loss": 0.1674031972885132, + "step": 7380 + }, + { + "epoch": 0.031726814524784694, + "grad_norm": 0.12602205574512482, + "learning_rate": 9.72452420168502e-05, + "loss": 0.3435555934906006, + "step": 7390 + }, + { + "epoch": 0.0317697466148047, + "grad_norm": 0.6790499687194824, + "learning_rate": 9.724093029673258e-05, + "loss": 0.4150827407836914, + "step": 7400 + }, + { + "epoch": 0.03181267870482471, + "grad_norm": 0.016099615022540092, + "learning_rate": 9.723661857661495e-05, + "loss": 0.144336473941803, + "step": 7410 + }, + { + "epoch": 0.03185561079484472, + "grad_norm": 2.7291247844696045, + "learning_rate": 9.723230685649733e-05, + "loss": 0.4487579822540283, + "step": 7420 + }, + { + "epoch": 0.03189854288486472, + "grad_norm": 3.9720041751861572, + "learning_rate": 9.722799513637971e-05, + "loss": 0.22907283306121826, + "step": 7430 + }, + { + "epoch": 0.031941474974884726, + "grad_norm": 0.013614165596663952, + "learning_rate": 9.722368341626209e-05, + "loss": 0.1962208032608032, + "step": 7440 + }, + { + "epoch": 0.031984407064904734, + "grad_norm": 9.466171264648438, + "learning_rate": 9.721937169614446e-05, + "loss": 0.1261853337287903, + "step": 7450 + }, + { + "epoch": 0.03202733915492474, + "grad_norm": 16.18906593322754, + "learning_rate": 9.721505997602684e-05, + "loss": 0.1645114541053772, + "step": 7460 + }, + { + "epoch": 0.03207027124494475, + "grad_norm": 0.03784366324543953, + "learning_rate": 9.721074825590922e-05, + "loss": 0.20082948207855225, + "step": 7470 + }, + { + "epoch": 0.03211320333496475, + "grad_norm": 0.001986953429877758, + "learning_rate": 9.72064365357916e-05, + "loss": 0.2706491231918335, + "step": 7480 + }, + { + "epoch": 0.03215613542498476, + "grad_norm": 8.900535583496094, + "learning_rate": 9.720212481567397e-05, + "loss": 0.34498045444488523, + "step": 7490 + }, + { + "epoch": 0.032199067515004766, + "grad_norm": 13.675838470458984, + "learning_rate": 9.719781309555635e-05, + "loss": 0.27431817054748536, + "step": 7500 + }, + { + "epoch": 0.032241999605024774, + "grad_norm": 0.008109820075333118, + "learning_rate": 9.719350137543873e-05, + "loss": 0.45601425170898435, + "step": 7510 + }, + { + "epoch": 0.03228493169504478, + "grad_norm": 0.060955505818128586, + "learning_rate": 9.71891896553211e-05, + "loss": 0.3038959980010986, + "step": 7520 + }, + { + "epoch": 0.03232786378506478, + "grad_norm": 0.03209978714585304, + "learning_rate": 9.718487793520348e-05, + "loss": 0.22680823802947997, + "step": 7530 + }, + { + "epoch": 0.03237079587508479, + "grad_norm": 1.2087823152542114, + "learning_rate": 9.718056621508586e-05, + "loss": 0.2598066806793213, + "step": 7540 + }, + { + "epoch": 0.0324137279651048, + "grad_norm": 0.1490267813205719, + "learning_rate": 9.717625449496822e-05, + "loss": 0.3735307216644287, + "step": 7550 + }, + { + "epoch": 0.032456660055124806, + "grad_norm": 1.4957003593444824, + "learning_rate": 9.71719427748506e-05, + "loss": 0.22178177833557128, + "step": 7560 + }, + { + "epoch": 0.03249959214514481, + "grad_norm": 4.107588291168213, + "learning_rate": 9.716763105473298e-05, + "loss": 0.2578477382659912, + "step": 7570 + }, + { + "epoch": 0.032542524235164814, + "grad_norm": 1.4969598054885864, + "learning_rate": 9.716331933461536e-05, + "loss": 0.2491441249847412, + "step": 7580 + }, + { + "epoch": 0.03258545632518482, + "grad_norm": 0.012556682340800762, + "learning_rate": 9.715900761449773e-05, + "loss": 0.10336203575134277, + "step": 7590 + }, + { + "epoch": 0.03262838841520483, + "grad_norm": 0.5561529994010925, + "learning_rate": 9.715469589438011e-05, + "loss": 0.2101090669631958, + "step": 7600 + }, + { + "epoch": 0.03267132050522484, + "grad_norm": 0.02059774659574032, + "learning_rate": 9.715038417426249e-05, + "loss": 0.11110254526138305, + "step": 7610 + }, + { + "epoch": 0.03271425259524484, + "grad_norm": 0.034311916679143906, + "learning_rate": 9.714607245414486e-05, + "loss": 0.364142918586731, + "step": 7620 + }, + { + "epoch": 0.03275718468526485, + "grad_norm": 11.460116386413574, + "learning_rate": 9.714176073402724e-05, + "loss": 0.3021425247192383, + "step": 7630 + }, + { + "epoch": 0.032800116775284854, + "grad_norm": 4.155307769775391, + "learning_rate": 9.71374490139096e-05, + "loss": 0.5520795822143555, + "step": 7640 + }, + { + "epoch": 0.03284304886530486, + "grad_norm": 0.372206449508667, + "learning_rate": 9.713313729379198e-05, + "loss": 0.3026304006576538, + "step": 7650 + }, + { + "epoch": 0.03288598095532487, + "grad_norm": 0.045250970870256424, + "learning_rate": 9.712882557367436e-05, + "loss": 0.21342151165008544, + "step": 7660 + }, + { + "epoch": 0.03292891304534487, + "grad_norm": 0.015753526240587234, + "learning_rate": 9.712451385355674e-05, + "loss": 0.21125319004058837, + "step": 7670 + }, + { + "epoch": 0.03297184513536488, + "grad_norm": 0.05379635840654373, + "learning_rate": 9.712020213343912e-05, + "loss": 0.29408860206604004, + "step": 7680 + }, + { + "epoch": 0.033014777225384886, + "grad_norm": 0.02940794639289379, + "learning_rate": 9.711589041332149e-05, + "loss": 0.34212937355041506, + "step": 7690 + }, + { + "epoch": 0.033057709315404894, + "grad_norm": 3.105747938156128, + "learning_rate": 9.711157869320387e-05, + "loss": 0.43459086418151854, + "step": 7700 + }, + { + "epoch": 0.0331006414054249, + "grad_norm": 0.9573909640312195, + "learning_rate": 9.710726697308625e-05, + "loss": 0.08446192145347595, + "step": 7710 + }, + { + "epoch": 0.0331435734954449, + "grad_norm": 0.06909049302339554, + "learning_rate": 9.710295525296862e-05, + "loss": 0.16420615911483766, + "step": 7720 + }, + { + "epoch": 0.03318650558546491, + "grad_norm": 1.0823670625686646, + "learning_rate": 9.7098643532851e-05, + "loss": 0.3220996618270874, + "step": 7730 + }, + { + "epoch": 0.03322943767548492, + "grad_norm": 0.010625721886754036, + "learning_rate": 9.709433181273338e-05, + "loss": 0.27078948020935056, + "step": 7740 + }, + { + "epoch": 0.033272369765504926, + "grad_norm": 0.01120381336659193, + "learning_rate": 9.709002009261576e-05, + "loss": 0.20582714080810546, + "step": 7750 + }, + { + "epoch": 0.03331530185552493, + "grad_norm": 0.04764774441719055, + "learning_rate": 9.708570837249813e-05, + "loss": 0.2562859296798706, + "step": 7760 + }, + { + "epoch": 0.033358233945544935, + "grad_norm": 0.02361004240810871, + "learning_rate": 9.708139665238051e-05, + "loss": 0.1400162696838379, + "step": 7770 + }, + { + "epoch": 0.03340116603556494, + "grad_norm": 11.817069053649902, + "learning_rate": 9.707708493226289e-05, + "loss": 0.31052820682525634, + "step": 7780 + }, + { + "epoch": 0.03344409812558495, + "grad_norm": 0.22933602333068848, + "learning_rate": 9.707277321214527e-05, + "loss": 0.3289341449737549, + "step": 7790 + }, + { + "epoch": 0.03348703021560496, + "grad_norm": 0.6220147609710693, + "learning_rate": 9.706846149202763e-05, + "loss": 0.19306013584136963, + "step": 7800 + }, + { + "epoch": 0.03352996230562496, + "grad_norm": 2.844313383102417, + "learning_rate": 9.706414977191001e-05, + "loss": 0.08610989451408387, + "step": 7810 + }, + { + "epoch": 0.03357289439564497, + "grad_norm": 5.575182914733887, + "learning_rate": 9.705983805179238e-05, + "loss": 0.2134272575378418, + "step": 7820 + }, + { + "epoch": 0.033615826485664975, + "grad_norm": 0.06617298722267151, + "learning_rate": 9.705552633167476e-05, + "loss": 0.19738913774490358, + "step": 7830 + }, + { + "epoch": 0.03365875857568498, + "grad_norm": 0.01672886684536934, + "learning_rate": 9.705121461155714e-05, + "loss": 0.25262153148651123, + "step": 7840 + }, + { + "epoch": 0.03370169066570499, + "grad_norm": 0.4519057273864746, + "learning_rate": 9.704690289143952e-05, + "loss": 0.1626684069633484, + "step": 7850 + }, + { + "epoch": 0.03374462275572499, + "grad_norm": 0.6325151920318604, + "learning_rate": 9.70425911713219e-05, + "loss": 0.3695283651351929, + "step": 7860 + }, + { + "epoch": 0.033787554845745, + "grad_norm": 1.0352468490600586, + "learning_rate": 9.703827945120427e-05, + "loss": 0.260190749168396, + "step": 7870 + }, + { + "epoch": 0.03383048693576501, + "grad_norm": 0.08981958031654358, + "learning_rate": 9.703396773108663e-05, + "loss": 0.29672319889068605, + "step": 7880 + }, + { + "epoch": 0.033873419025785015, + "grad_norm": 0.9225382208824158, + "learning_rate": 9.702965601096901e-05, + "loss": 0.18002406358718873, + "step": 7890 + }, + { + "epoch": 0.03391635111580502, + "grad_norm": 2.109400749206543, + "learning_rate": 9.702534429085139e-05, + "loss": 0.10338685512542725, + "step": 7900 + }, + { + "epoch": 0.033959283205825024, + "grad_norm": 0.05137547478079796, + "learning_rate": 9.702103257073377e-05, + "loss": 0.24235978126525878, + "step": 7910 + }, + { + "epoch": 0.03400221529584503, + "grad_norm": 1.9895689487457275, + "learning_rate": 9.701672085061614e-05, + "loss": 0.17490397691726683, + "step": 7920 + }, + { + "epoch": 0.03404514738586504, + "grad_norm": 4.55086088180542, + "learning_rate": 9.701240913049852e-05, + "loss": 0.2837538719177246, + "step": 7930 + }, + { + "epoch": 0.03408807947588505, + "grad_norm": 6.31823205947876, + "learning_rate": 9.70080974103809e-05, + "loss": 0.2951412916183472, + "step": 7940 + }, + { + "epoch": 0.034131011565905055, + "grad_norm": 0.03623563051223755, + "learning_rate": 9.700378569026328e-05, + "loss": 0.06038150191307068, + "step": 7950 + }, + { + "epoch": 0.034173943655925056, + "grad_norm": 0.15342311561107635, + "learning_rate": 9.699947397014565e-05, + "loss": 0.24344947338104247, + "step": 7960 + }, + { + "epoch": 0.03421687574594506, + "grad_norm": 5.179479598999023, + "learning_rate": 9.699516225002803e-05, + "loss": 0.20732619762420654, + "step": 7970 + }, + { + "epoch": 0.03425980783596507, + "grad_norm": 0.01350562646985054, + "learning_rate": 9.699085052991041e-05, + "loss": 0.3624278545379639, + "step": 7980 + }, + { + "epoch": 0.03430273992598508, + "grad_norm": 3.3068830966949463, + "learning_rate": 9.698653880979279e-05, + "loss": 0.45979924201965333, + "step": 7990 + }, + { + "epoch": 0.03434567201600508, + "grad_norm": 1.9648011922836304, + "learning_rate": 9.698222708967516e-05, + "loss": 0.38761961460113525, + "step": 8000 + }, + { + "epoch": 0.03434567201600508, + "eval_loss": 0.5358114838600159, + "eval_runtime": 27.4067, + "eval_samples_per_second": 3.649, + "eval_steps_per_second": 3.649, + "step": 8000 + }, + { + "epoch": 0.03438860410602509, + "grad_norm": 7.072238445281982, + "learning_rate": 9.697791536955754e-05, + "loss": 0.32979588508605956, + "step": 8010 + }, + { + "epoch": 0.034431536196045096, + "grad_norm": 0.47375860810279846, + "learning_rate": 9.697360364943992e-05, + "loss": 0.35645227432250975, + "step": 8020 + }, + { + "epoch": 0.0344744682860651, + "grad_norm": 6.805625915527344, + "learning_rate": 9.69692919293223e-05, + "loss": 0.45725326538085936, + "step": 8030 + }, + { + "epoch": 0.03451740037608511, + "grad_norm": 1.7845739126205444, + "learning_rate": 9.696498020920467e-05, + "loss": 0.3593878269195557, + "step": 8040 + }, + { + "epoch": 0.03456033246610511, + "grad_norm": 3.2586820125579834, + "learning_rate": 9.696066848908704e-05, + "loss": 0.22868585586547852, + "step": 8050 + }, + { + "epoch": 0.03460326455612512, + "grad_norm": 3.4146788120269775, + "learning_rate": 9.695635676896941e-05, + "loss": 0.33050105571746824, + "step": 8060 + }, + { + "epoch": 0.03464619664614513, + "grad_norm": 0.10858240723609924, + "learning_rate": 9.695204504885179e-05, + "loss": 0.3089393377304077, + "step": 8070 + }, + { + "epoch": 0.034689128736165135, + "grad_norm": 4.090073108673096, + "learning_rate": 9.694773332873417e-05, + "loss": 0.2587622404098511, + "step": 8080 + }, + { + "epoch": 0.03473206082618514, + "grad_norm": 1.6025853157043457, + "learning_rate": 9.694342160861655e-05, + "loss": 0.2438603162765503, + "step": 8090 + }, + { + "epoch": 0.034774992916205144, + "grad_norm": 1.3067179918289185, + "learning_rate": 9.693910988849892e-05, + "loss": 0.40871009826660154, + "step": 8100 + }, + { + "epoch": 0.03481792500622515, + "grad_norm": 1.1220425367355347, + "learning_rate": 9.69347981683813e-05, + "loss": 0.2651923656463623, + "step": 8110 + }, + { + "epoch": 0.03486085709624516, + "grad_norm": 5.3396124839782715, + "learning_rate": 9.693048644826368e-05, + "loss": 0.3964865684509277, + "step": 8120 + }, + { + "epoch": 0.03490378918626517, + "grad_norm": 0.22761444747447968, + "learning_rate": 9.692617472814604e-05, + "loss": 0.3448660373687744, + "step": 8130 + }, + { + "epoch": 0.034946721276285175, + "grad_norm": 1.0162402391433716, + "learning_rate": 9.692186300802842e-05, + "loss": 0.4648458480834961, + "step": 8140 + }, + { + "epoch": 0.034989653366305176, + "grad_norm": 4.638467311859131, + "learning_rate": 9.69175512879108e-05, + "loss": 0.18537842035293578, + "step": 8150 + }, + { + "epoch": 0.035032585456325184, + "grad_norm": 0.12439771741628647, + "learning_rate": 9.691323956779317e-05, + "loss": 0.34654300212860106, + "step": 8160 + }, + { + "epoch": 0.03507551754634519, + "grad_norm": 1.0256967544555664, + "learning_rate": 9.690892784767555e-05, + "loss": 0.19255000352859497, + "step": 8170 + }, + { + "epoch": 0.0351184496363652, + "grad_norm": 0.98158860206604, + "learning_rate": 9.690461612755793e-05, + "loss": 0.1516349196434021, + "step": 8180 + }, + { + "epoch": 0.0351613817263852, + "grad_norm": 1.5796022415161133, + "learning_rate": 9.69003044074403e-05, + "loss": 0.19203962087631227, + "step": 8190 + }, + { + "epoch": 0.03520431381640521, + "grad_norm": 0.25830766558647156, + "learning_rate": 9.68959926873227e-05, + "loss": 0.2479844331741333, + "step": 8200 + }, + { + "epoch": 0.035247245906425216, + "grad_norm": 1.2110050916671753, + "learning_rate": 9.689168096720506e-05, + "loss": 0.13889732360839843, + "step": 8210 + }, + { + "epoch": 0.035290177996445224, + "grad_norm": 3.430422306060791, + "learning_rate": 9.688736924708744e-05, + "loss": 0.3377073287963867, + "step": 8220 + }, + { + "epoch": 0.03533311008646523, + "grad_norm": 1.386106014251709, + "learning_rate": 9.688305752696981e-05, + "loss": 0.3620351552963257, + "step": 8230 + }, + { + "epoch": 0.03537604217648523, + "grad_norm": 0.27436643838882446, + "learning_rate": 9.687874580685219e-05, + "loss": 0.0908839225769043, + "step": 8240 + }, + { + "epoch": 0.03541897426650524, + "grad_norm": 0.08288343250751495, + "learning_rate": 9.687443408673457e-05, + "loss": 0.20369722843170165, + "step": 8250 + }, + { + "epoch": 0.03546190635652525, + "grad_norm": 0.0119446637108922, + "learning_rate": 9.687012236661695e-05, + "loss": 0.4165679931640625, + "step": 8260 + }, + { + "epoch": 0.035504838446545256, + "grad_norm": 0.8607707619667053, + "learning_rate": 9.686581064649932e-05, + "loss": 0.13348482847213744, + "step": 8270 + }, + { + "epoch": 0.035547770536565264, + "grad_norm": 1.3179068565368652, + "learning_rate": 9.68614989263817e-05, + "loss": 0.22979404926300048, + "step": 8280 + }, + { + "epoch": 0.035590702626585265, + "grad_norm": 1.5900253057479858, + "learning_rate": 9.685718720626407e-05, + "loss": 0.35587825775146487, + "step": 8290 + }, + { + "epoch": 0.03563363471660527, + "grad_norm": 0.029487568885087967, + "learning_rate": 9.685287548614644e-05, + "loss": 0.20372748374938965, + "step": 8300 + }, + { + "epoch": 0.03567656680662528, + "grad_norm": 0.9263527989387512, + "learning_rate": 9.684856376602882e-05, + "loss": 0.17852827310562133, + "step": 8310 + }, + { + "epoch": 0.03571949889664529, + "grad_norm": 2.308929204940796, + "learning_rate": 9.68442520459112e-05, + "loss": 0.21045780181884766, + "step": 8320 + }, + { + "epoch": 0.035762430986665296, + "grad_norm": 0.1372317671775818, + "learning_rate": 9.683994032579357e-05, + "loss": 0.1990463376045227, + "step": 8330 + }, + { + "epoch": 0.0358053630766853, + "grad_norm": 0.036591432988643646, + "learning_rate": 9.683562860567595e-05, + "loss": 0.33791069984436034, + "step": 8340 + }, + { + "epoch": 0.035848295166705305, + "grad_norm": 0.03384740278124809, + "learning_rate": 9.683131688555833e-05, + "loss": 0.17970336675643922, + "step": 8350 + }, + { + "epoch": 0.03589122725672531, + "grad_norm": 0.06115401163697243, + "learning_rate": 9.68270051654407e-05, + "loss": 0.38286256790161133, + "step": 8360 + }, + { + "epoch": 0.03593415934674532, + "grad_norm": 1.263088345527649, + "learning_rate": 9.682269344532308e-05, + "loss": 0.42797145843505857, + "step": 8370 + }, + { + "epoch": 0.03597709143676533, + "grad_norm": 1.6685699224472046, + "learning_rate": 9.681838172520545e-05, + "loss": 0.4042956352233887, + "step": 8380 + }, + { + "epoch": 0.03602002352678533, + "grad_norm": 0.38085174560546875, + "learning_rate": 9.681407000508783e-05, + "loss": 0.3088876008987427, + "step": 8390 + }, + { + "epoch": 0.03606295561680534, + "grad_norm": 1.448493242263794, + "learning_rate": 9.68097582849702e-05, + "loss": 0.2769587755203247, + "step": 8400 + }, + { + "epoch": 0.036105887706825344, + "grad_norm": 0.03457849845290184, + "learning_rate": 9.680544656485258e-05, + "loss": 0.15025020837783815, + "step": 8410 + }, + { + "epoch": 0.03614881979684535, + "grad_norm": 0.06887169182300568, + "learning_rate": 9.680113484473497e-05, + "loss": 0.2938064098358154, + "step": 8420 + }, + { + "epoch": 0.03619175188686535, + "grad_norm": 1.1634526252746582, + "learning_rate": 9.679682312461735e-05, + "loss": 0.2335674524307251, + "step": 8430 + }, + { + "epoch": 0.03623468397688536, + "grad_norm": 1.4806514978408813, + "learning_rate": 9.679251140449973e-05, + "loss": 0.3837287902832031, + "step": 8440 + }, + { + "epoch": 0.03627761606690537, + "grad_norm": 2.3947699069976807, + "learning_rate": 9.67881996843821e-05, + "loss": 0.22996058464050292, + "step": 8450 + }, + { + "epoch": 0.03632054815692538, + "grad_norm": 1.957567811012268, + "learning_rate": 9.678388796426447e-05, + "loss": 0.35875611305236815, + "step": 8460 + }, + { + "epoch": 0.036363480246945384, + "grad_norm": 1.2009612321853638, + "learning_rate": 9.677957624414684e-05, + "loss": 0.3392146587371826, + "step": 8470 + }, + { + "epoch": 0.036406412336965385, + "grad_norm": 0.06605121493339539, + "learning_rate": 9.677526452402922e-05, + "loss": 0.13843777179718017, + "step": 8480 + }, + { + "epoch": 0.03644934442698539, + "grad_norm": 0.011332832276821136, + "learning_rate": 9.67709528039116e-05, + "loss": 0.1935071587562561, + "step": 8490 + }, + { + "epoch": 0.0364922765170054, + "grad_norm": 0.030879681929945946, + "learning_rate": 9.676664108379398e-05, + "loss": 0.24691197872161866, + "step": 8500 + }, + { + "epoch": 0.03653520860702541, + "grad_norm": 1.4498698711395264, + "learning_rate": 9.676232936367635e-05, + "loss": 0.2351780891418457, + "step": 8510 + }, + { + "epoch": 0.036578140697045416, + "grad_norm": 11.490880966186523, + "learning_rate": 9.675801764355873e-05, + "loss": 0.20597553253173828, + "step": 8520 + }, + { + "epoch": 0.03662107278706542, + "grad_norm": 26.793243408203125, + "learning_rate": 9.675370592344111e-05, + "loss": 0.30491702556610106, + "step": 8530 + }, + { + "epoch": 0.036664004877085425, + "grad_norm": 2.609809637069702, + "learning_rate": 9.674939420332347e-05, + "loss": 0.28061366081237793, + "step": 8540 + }, + { + "epoch": 0.03670693696710543, + "grad_norm": 3.811695098876953, + "learning_rate": 9.674508248320585e-05, + "loss": 0.20124802589416504, + "step": 8550 + }, + { + "epoch": 0.03674986905712544, + "grad_norm": 2.070798635482788, + "learning_rate": 9.674077076308823e-05, + "loss": 0.26569738388061526, + "step": 8560 + }, + { + "epoch": 0.03679280114714545, + "grad_norm": 0.004637656267732382, + "learning_rate": 9.67364590429706e-05, + "loss": 0.23294551372528077, + "step": 8570 + }, + { + "epoch": 0.03683573323716545, + "grad_norm": 1.5657484531402588, + "learning_rate": 9.673214732285298e-05, + "loss": 0.5221776008605957, + "step": 8580 + }, + { + "epoch": 0.03687866532718546, + "grad_norm": 0.05727636069059372, + "learning_rate": 9.672783560273536e-05, + "loss": 0.30965242385864256, + "step": 8590 + }, + { + "epoch": 0.036921597417205465, + "grad_norm": 0.1257968246936798, + "learning_rate": 9.672352388261774e-05, + "loss": 0.3840054512023926, + "step": 8600 + }, + { + "epoch": 0.03696452950722547, + "grad_norm": 1.1655479669570923, + "learning_rate": 9.671921216250011e-05, + "loss": 0.26555461883544923, + "step": 8610 + }, + { + "epoch": 0.037007461597245474, + "grad_norm": 1.1678481101989746, + "learning_rate": 9.671490044238248e-05, + "loss": 0.22103781700134278, + "step": 8620 + }, + { + "epoch": 0.03705039368726548, + "grad_norm": 2.2759597301483154, + "learning_rate": 9.671058872226485e-05, + "loss": 0.2630741596221924, + "step": 8630 + }, + { + "epoch": 0.03709332577728549, + "grad_norm": 0.07085221260786057, + "learning_rate": 9.670627700214725e-05, + "loss": 0.36764571666717527, + "step": 8640 + }, + { + "epoch": 0.0371362578673055, + "grad_norm": 0.13278961181640625, + "learning_rate": 9.670196528202962e-05, + "loss": 0.21104824542999268, + "step": 8650 + }, + { + "epoch": 0.037179189957325505, + "grad_norm": 0.27886033058166504, + "learning_rate": 9.6697653561912e-05, + "loss": 0.20368828773498535, + "step": 8660 + }, + { + "epoch": 0.037222122047345506, + "grad_norm": 0.15865810215473175, + "learning_rate": 9.669334184179438e-05, + "loss": 0.11764969825744628, + "step": 8670 + }, + { + "epoch": 0.037265054137365514, + "grad_norm": 2.8486526012420654, + "learning_rate": 9.668903012167675e-05, + "loss": 0.3190793991088867, + "step": 8680 + }, + { + "epoch": 0.03730798622738552, + "grad_norm": 2.6860265731811523, + "learning_rate": 9.668471840155913e-05, + "loss": 0.34027435779571535, + "step": 8690 + }, + { + "epoch": 0.03735091831740553, + "grad_norm": 0.10721374303102493, + "learning_rate": 9.668040668144151e-05, + "loss": 0.14675365686416625, + "step": 8700 + }, + { + "epoch": 0.03739385040742554, + "grad_norm": 1.0816599130630493, + "learning_rate": 9.667609496132387e-05, + "loss": 0.20134527683258058, + "step": 8710 + }, + { + "epoch": 0.03743678249744554, + "grad_norm": 3.764616012573242, + "learning_rate": 9.667178324120625e-05, + "loss": 0.37100017070770264, + "step": 8720 + }, + { + "epoch": 0.037479714587465546, + "grad_norm": 0.07891738414764404, + "learning_rate": 9.666747152108863e-05, + "loss": 0.1324814200401306, + "step": 8730 + }, + { + "epoch": 0.037522646677485554, + "grad_norm": 0.8121300935745239, + "learning_rate": 9.6663159800971e-05, + "loss": 0.1912623643875122, + "step": 8740 + }, + { + "epoch": 0.03756557876750556, + "grad_norm": 0.6871600151062012, + "learning_rate": 9.665884808085338e-05, + "loss": 0.40148634910583497, + "step": 8750 + }, + { + "epoch": 0.03760851085752557, + "grad_norm": 1.060194492340088, + "learning_rate": 9.665453636073576e-05, + "loss": 0.23012053966522217, + "step": 8760 + }, + { + "epoch": 0.03765144294754557, + "grad_norm": 4.619421005249023, + "learning_rate": 9.665022464061814e-05, + "loss": 0.447019100189209, + "step": 8770 + }, + { + "epoch": 0.03769437503756558, + "grad_norm": 1.044754147529602, + "learning_rate": 9.664591292050051e-05, + "loss": 0.2604886293411255, + "step": 8780 + }, + { + "epoch": 0.037737307127585586, + "grad_norm": 0.06234363093972206, + "learning_rate": 9.664160120038288e-05, + "loss": 0.3195174694061279, + "step": 8790 + }, + { + "epoch": 0.03778023921760559, + "grad_norm": 10.548328399658203, + "learning_rate": 9.663728948026526e-05, + "loss": 0.5750315666198731, + "step": 8800 + }, + { + "epoch": 0.0378231713076256, + "grad_norm": 0.00025344491587020457, + "learning_rate": 9.663297776014763e-05, + "loss": 0.09640651941299438, + "step": 8810 + }, + { + "epoch": 0.0378661033976456, + "grad_norm": 0.042145974934101105, + "learning_rate": 9.662866604003001e-05, + "loss": 0.42340850830078125, + "step": 8820 + }, + { + "epoch": 0.03790903548766561, + "grad_norm": 13.936864852905273, + "learning_rate": 9.662435431991239e-05, + "loss": 0.2134354829788208, + "step": 8830 + }, + { + "epoch": 0.03795196757768562, + "grad_norm": 7.359281539916992, + "learning_rate": 9.662004259979477e-05, + "loss": 0.3773427248001099, + "step": 8840 + }, + { + "epoch": 0.037994899667705626, + "grad_norm": 1.1258646249771118, + "learning_rate": 9.661573087967714e-05, + "loss": 0.15646349191665648, + "step": 8850 + }, + { + "epoch": 0.038037831757725626, + "grad_norm": 0.6341869831085205, + "learning_rate": 9.661141915955952e-05, + "loss": 0.23707275390625, + "step": 8860 + }, + { + "epoch": 0.038080763847745634, + "grad_norm": 0.02454477548599243, + "learning_rate": 9.66071074394419e-05, + "loss": 0.3640810251235962, + "step": 8870 + }, + { + "epoch": 0.03812369593776564, + "grad_norm": 0.47005584836006165, + "learning_rate": 9.660279571932427e-05, + "loss": 0.1894970417022705, + "step": 8880 + }, + { + "epoch": 0.03816662802778565, + "grad_norm": 4.86158561706543, + "learning_rate": 9.659848399920665e-05, + "loss": 0.23494954109191896, + "step": 8890 + }, + { + "epoch": 0.03820956011780566, + "grad_norm": 1.4044920206069946, + "learning_rate": 9.659417227908903e-05, + "loss": 0.23539886474609376, + "step": 8900 + }, + { + "epoch": 0.03825249220782566, + "grad_norm": 1.6679009199142456, + "learning_rate": 9.65898605589714e-05, + "loss": 0.32893080711364747, + "step": 8910 + }, + { + "epoch": 0.038295424297845666, + "grad_norm": 0.13777175545692444, + "learning_rate": 9.658554883885378e-05, + "loss": 0.35626609325408937, + "step": 8920 + }, + { + "epoch": 0.038338356387865674, + "grad_norm": 2.8348093032836914, + "learning_rate": 9.658123711873616e-05, + "loss": 0.24591367244720458, + "step": 8930 + }, + { + "epoch": 0.03838128847788568, + "grad_norm": 5.880456447601318, + "learning_rate": 9.657692539861854e-05, + "loss": 0.40456109046936034, + "step": 8940 + }, + { + "epoch": 0.03842422056790569, + "grad_norm": 1.8546247482299805, + "learning_rate": 9.65726136785009e-05, + "loss": 0.34841208457946776, + "step": 8950 + }, + { + "epoch": 0.03846715265792569, + "grad_norm": 0.12906832993030548, + "learning_rate": 9.656830195838328e-05, + "loss": 0.19071357250213622, + "step": 8960 + }, + { + "epoch": 0.0385100847479457, + "grad_norm": 1.4908124208450317, + "learning_rate": 9.656399023826566e-05, + "loss": 0.14313187599182128, + "step": 8970 + }, + { + "epoch": 0.038553016837965706, + "grad_norm": 1.9998912811279297, + "learning_rate": 9.655967851814803e-05, + "loss": 0.27379117012023924, + "step": 8980 + }, + { + "epoch": 0.038595948927985714, + "grad_norm": 3.8102869987487793, + "learning_rate": 9.655536679803041e-05, + "loss": 0.18558567762374878, + "step": 8990 + }, + { + "epoch": 0.03863888101800572, + "grad_norm": 0.010952414944767952, + "learning_rate": 9.655105507791279e-05, + "loss": 0.2001218318939209, + "step": 9000 + }, + { + "epoch": 0.03863888101800572, + "eval_loss": 0.5299412608146667, + "eval_runtime": 27.3994, + "eval_samples_per_second": 3.65, + "eval_steps_per_second": 3.65, + "step": 9000 + }, + { + "epoch": 0.03868181310802572, + "grad_norm": 0.018099522218108177, + "learning_rate": 9.654674335779517e-05, + "loss": 0.2434915065765381, + "step": 9010 + }, + { + "epoch": 0.03872474519804573, + "grad_norm": 2.5344526767730713, + "learning_rate": 9.654243163767754e-05, + "loss": 0.1441143274307251, + "step": 9020 + }, + { + "epoch": 0.03876767728806574, + "grad_norm": 0.2745101749897003, + "learning_rate": 9.653811991755991e-05, + "loss": 0.26484873294830324, + "step": 9030 + }, + { + "epoch": 0.038810609378085746, + "grad_norm": 3.360513925552368, + "learning_rate": 9.653380819744228e-05, + "loss": 0.1652582883834839, + "step": 9040 + }, + { + "epoch": 0.03885354146810575, + "grad_norm": 2.069638729095459, + "learning_rate": 9.652949647732466e-05, + "loss": 0.037511253356933595, + "step": 9050 + }, + { + "epoch": 0.038896473558125755, + "grad_norm": 0.9083765745162964, + "learning_rate": 9.652518475720704e-05, + "loss": 0.1980634808540344, + "step": 9060 + }, + { + "epoch": 0.03893940564814576, + "grad_norm": 2.604505777359009, + "learning_rate": 9.652087303708942e-05, + "loss": 0.329839825630188, + "step": 9070 + }, + { + "epoch": 0.03898233773816577, + "grad_norm": 0.0077382526360452175, + "learning_rate": 9.65165613169718e-05, + "loss": 0.2762957334518433, + "step": 9080 + }, + { + "epoch": 0.03902526982818578, + "grad_norm": 0.0359419547021389, + "learning_rate": 9.651224959685417e-05, + "loss": 0.1422470211982727, + "step": 9090 + }, + { + "epoch": 0.03906820191820578, + "grad_norm": 0.015391089953482151, + "learning_rate": 9.650793787673655e-05, + "loss": 0.2391214370727539, + "step": 9100 + }, + { + "epoch": 0.03911113400822579, + "grad_norm": 3.860656976699829, + "learning_rate": 9.650362615661893e-05, + "loss": 0.3611257553100586, + "step": 9110 + }, + { + "epoch": 0.039154066098245795, + "grad_norm": 0.0004279070708435029, + "learning_rate": 9.64993144365013e-05, + "loss": 0.10785633325576782, + "step": 9120 + }, + { + "epoch": 0.0391969981882658, + "grad_norm": 2.397517204284668, + "learning_rate": 9.649500271638368e-05, + "loss": 0.2859855890274048, + "step": 9130 + }, + { + "epoch": 0.03923993027828581, + "grad_norm": 0.003950928803533316, + "learning_rate": 9.649069099626606e-05, + "loss": 0.41271052360534666, + "step": 9140 + }, + { + "epoch": 0.03928286236830581, + "grad_norm": 0.005247312132269144, + "learning_rate": 9.648637927614844e-05, + "loss": 0.015036699175834656, + "step": 9150 + }, + { + "epoch": 0.03932579445832582, + "grad_norm": 0.058225881308317184, + "learning_rate": 9.648206755603081e-05, + "loss": 0.23445894718170165, + "step": 9160 + }, + { + "epoch": 0.03936872654834583, + "grad_norm": 0.16761603951454163, + "learning_rate": 9.647775583591319e-05, + "loss": 0.2269341230392456, + "step": 9170 + }, + { + "epoch": 0.039411658638365835, + "grad_norm": 0.09319623559713364, + "learning_rate": 9.647344411579557e-05, + "loss": 0.2253098726272583, + "step": 9180 + }, + { + "epoch": 0.03945459072838584, + "grad_norm": 0.0020092339254915714, + "learning_rate": 9.646913239567794e-05, + "loss": 0.20597522258758544, + "step": 9190 + }, + { + "epoch": 0.03949752281840584, + "grad_norm": 0.03602172061800957, + "learning_rate": 9.646482067556031e-05, + "loss": 0.19670722484588624, + "step": 9200 + }, + { + "epoch": 0.03954045490842585, + "grad_norm": 0.00028024084167554975, + "learning_rate": 9.646050895544269e-05, + "loss": 0.2556697607040405, + "step": 9210 + }, + { + "epoch": 0.03958338699844586, + "grad_norm": 0.00420133862644434, + "learning_rate": 9.645619723532506e-05, + "loss": 0.292952561378479, + "step": 9220 + }, + { + "epoch": 0.03962631908846587, + "grad_norm": 5.0143351554870605, + "learning_rate": 9.645188551520744e-05, + "loss": 0.20465404987335206, + "step": 9230 + }, + { + "epoch": 0.039669251178485875, + "grad_norm": 0.3830595016479492, + "learning_rate": 9.644757379508982e-05, + "loss": 0.36404032707214357, + "step": 9240 + }, + { + "epoch": 0.039712183268505875, + "grad_norm": 0.19368067383766174, + "learning_rate": 9.64432620749722e-05, + "loss": 0.19242271184921264, + "step": 9250 + }, + { + "epoch": 0.03975511535852588, + "grad_norm": 1.0904446840286255, + "learning_rate": 9.643895035485457e-05, + "loss": 0.36784839630126953, + "step": 9260 + }, + { + "epoch": 0.03979804744854589, + "grad_norm": 0.11875243484973907, + "learning_rate": 9.643463863473695e-05, + "loss": 0.10239818096160888, + "step": 9270 + }, + { + "epoch": 0.0398409795385659, + "grad_norm": 1.6222749948501587, + "learning_rate": 9.643032691461931e-05, + "loss": 0.1739649772644043, + "step": 9280 + }, + { + "epoch": 0.0398839116285859, + "grad_norm": 0.04937111586332321, + "learning_rate": 9.642601519450169e-05, + "loss": 0.28331646919250486, + "step": 9290 + }, + { + "epoch": 0.03992684371860591, + "grad_norm": 0.16273736953735352, + "learning_rate": 9.642170347438407e-05, + "loss": 0.5411765575408936, + "step": 9300 + }, + { + "epoch": 0.039969775808625915, + "grad_norm": 0.07416556030511856, + "learning_rate": 9.641739175426645e-05, + "loss": 0.4258988380432129, + "step": 9310 + }, + { + "epoch": 0.04001270789864592, + "grad_norm": 1.5317014455795288, + "learning_rate": 9.641308003414882e-05, + "loss": 0.3962272644042969, + "step": 9320 + }, + { + "epoch": 0.04005563998866593, + "grad_norm": 0.04391628876328468, + "learning_rate": 9.64087683140312e-05, + "loss": 0.12165892124176025, + "step": 9330 + }, + { + "epoch": 0.04009857207868593, + "grad_norm": 0.01463572308421135, + "learning_rate": 9.640445659391358e-05, + "loss": 0.05805497765541077, + "step": 9340 + }, + { + "epoch": 0.04014150416870594, + "grad_norm": 0.4974989891052246, + "learning_rate": 9.640014487379596e-05, + "loss": 0.41655526161193845, + "step": 9350 + }, + { + "epoch": 0.04018443625872595, + "grad_norm": 0.016591578722000122, + "learning_rate": 9.639583315367833e-05, + "loss": 0.1912694454193115, + "step": 9360 + }, + { + "epoch": 0.040227368348745955, + "grad_norm": 4.737026214599609, + "learning_rate": 9.639152143356071e-05, + "loss": 0.31126954555511477, + "step": 9370 + }, + { + "epoch": 0.04027030043876596, + "grad_norm": 9.977210998535156, + "learning_rate": 9.638720971344309e-05, + "loss": 0.24631636142730712, + "step": 9380 + }, + { + "epoch": 0.040313232528785964, + "grad_norm": 2.310892105102539, + "learning_rate": 9.638289799332546e-05, + "loss": 0.22611894607543945, + "step": 9390 + }, + { + "epoch": 0.04035616461880597, + "grad_norm": 0.2425278276205063, + "learning_rate": 9.637858627320784e-05, + "loss": 0.1980130195617676, + "step": 9400 + }, + { + "epoch": 0.04039909670882598, + "grad_norm": 3.5154387950897217, + "learning_rate": 9.637427455309022e-05, + "loss": 0.3665068864822388, + "step": 9410 + }, + { + "epoch": 0.04044202879884599, + "grad_norm": 0.0005664866184815764, + "learning_rate": 9.63699628329726e-05, + "loss": 0.09622042179107666, + "step": 9420 + }, + { + "epoch": 0.040484960888865995, + "grad_norm": 0.485542356967926, + "learning_rate": 9.636565111285497e-05, + "loss": 0.1345600962638855, + "step": 9430 + }, + { + "epoch": 0.040527892978885996, + "grad_norm": 0.0050071184523403645, + "learning_rate": 9.636133939273735e-05, + "loss": 0.18029772043228148, + "step": 9440 + }, + { + "epoch": 0.040570825068906004, + "grad_norm": 0.18960146605968475, + "learning_rate": 9.635702767261972e-05, + "loss": 0.3446162939071655, + "step": 9450 + }, + { + "epoch": 0.04061375715892601, + "grad_norm": 1.3484828472137451, + "learning_rate": 9.635271595250209e-05, + "loss": 0.3293300151824951, + "step": 9460 + }, + { + "epoch": 0.04065668924894602, + "grad_norm": 11.18226146697998, + "learning_rate": 9.634840423238447e-05, + "loss": 0.24156620502471923, + "step": 9470 + }, + { + "epoch": 0.04069962133896602, + "grad_norm": 0.29985514283180237, + "learning_rate": 9.634409251226685e-05, + "loss": 0.4308938026428223, + "step": 9480 + }, + { + "epoch": 0.04074255342898603, + "grad_norm": 0.7258480191230774, + "learning_rate": 9.633978079214922e-05, + "loss": 0.3799870729446411, + "step": 9490 + }, + { + "epoch": 0.040785485519006036, + "grad_norm": 0.06664323806762695, + "learning_rate": 9.63354690720316e-05, + "loss": 0.399434494972229, + "step": 9500 + }, + { + "epoch": 0.040828417609026044, + "grad_norm": 0.0492563471198082, + "learning_rate": 9.633115735191398e-05, + "loss": 0.253987193107605, + "step": 9510 + }, + { + "epoch": 0.04087134969904605, + "grad_norm": 0.0004341186722740531, + "learning_rate": 9.632684563179636e-05, + "loss": 0.22375996112823487, + "step": 9520 + }, + { + "epoch": 0.04091428178906605, + "grad_norm": 1.2711418867111206, + "learning_rate": 9.632253391167872e-05, + "loss": 0.4512036800384521, + "step": 9530 + }, + { + "epoch": 0.04095721387908606, + "grad_norm": 0.993800163269043, + "learning_rate": 9.63182221915611e-05, + "loss": 0.269797682762146, + "step": 9540 + }, + { + "epoch": 0.04100014596910607, + "grad_norm": 1.7412045001983643, + "learning_rate": 9.631391047144348e-05, + "loss": 0.3986591577529907, + "step": 9550 + }, + { + "epoch": 0.041043078059126076, + "grad_norm": 1.9665592908859253, + "learning_rate": 9.630959875132585e-05, + "loss": 0.3017338991165161, + "step": 9560 + }, + { + "epoch": 0.041086010149146084, + "grad_norm": 2.8084936141967773, + "learning_rate": 9.630528703120823e-05, + "loss": 0.572250509262085, + "step": 9570 + }, + { + "epoch": 0.041128942239166084, + "grad_norm": 0.5030828714370728, + "learning_rate": 9.630097531109061e-05, + "loss": 0.16660358905792236, + "step": 9580 + }, + { + "epoch": 0.04117187432918609, + "grad_norm": 0.05629109963774681, + "learning_rate": 9.629666359097298e-05, + "loss": 0.3073833465576172, + "step": 9590 + }, + { + "epoch": 0.0412148064192061, + "grad_norm": 0.2685781419277191, + "learning_rate": 9.629235187085536e-05, + "loss": 0.13678088188171386, + "step": 9600 + }, + { + "epoch": 0.04125773850922611, + "grad_norm": 0.522137463092804, + "learning_rate": 9.628804015073774e-05, + "loss": 0.5181337833404541, + "step": 9610 + }, + { + "epoch": 0.041300670599246116, + "grad_norm": 4.692966938018799, + "learning_rate": 9.628372843062012e-05, + "loss": 0.23460650444030762, + "step": 9620 + }, + { + "epoch": 0.04134360268926612, + "grad_norm": 0.04318451136350632, + "learning_rate": 9.62794167105025e-05, + "loss": 0.15581555366516114, + "step": 9630 + }, + { + "epoch": 0.041386534779286124, + "grad_norm": 8.463669776916504, + "learning_rate": 9.627510499038487e-05, + "loss": 0.2866586923599243, + "step": 9640 + }, + { + "epoch": 0.04142946686930613, + "grad_norm": 0.04951479285955429, + "learning_rate": 9.627079327026725e-05, + "loss": 0.3578941345214844, + "step": 9650 + }, + { + "epoch": 0.04147239895932614, + "grad_norm": 1.2476704120635986, + "learning_rate": 9.626648155014963e-05, + "loss": 0.23208155632019042, + "step": 9660 + }, + { + "epoch": 0.04151533104934614, + "grad_norm": 0.04957498982548714, + "learning_rate": 9.6262169830032e-05, + "loss": 0.09177039861679077, + "step": 9670 + }, + { + "epoch": 0.04155826313936615, + "grad_norm": 2.3629302978515625, + "learning_rate": 9.625785810991438e-05, + "loss": 0.35858590602874757, + "step": 9680 + }, + { + "epoch": 0.041601195229386156, + "grad_norm": 0.026421379297971725, + "learning_rate": 9.625354638979674e-05, + "loss": 0.19241467714309693, + "step": 9690 + }, + { + "epoch": 0.041644127319406164, + "grad_norm": 0.681612491607666, + "learning_rate": 9.624923466967912e-05, + "loss": 0.33527445793151855, + "step": 9700 + }, + { + "epoch": 0.04168705940942617, + "grad_norm": 5.426156044006348, + "learning_rate": 9.62449229495615e-05, + "loss": 0.49652848243713377, + "step": 9710 + }, + { + "epoch": 0.04172999149944617, + "grad_norm": 0.8129890561103821, + "learning_rate": 9.624061122944388e-05, + "loss": 0.28535943031311034, + "step": 9720 + }, + { + "epoch": 0.04177292358946618, + "grad_norm": 0.9698901176452637, + "learning_rate": 9.623629950932625e-05, + "loss": 0.5740030765533447, + "step": 9730 + }, + { + "epoch": 0.04181585567948619, + "grad_norm": 1.4005306959152222, + "learning_rate": 9.623198778920863e-05, + "loss": 0.5011133193969727, + "step": 9740 + }, + { + "epoch": 0.041858787769506196, + "grad_norm": 0.0013849869137629867, + "learning_rate": 9.622767606909101e-05, + "loss": 0.291644811630249, + "step": 9750 + }, + { + "epoch": 0.041901719859526204, + "grad_norm": 0.0004153474292252213, + "learning_rate": 9.622336434897339e-05, + "loss": 0.3130736112594604, + "step": 9760 + }, + { + "epoch": 0.041944651949546205, + "grad_norm": 7.660757541656494, + "learning_rate": 9.621905262885575e-05, + "loss": 0.29653196334838866, + "step": 9770 + }, + { + "epoch": 0.04198758403956621, + "grad_norm": 2.530366897583008, + "learning_rate": 9.621474090873813e-05, + "loss": 0.16898977756500244, + "step": 9780 + }, + { + "epoch": 0.04203051612958622, + "grad_norm": 0.23101027309894562, + "learning_rate": 9.62104291886205e-05, + "loss": 0.12413444519042968, + "step": 9790 + }, + { + "epoch": 0.04207344821960623, + "grad_norm": 28.41404151916504, + "learning_rate": 9.620611746850288e-05, + "loss": 0.15441542863845825, + "step": 9800 + }, + { + "epoch": 0.042116380309626236, + "grad_norm": 1.6069025993347168, + "learning_rate": 9.620180574838526e-05, + "loss": 0.3309926509857178, + "step": 9810 + }, + { + "epoch": 0.04215931239964624, + "grad_norm": 0.01833909936249256, + "learning_rate": 9.619749402826764e-05, + "loss": 0.18249744176864624, + "step": 9820 + }, + { + "epoch": 0.042202244489666245, + "grad_norm": 0.002266461029648781, + "learning_rate": 9.619318230815003e-05, + "loss": 0.34940640926361083, + "step": 9830 + }, + { + "epoch": 0.04224517657968625, + "grad_norm": 0.07000841945409775, + "learning_rate": 9.61888705880324e-05, + "loss": 0.1899822473526001, + "step": 9840 + }, + { + "epoch": 0.04228810866970626, + "grad_norm": 0.5853293538093567, + "learning_rate": 9.618455886791478e-05, + "loss": 0.40880498886108396, + "step": 9850 + }, + { + "epoch": 0.04233104075972627, + "grad_norm": 1.3320908546447754, + "learning_rate": 9.618024714779715e-05, + "loss": 0.3591744422912598, + "step": 9860 + }, + { + "epoch": 0.04237397284974627, + "grad_norm": 0.3487735986709595, + "learning_rate": 9.617593542767952e-05, + "loss": 0.1401280641555786, + "step": 9870 + }, + { + "epoch": 0.04241690493976628, + "grad_norm": 0.9527150988578796, + "learning_rate": 9.61716237075619e-05, + "loss": 0.2530334949493408, + "step": 9880 + }, + { + "epoch": 0.042459837029786285, + "grad_norm": 16.177743911743164, + "learning_rate": 9.616731198744428e-05, + "loss": 0.5055931568145752, + "step": 9890 + }, + { + "epoch": 0.04250276911980629, + "grad_norm": 0.6620330810546875, + "learning_rate": 9.616300026732665e-05, + "loss": 0.3270712375640869, + "step": 9900 + }, + { + "epoch": 0.042545701209826293, + "grad_norm": 0.22643379867076874, + "learning_rate": 9.615868854720903e-05, + "loss": 0.36966261863708494, + "step": 9910 + }, + { + "epoch": 0.0425886332998463, + "grad_norm": 0.24828827381134033, + "learning_rate": 9.615437682709141e-05, + "loss": 0.14217541217803956, + "step": 9920 + }, + { + "epoch": 0.04263156538986631, + "grad_norm": 0.807939887046814, + "learning_rate": 9.615006510697379e-05, + "loss": 0.3339355230331421, + "step": 9930 + }, + { + "epoch": 0.04267449747988632, + "grad_norm": 0.060862597078084946, + "learning_rate": 9.614575338685615e-05, + "loss": 0.24105873107910156, + "step": 9940 + }, + { + "epoch": 0.042717429569906325, + "grad_norm": 0.017723804339766502, + "learning_rate": 9.614144166673853e-05, + "loss": 0.192498779296875, + "step": 9950 + }, + { + "epoch": 0.042760361659926326, + "grad_norm": 4.271297931671143, + "learning_rate": 9.61371299466209e-05, + "loss": 0.17440344095230104, + "step": 9960 + }, + { + "epoch": 0.04280329374994633, + "grad_norm": 0.40663942694664, + "learning_rate": 9.613281822650328e-05, + "loss": 0.17575368881225586, + "step": 9970 + }, + { + "epoch": 0.04284622583996634, + "grad_norm": 0.34979286789894104, + "learning_rate": 9.612850650638566e-05, + "loss": 0.29747810363769533, + "step": 9980 + }, + { + "epoch": 0.04288915792998635, + "grad_norm": 1.386254906654358, + "learning_rate": 9.612419478626804e-05, + "loss": 0.20161771774291992, + "step": 9990 + }, + { + "epoch": 0.04293209002000636, + "grad_norm": 3.231767177581787, + "learning_rate": 9.611988306615041e-05, + "loss": 0.279221773147583, + "step": 10000 + }, + { + "epoch": 0.04293209002000636, + "eval_loss": 0.5372155904769897, + "eval_runtime": 27.4934, + "eval_samples_per_second": 3.637, + "eval_steps_per_second": 3.637, + "step": 10000 + }, + { + "epoch": 0.04297502211002636, + "grad_norm": 1.8212069272994995, + "learning_rate": 9.611557134603279e-05, + "loss": 0.29707605838775636, + "step": 10010 + }, + { + "epoch": 0.043017954200046365, + "grad_norm": 0.002717594150453806, + "learning_rate": 9.611125962591516e-05, + "loss": 0.3414538621902466, + "step": 10020 + }, + { + "epoch": 0.04306088629006637, + "grad_norm": 4.209356307983398, + "learning_rate": 9.610694790579753e-05, + "loss": 0.2345595121383667, + "step": 10030 + }, + { + "epoch": 0.04310381838008638, + "grad_norm": 1.0153359174728394, + "learning_rate": 9.610263618567991e-05, + "loss": 0.4312474250793457, + "step": 10040 + }, + { + "epoch": 0.04314675047010639, + "grad_norm": 0.9339410662651062, + "learning_rate": 9.60983244655623e-05, + "loss": 0.33216402530670164, + "step": 10050 + }, + { + "epoch": 0.04318968256012639, + "grad_norm": 0.008510474115610123, + "learning_rate": 9.609401274544468e-05, + "loss": 0.09502204060554505, + "step": 10060 + }, + { + "epoch": 0.0432326146501464, + "grad_norm": 5.747169017791748, + "learning_rate": 9.608970102532706e-05, + "loss": 0.44903016090393066, + "step": 10070 + }, + { + "epoch": 0.043275546740166405, + "grad_norm": 0.0010578184155747294, + "learning_rate": 9.608538930520943e-05, + "loss": 0.0894723355770111, + "step": 10080 + }, + { + "epoch": 0.04331847883018641, + "grad_norm": 0.317371129989624, + "learning_rate": 9.608107758509181e-05, + "loss": 0.33584930896759035, + "step": 10090 + }, + { + "epoch": 0.043361410920206414, + "grad_norm": 0.15126574039459229, + "learning_rate": 9.607676586497417e-05, + "loss": 0.26381702423095704, + "step": 10100 + }, + { + "epoch": 0.04340434301022642, + "grad_norm": 0.037118665874004364, + "learning_rate": 9.607245414485655e-05, + "loss": 0.09493091106414794, + "step": 10110 + }, + { + "epoch": 0.04344727510024643, + "grad_norm": 0.04892873018980026, + "learning_rate": 9.606814242473893e-05, + "loss": 0.2089691638946533, + "step": 10120 + }, + { + "epoch": 0.04349020719026644, + "grad_norm": 0.00034264527494087815, + "learning_rate": 9.60638307046213e-05, + "loss": 0.29033823013305665, + "step": 10130 + }, + { + "epoch": 0.043533139280286445, + "grad_norm": 0.2633804976940155, + "learning_rate": 9.605951898450368e-05, + "loss": 0.4061037540435791, + "step": 10140 + }, + { + "epoch": 0.043576071370306446, + "grad_norm": 0.3341623842716217, + "learning_rate": 9.605520726438606e-05, + "loss": 0.30220742225646974, + "step": 10150 + }, + { + "epoch": 0.043619003460326454, + "grad_norm": 6.020773887634277, + "learning_rate": 9.605089554426844e-05, + "loss": 0.15623061656951903, + "step": 10160 + }, + { + "epoch": 0.04366193555034646, + "grad_norm": 0.6320409774780273, + "learning_rate": 9.604658382415082e-05, + "loss": 0.19078081846237183, + "step": 10170 + }, + { + "epoch": 0.04370486764036647, + "grad_norm": 0.03318055346608162, + "learning_rate": 9.60422721040332e-05, + "loss": 0.21724910736083985, + "step": 10180 + }, + { + "epoch": 0.04374779973038648, + "grad_norm": 0.004824698902666569, + "learning_rate": 9.603796038391556e-05, + "loss": 0.08828012347221374, + "step": 10190 + }, + { + "epoch": 0.04379073182040648, + "grad_norm": 0.01676942966878414, + "learning_rate": 9.603364866379793e-05, + "loss": 0.26290996074676515, + "step": 10200 + }, + { + "epoch": 0.043833663910426486, + "grad_norm": 0.6830412745475769, + "learning_rate": 9.602933694368031e-05, + "loss": 0.13312371969223022, + "step": 10210 + }, + { + "epoch": 0.043876596000446494, + "grad_norm": 3.4054620265960693, + "learning_rate": 9.602502522356269e-05, + "loss": 0.4577789783477783, + "step": 10220 + }, + { + "epoch": 0.0439195280904665, + "grad_norm": 2.1453187465667725, + "learning_rate": 9.602071350344507e-05, + "loss": 0.3155569553375244, + "step": 10230 + }, + { + "epoch": 0.04396246018048651, + "grad_norm": 3.6915271282196045, + "learning_rate": 9.601640178332744e-05, + "loss": 0.16579316854476928, + "step": 10240 + }, + { + "epoch": 0.04400539227050651, + "grad_norm": 1.2764487266540527, + "learning_rate": 9.601209006320982e-05, + "loss": 0.4332874298095703, + "step": 10250 + }, + { + "epoch": 0.04404832436052652, + "grad_norm": 0.006339214742183685, + "learning_rate": 9.60077783430922e-05, + "loss": 0.33229615688323977, + "step": 10260 + }, + { + "epoch": 0.044091256450546526, + "grad_norm": 1.9073753356933594, + "learning_rate": 9.600346662297458e-05, + "loss": 0.4840839862823486, + "step": 10270 + }, + { + "epoch": 0.044134188540566534, + "grad_norm": 0.22293534874916077, + "learning_rate": 9.599915490285695e-05, + "loss": 0.3915229797363281, + "step": 10280 + }, + { + "epoch": 0.04417712063058654, + "grad_norm": 0.058444052934646606, + "learning_rate": 9.599484318273933e-05, + "loss": 0.21457459926605224, + "step": 10290 + }, + { + "epoch": 0.04422005272060654, + "grad_norm": 2.155538320541382, + "learning_rate": 9.599053146262171e-05, + "loss": 0.37848100662231443, + "step": 10300 + }, + { + "epoch": 0.04426298481062655, + "grad_norm": 2.304004430770874, + "learning_rate": 9.598621974250409e-05, + "loss": 0.33033792972564696, + "step": 10310 + }, + { + "epoch": 0.04430591690064656, + "grad_norm": 0.19713164865970612, + "learning_rate": 9.598190802238646e-05, + "loss": 0.21175870895385743, + "step": 10320 + }, + { + "epoch": 0.044348848990666566, + "grad_norm": 0.32798734307289124, + "learning_rate": 9.597759630226884e-05, + "loss": 0.25483083724975586, + "step": 10330 + }, + { + "epoch": 0.04439178108068657, + "grad_norm": 5.827533721923828, + "learning_rate": 9.597328458215122e-05, + "loss": 0.22512528896331788, + "step": 10340 + }, + { + "epoch": 0.044434713170706575, + "grad_norm": 2.3751018047332764, + "learning_rate": 9.596897286203358e-05, + "loss": 0.2391373634338379, + "step": 10350 + }, + { + "epoch": 0.04447764526072658, + "grad_norm": 0.21251116693019867, + "learning_rate": 9.596466114191596e-05, + "loss": 0.31760780811309813, + "step": 10360 + }, + { + "epoch": 0.04452057735074659, + "grad_norm": 0.04659834876656532, + "learning_rate": 9.596034942179834e-05, + "loss": 0.1353290319442749, + "step": 10370 + }, + { + "epoch": 0.0445635094407666, + "grad_norm": 0.03678036853671074, + "learning_rate": 9.595603770168071e-05, + "loss": 0.24096033573150635, + "step": 10380 + }, + { + "epoch": 0.0446064415307866, + "grad_norm": 0.24158138036727905, + "learning_rate": 9.595172598156309e-05, + "loss": 0.4555866241455078, + "step": 10390 + }, + { + "epoch": 0.04464937362080661, + "grad_norm": 0.34433749318122864, + "learning_rate": 9.594741426144547e-05, + "loss": 0.1974002480506897, + "step": 10400 + }, + { + "epoch": 0.044692305710826614, + "grad_norm": 1.9104621410369873, + "learning_rate": 9.594310254132785e-05, + "loss": 0.29447624683380125, + "step": 10410 + }, + { + "epoch": 0.04473523780084662, + "grad_norm": 1.7005401849746704, + "learning_rate": 9.593879082121022e-05, + "loss": 0.25979669094085694, + "step": 10420 + }, + { + "epoch": 0.04477816989086663, + "grad_norm": 0.15459361672401428, + "learning_rate": 9.593447910109259e-05, + "loss": 0.26453309059143065, + "step": 10430 + }, + { + "epoch": 0.04482110198088663, + "grad_norm": 0.07266079634428024, + "learning_rate": 9.593016738097496e-05, + "loss": 0.1707593321800232, + "step": 10440 + }, + { + "epoch": 0.04486403407090664, + "grad_norm": 1.2248409986495972, + "learning_rate": 9.592585566085734e-05, + "loss": 0.13180646896362305, + "step": 10450 + }, + { + "epoch": 0.04490696616092665, + "grad_norm": 0.01891922391951084, + "learning_rate": 9.592154394073972e-05, + "loss": 0.3238394737243652, + "step": 10460 + }, + { + "epoch": 0.044949898250946654, + "grad_norm": 3.500919818878174, + "learning_rate": 9.59172322206221e-05, + "loss": 0.37556891441345214, + "step": 10470 + }, + { + "epoch": 0.04499283034096666, + "grad_norm": 6.499775409698486, + "learning_rate": 9.591292050050447e-05, + "loss": 0.48048176765441897, + "step": 10480 + }, + { + "epoch": 0.04503576243098666, + "grad_norm": 2.8995563983917236, + "learning_rate": 9.590860878038685e-05, + "loss": 0.23455042839050294, + "step": 10490 + }, + { + "epoch": 0.04507869452100667, + "grad_norm": 5.567530632019043, + "learning_rate": 9.590429706026923e-05, + "loss": 0.26941962242126466, + "step": 10500 + }, + { + "epoch": 0.04512162661102668, + "grad_norm": 1.6310689449310303, + "learning_rate": 9.58999853401516e-05, + "loss": 0.27340772151947024, + "step": 10510 + }, + { + "epoch": 0.045164558701046686, + "grad_norm": 0.0813201442360878, + "learning_rate": 9.589567362003398e-05, + "loss": 0.06676875352859497, + "step": 10520 + }, + { + "epoch": 0.04520749079106669, + "grad_norm": 0.1687149703502655, + "learning_rate": 9.589136189991636e-05, + "loss": 0.00955677181482315, + "step": 10530 + }, + { + "epoch": 0.045250422881086695, + "grad_norm": 0.011493796482682228, + "learning_rate": 9.588705017979874e-05, + "loss": 0.2811113357543945, + "step": 10540 + }, + { + "epoch": 0.0452933549711067, + "grad_norm": 33.732940673828125, + "learning_rate": 9.588273845968111e-05, + "loss": 0.3537412643432617, + "step": 10550 + }, + { + "epoch": 0.04533628706112671, + "grad_norm": 7.003033638000488, + "learning_rate": 9.587842673956349e-05, + "loss": 0.21416730880737306, + "step": 10560 + }, + { + "epoch": 0.04537921915114672, + "grad_norm": 8.222970008850098, + "learning_rate": 9.587411501944587e-05, + "loss": 0.26668825149536135, + "step": 10570 + }, + { + "epoch": 0.04542215124116672, + "grad_norm": 1.0692180395126343, + "learning_rate": 9.586980329932825e-05, + "loss": 0.29593141078948976, + "step": 10580 + }, + { + "epoch": 0.04546508333118673, + "grad_norm": 0.018847858533263206, + "learning_rate": 9.586549157921062e-05, + "loss": 0.5067704200744629, + "step": 10590 + }, + { + "epoch": 0.045508015421206735, + "grad_norm": 1.3209799528121948, + "learning_rate": 9.586117985909299e-05, + "loss": 0.3315410137176514, + "step": 10600 + }, + { + "epoch": 0.04555094751122674, + "grad_norm": 0.009103666990995407, + "learning_rate": 9.585686813897536e-05, + "loss": 0.18447575569152833, + "step": 10610 + }, + { + "epoch": 0.04559387960124675, + "grad_norm": 0.11048633605241776, + "learning_rate": 9.585255641885774e-05, + "loss": 0.29539568424224855, + "step": 10620 + }, + { + "epoch": 0.04563681169126675, + "grad_norm": 1.421401858329773, + "learning_rate": 9.584824469874012e-05, + "loss": 0.48968944549560545, + "step": 10630 + }, + { + "epoch": 0.04567974378128676, + "grad_norm": 0.05576359108090401, + "learning_rate": 9.58439329786225e-05, + "loss": 0.15116746425628663, + "step": 10640 + }, + { + "epoch": 0.04572267587130677, + "grad_norm": 0.013207978568971157, + "learning_rate": 9.583962125850487e-05, + "loss": 0.1754152297973633, + "step": 10650 + }, + { + "epoch": 0.045765607961326775, + "grad_norm": 0.8618055582046509, + "learning_rate": 9.583530953838725e-05, + "loss": 0.40839419364929197, + "step": 10660 + }, + { + "epoch": 0.04580854005134678, + "grad_norm": 0.01856234483420849, + "learning_rate": 9.583099781826963e-05, + "loss": 0.3880528211593628, + "step": 10670 + }, + { + "epoch": 0.045851472141366784, + "grad_norm": 18.288997650146484, + "learning_rate": 9.582668609815199e-05, + "loss": 0.13824949264526368, + "step": 10680 + }, + { + "epoch": 0.04589440423138679, + "grad_norm": 0.08177149295806885, + "learning_rate": 9.582237437803437e-05, + "loss": 0.2577815055847168, + "step": 10690 + }, + { + "epoch": 0.0459373363214068, + "grad_norm": 0.2959858775138855, + "learning_rate": 9.581806265791675e-05, + "loss": 0.11627799272537231, + "step": 10700 + }, + { + "epoch": 0.04598026841142681, + "grad_norm": 0.02001349814236164, + "learning_rate": 9.581375093779912e-05, + "loss": 0.24912896156311035, + "step": 10710 + }, + { + "epoch": 0.046023200501446815, + "grad_norm": 0.01718798652291298, + "learning_rate": 9.58094392176815e-05, + "loss": 0.3985838651657104, + "step": 10720 + }, + { + "epoch": 0.046066132591466816, + "grad_norm": 0.04576408863067627, + "learning_rate": 9.580512749756388e-05, + "loss": 0.14956194162368774, + "step": 10730 + }, + { + "epoch": 0.046109064681486824, + "grad_norm": 1.2083806991577148, + "learning_rate": 9.580081577744626e-05, + "loss": 0.36740641593933104, + "step": 10740 + }, + { + "epoch": 0.04615199677150683, + "grad_norm": 0.023099783807992935, + "learning_rate": 9.579650405732863e-05, + "loss": 0.40886964797973635, + "step": 10750 + }, + { + "epoch": 0.04619492886152684, + "grad_norm": 0.1781107783317566, + "learning_rate": 9.579219233721101e-05, + "loss": 0.4085477352142334, + "step": 10760 + }, + { + "epoch": 0.04623786095154684, + "grad_norm": 0.4506646692752838, + "learning_rate": 9.578788061709339e-05, + "loss": 0.37363567352294924, + "step": 10770 + }, + { + "epoch": 0.04628079304156685, + "grad_norm": 0.011436011642217636, + "learning_rate": 9.578356889697577e-05, + "loss": 0.15324031114578246, + "step": 10780 + }, + { + "epoch": 0.046323725131586856, + "grad_norm": 0.6088100671768188, + "learning_rate": 9.577925717685814e-05, + "loss": 0.42464280128479004, + "step": 10790 + }, + { + "epoch": 0.04636665722160686, + "grad_norm": 3.944263458251953, + "learning_rate": 9.577494545674052e-05, + "loss": 0.16977940797805785, + "step": 10800 + }, + { + "epoch": 0.04640958931162687, + "grad_norm": 0.0476425401866436, + "learning_rate": 9.57706337366229e-05, + "loss": 0.3056109189987183, + "step": 10810 + }, + { + "epoch": 0.04645252140164687, + "grad_norm": 0.6663586497306824, + "learning_rate": 9.576632201650528e-05, + "loss": 0.22884979248046874, + "step": 10820 + }, + { + "epoch": 0.04649545349166688, + "grad_norm": 0.06515251845121384, + "learning_rate": 9.576201029638765e-05, + "loss": 0.3779136180877686, + "step": 10830 + }, + { + "epoch": 0.04653838558168689, + "grad_norm": 1.205527663230896, + "learning_rate": 9.575769857627002e-05, + "loss": 0.3730917930603027, + "step": 10840 + }, + { + "epoch": 0.046581317671706896, + "grad_norm": 0.9694238305091858, + "learning_rate": 9.57533868561524e-05, + "loss": 0.41217889785766604, + "step": 10850 + }, + { + "epoch": 0.0466242497617269, + "grad_norm": 0.15640544891357422, + "learning_rate": 9.574907513603477e-05, + "loss": 0.19687557220458984, + "step": 10860 + }, + { + "epoch": 0.046667181851746904, + "grad_norm": 0.1485452800989151, + "learning_rate": 9.574476341591715e-05, + "loss": 0.33020169734954835, + "step": 10870 + }, + { + "epoch": 0.04671011394176691, + "grad_norm": 0.9703245759010315, + "learning_rate": 9.574045169579953e-05, + "loss": 0.34535951614379884, + "step": 10880 + }, + { + "epoch": 0.04675304603178692, + "grad_norm": 11.140542984008789, + "learning_rate": 9.57361399756819e-05, + "loss": 0.3224964618682861, + "step": 10890 + }, + { + "epoch": 0.04679597812180693, + "grad_norm": 0.07320046424865723, + "learning_rate": 9.573182825556428e-05, + "loss": 0.17059063911437988, + "step": 10900 + }, + { + "epoch": 0.046838910211826935, + "grad_norm": 1.3945448398590088, + "learning_rate": 9.572751653544666e-05, + "loss": 0.16793534755706788, + "step": 10910 + }, + { + "epoch": 0.046881842301846936, + "grad_norm": 0.10286663472652435, + "learning_rate": 9.572320481532904e-05, + "loss": 0.3692343711853027, + "step": 10920 + }, + { + "epoch": 0.046924774391866944, + "grad_norm": 0.030759811401367188, + "learning_rate": 9.57188930952114e-05, + "loss": 0.21908931732177733, + "step": 10930 + }, + { + "epoch": 0.04696770648188695, + "grad_norm": 0.07938531041145325, + "learning_rate": 9.571458137509378e-05, + "loss": 0.09618297815322877, + "step": 10940 + }, + { + "epoch": 0.04701063857190696, + "grad_norm": 0.6279019117355347, + "learning_rate": 9.571026965497615e-05, + "loss": 0.16348246335983277, + "step": 10950 + }, + { + "epoch": 0.04705357066192696, + "grad_norm": 0.03495902940630913, + "learning_rate": 9.570595793485853e-05, + "loss": 0.19186799526214598, + "step": 10960 + }, + { + "epoch": 0.04709650275194697, + "grad_norm": 0.7391979098320007, + "learning_rate": 9.570164621474091e-05, + "loss": 0.11953030824661255, + "step": 10970 + }, + { + "epoch": 0.047139434841966976, + "grad_norm": 3.563753604888916, + "learning_rate": 9.569733449462329e-05, + "loss": 0.2138049602508545, + "step": 10980 + }, + { + "epoch": 0.047182366931986984, + "grad_norm": 0.14636358618736267, + "learning_rate": 9.569302277450566e-05, + "loss": 0.23743727207183837, + "step": 10990 + }, + { + "epoch": 0.04722529902200699, + "grad_norm": 0.006646784488111734, + "learning_rate": 9.568871105438804e-05, + "loss": 0.2026223659515381, + "step": 11000 + }, + { + "epoch": 0.04722529902200699, + "eval_loss": 0.5371872186660767, + "eval_runtime": 27.437, + "eval_samples_per_second": 3.645, + "eval_steps_per_second": 3.645, + "step": 11000 + }, + { + "epoch": 0.04726823111202699, + "grad_norm": 0.00699937529861927, + "learning_rate": 9.568439933427042e-05, + "loss": 0.18824267387390137, + "step": 11010 + }, + { + "epoch": 0.047311163202047, + "grad_norm": 0.8322146534919739, + "learning_rate": 9.56800876141528e-05, + "loss": 0.3585317611694336, + "step": 11020 + }, + { + "epoch": 0.04735409529206701, + "grad_norm": 5.2994704246521, + "learning_rate": 9.567577589403517e-05, + "loss": 0.3288968563079834, + "step": 11030 + }, + { + "epoch": 0.047397027382087016, + "grad_norm": 1.3787318468093872, + "learning_rate": 9.567146417391755e-05, + "loss": 0.4345251560211182, + "step": 11040 + }, + { + "epoch": 0.047439959472107024, + "grad_norm": 7.704769134521484, + "learning_rate": 9.566715245379993e-05, + "loss": 0.22723698616027832, + "step": 11050 + }, + { + "epoch": 0.047482891562127025, + "grad_norm": 0.05537894368171692, + "learning_rate": 9.56628407336823e-05, + "loss": 0.3707094430923462, + "step": 11060 + }, + { + "epoch": 0.04752582365214703, + "grad_norm": 2.6444385051727295, + "learning_rate": 9.565852901356468e-05, + "loss": 0.1761788845062256, + "step": 11070 + }, + { + "epoch": 0.04756875574216704, + "grad_norm": 0.04609530419111252, + "learning_rate": 9.565421729344706e-05, + "loss": 0.27322142124176024, + "step": 11080 + }, + { + "epoch": 0.04761168783218705, + "grad_norm": 2.6429717540740967, + "learning_rate": 9.564990557332942e-05, + "loss": 0.23322317600250245, + "step": 11090 + }, + { + "epoch": 0.047654619922207056, + "grad_norm": 6.946089744567871, + "learning_rate": 9.56455938532118e-05, + "loss": 0.37691373825073243, + "step": 11100 + }, + { + "epoch": 0.04769755201222706, + "grad_norm": 2.606541395187378, + "learning_rate": 9.564128213309418e-05, + "loss": 0.3111485242843628, + "step": 11110 + }, + { + "epoch": 0.047740484102247065, + "grad_norm": 0.16900953650474548, + "learning_rate": 9.563697041297656e-05, + "loss": 0.24888322353363038, + "step": 11120 + }, + { + "epoch": 0.04778341619226707, + "grad_norm": 0.026469141244888306, + "learning_rate": 9.563265869285893e-05, + "loss": 0.33678669929504396, + "step": 11130 + }, + { + "epoch": 0.04782634828228708, + "grad_norm": 0.8681675791740417, + "learning_rate": 9.562834697274131e-05, + "loss": 0.3018766403198242, + "step": 11140 + }, + { + "epoch": 0.04786928037230709, + "grad_norm": 0.10563918203115463, + "learning_rate": 9.562403525262369e-05, + "loss": 0.37344143390655515, + "step": 11150 + }, + { + "epoch": 0.04791221246232709, + "grad_norm": 0.05082390457391739, + "learning_rate": 9.561972353250606e-05, + "loss": 0.19082963466644287, + "step": 11160 + }, + { + "epoch": 0.0479551445523471, + "grad_norm": 2.553626775741577, + "learning_rate": 9.561541181238843e-05, + "loss": 0.3182950258255005, + "step": 11170 + }, + { + "epoch": 0.047998076642367105, + "grad_norm": 2.447514533996582, + "learning_rate": 9.56111000922708e-05, + "loss": 0.41812405586242674, + "step": 11180 + }, + { + "epoch": 0.04804100873238711, + "grad_norm": 11.06352424621582, + "learning_rate": 9.560678837215318e-05, + "loss": 0.31888484954833984, + "step": 11190 + }, + { + "epoch": 0.04808394082240711, + "grad_norm": 6.056185722351074, + "learning_rate": 9.560247665203556e-05, + "loss": 0.6062085628509521, + "step": 11200 + }, + { + "epoch": 0.04812687291242712, + "grad_norm": 0.18857668340206146, + "learning_rate": 9.559816493191794e-05, + "loss": 0.2257009983062744, + "step": 11210 + }, + { + "epoch": 0.04816980500244713, + "grad_norm": 6.72116756439209, + "learning_rate": 9.559385321180032e-05, + "loss": 0.24791340827941893, + "step": 11220 + }, + { + "epoch": 0.04821273709246714, + "grad_norm": 3.8300418853759766, + "learning_rate": 9.558954149168269e-05, + "loss": 0.34927642345428467, + "step": 11230 + }, + { + "epoch": 0.048255669182487144, + "grad_norm": 0.015608267858624458, + "learning_rate": 9.558522977156508e-05, + "loss": 0.2696120023727417, + "step": 11240 + }, + { + "epoch": 0.048298601272507145, + "grad_norm": 1.1251463890075684, + "learning_rate": 9.558091805144746e-05, + "loss": 0.49908971786499023, + "step": 11250 + }, + { + "epoch": 0.04834153336252715, + "grad_norm": 0.09229426085948944, + "learning_rate": 9.557660633132982e-05, + "loss": 0.3560775279998779, + "step": 11260 + }, + { + "epoch": 0.04838446545254716, + "grad_norm": 0.010238938964903355, + "learning_rate": 9.55722946112122e-05, + "loss": 0.19407575130462645, + "step": 11270 + }, + { + "epoch": 0.04842739754256717, + "grad_norm": 0.0983290895819664, + "learning_rate": 9.556798289109458e-05, + "loss": 0.1932140111923218, + "step": 11280 + }, + { + "epoch": 0.04847032963258718, + "grad_norm": 0.5768459439277649, + "learning_rate": 9.556367117097696e-05, + "loss": 0.41858468055725095, + "step": 11290 + }, + { + "epoch": 0.04851326172260718, + "grad_norm": 1.8036881685256958, + "learning_rate": 9.555935945085933e-05, + "loss": 0.25796501636505126, + "step": 11300 + }, + { + "epoch": 0.048556193812627185, + "grad_norm": 1.1746166944503784, + "learning_rate": 9.555504773074171e-05, + "loss": 0.22487566471099854, + "step": 11310 + }, + { + "epoch": 0.04859912590264719, + "grad_norm": 0.12483537197113037, + "learning_rate": 9.555073601062409e-05, + "loss": 0.15164095163345337, + "step": 11320 + }, + { + "epoch": 0.0486420579926672, + "grad_norm": 0.050800539553165436, + "learning_rate": 9.554642429050647e-05, + "loss": 0.3063875675201416, + "step": 11330 + }, + { + "epoch": 0.04868499008268721, + "grad_norm": 0.03678128495812416, + "learning_rate": 9.554211257038883e-05, + "loss": 0.3408423662185669, + "step": 11340 + }, + { + "epoch": 0.04872792217270721, + "grad_norm": 0.03818840906023979, + "learning_rate": 9.553780085027121e-05, + "loss": 0.2510481357574463, + "step": 11350 + }, + { + "epoch": 0.04877085426272722, + "grad_norm": 0.05373666435480118, + "learning_rate": 9.553348913015358e-05, + "loss": 0.45258536338806155, + "step": 11360 + }, + { + "epoch": 0.048813786352747225, + "grad_norm": 0.08894126862287521, + "learning_rate": 9.552917741003596e-05, + "loss": 0.3101092576980591, + "step": 11370 + }, + { + "epoch": 0.04885671844276723, + "grad_norm": 1.3045735359191895, + "learning_rate": 9.552486568991834e-05, + "loss": 0.31512742042541503, + "step": 11380 + }, + { + "epoch": 0.048899650532787234, + "grad_norm": 0.8845686912536621, + "learning_rate": 9.552055396980072e-05, + "loss": 0.3243434429168701, + "step": 11390 + }, + { + "epoch": 0.04894258262280724, + "grad_norm": 2.371500253677368, + "learning_rate": 9.55162422496831e-05, + "loss": 0.15619829893112183, + "step": 11400 + }, + { + "epoch": 0.04898551471282725, + "grad_norm": 0.05966117978096008, + "learning_rate": 9.551193052956547e-05, + "loss": 0.3500715732574463, + "step": 11410 + }, + { + "epoch": 0.04902844680284726, + "grad_norm": 1.649623990058899, + "learning_rate": 9.550761880944783e-05, + "loss": 0.2953296661376953, + "step": 11420 + }, + { + "epoch": 0.049071378892867265, + "grad_norm": 0.0920541062951088, + "learning_rate": 9.550330708933021e-05, + "loss": 0.09942615628242493, + "step": 11430 + }, + { + "epoch": 0.049114310982887266, + "grad_norm": 0.14348438382148743, + "learning_rate": 9.549899536921259e-05, + "loss": 0.21229536533355714, + "step": 11440 + }, + { + "epoch": 0.049157243072907274, + "grad_norm": 1.0420409440994263, + "learning_rate": 9.549468364909497e-05, + "loss": 0.46654543876647947, + "step": 11450 + }, + { + "epoch": 0.04920017516292728, + "grad_norm": 11.456270217895508, + "learning_rate": 9.549037192897736e-05, + "loss": 0.4078725814819336, + "step": 11460 + }, + { + "epoch": 0.04924310725294729, + "grad_norm": 1.5066547393798828, + "learning_rate": 9.548606020885974e-05, + "loss": 0.32791011333465575, + "step": 11470 + }, + { + "epoch": 0.0492860393429673, + "grad_norm": 0.03008863516151905, + "learning_rate": 9.548174848874211e-05, + "loss": 0.13255956172943115, + "step": 11480 + }, + { + "epoch": 0.0493289714329873, + "grad_norm": 10.941854476928711, + "learning_rate": 9.547743676862449e-05, + "loss": 0.4925832271575928, + "step": 11490 + }, + { + "epoch": 0.049371903523007306, + "grad_norm": 0.006078363861888647, + "learning_rate": 9.547312504850685e-05, + "loss": 0.40673055648803713, + "step": 11500 + }, + { + "epoch": 0.049414835613027314, + "grad_norm": 0.009038330987095833, + "learning_rate": 9.546881332838923e-05, + "loss": 0.21685254573822021, + "step": 11510 + }, + { + "epoch": 0.04945776770304732, + "grad_norm": 0.0976053774356842, + "learning_rate": 9.546450160827161e-05, + "loss": 0.06418330669403076, + "step": 11520 + }, + { + "epoch": 0.04950069979306733, + "grad_norm": 0.03044871799647808, + "learning_rate": 9.546018988815399e-05, + "loss": 0.008297404646873474, + "step": 11530 + }, + { + "epoch": 0.04954363188308733, + "grad_norm": 2.4783143997192383, + "learning_rate": 9.545587816803636e-05, + "loss": 0.5636334419250488, + "step": 11540 + }, + { + "epoch": 0.04958656397310734, + "grad_norm": 0.3153276741504669, + "learning_rate": 9.545156644791874e-05, + "loss": 0.07382228970527649, + "step": 11550 + }, + { + "epoch": 0.049629496063127346, + "grad_norm": 4.899248123168945, + "learning_rate": 9.544725472780112e-05, + "loss": 0.36443448066711426, + "step": 11560 + }, + { + "epoch": 0.049672428153147354, + "grad_norm": 0.25590115785598755, + "learning_rate": 9.54429430076835e-05, + "loss": 0.24122922420501708, + "step": 11570 + }, + { + "epoch": 0.049715360243167354, + "grad_norm": 1.520410418510437, + "learning_rate": 9.543863128756586e-05, + "loss": 0.32092602252960206, + "step": 11580 + }, + { + "epoch": 0.04975829233318736, + "grad_norm": 0.017016088590025902, + "learning_rate": 9.543431956744824e-05, + "loss": 0.2933270215988159, + "step": 11590 + }, + { + "epoch": 0.04980122442320737, + "grad_norm": 0.07425573468208313, + "learning_rate": 9.543000784733061e-05, + "loss": 0.29837546348571775, + "step": 11600 + }, + { + "epoch": 0.04984415651322738, + "grad_norm": 0.09230173379182816, + "learning_rate": 9.542569612721299e-05, + "loss": 0.2609511375427246, + "step": 11610 + }, + { + "epoch": 0.049887088603247386, + "grad_norm": 1.69183349609375, + "learning_rate": 9.542138440709537e-05, + "loss": 0.2415536403656006, + "step": 11620 + }, + { + "epoch": 0.049930020693267387, + "grad_norm": 0.032961320132017136, + "learning_rate": 9.541707268697775e-05, + "loss": 0.19287939071655275, + "step": 11630 + }, + { + "epoch": 0.049972952783287394, + "grad_norm": 1.4239143133163452, + "learning_rate": 9.541276096686012e-05, + "loss": 0.3143747806549072, + "step": 11640 + }, + { + "epoch": 0.0500158848733074, + "grad_norm": 2.7020840644836426, + "learning_rate": 9.54084492467425e-05, + "loss": 0.411509370803833, + "step": 11650 + }, + { + "epoch": 0.05005881696332741, + "grad_norm": 1.0710383653640747, + "learning_rate": 9.540413752662488e-05, + "loss": 0.48816213607788084, + "step": 11660 + }, + { + "epoch": 0.05010174905334742, + "grad_norm": 0.13522252440452576, + "learning_rate": 9.539982580650724e-05, + "loss": 0.3829029560089111, + "step": 11670 + }, + { + "epoch": 0.05014468114336742, + "grad_norm": 0.016100700944662094, + "learning_rate": 9.539551408638963e-05, + "loss": 0.2695424795150757, + "step": 11680 + }, + { + "epoch": 0.050187613233387426, + "grad_norm": 0.36224961280822754, + "learning_rate": 9.539120236627201e-05, + "loss": 0.2530616283416748, + "step": 11690 + }, + { + "epoch": 0.050230545323407434, + "grad_norm": 0.07011070847511292, + "learning_rate": 9.538689064615439e-05, + "loss": 0.3647505521774292, + "step": 11700 + }, + { + "epoch": 0.05027347741342744, + "grad_norm": 0.03421260043978691, + "learning_rate": 9.538257892603676e-05, + "loss": 0.225752592086792, + "step": 11710 + }, + { + "epoch": 0.05031640950344745, + "grad_norm": 0.12849198281764984, + "learning_rate": 9.537826720591914e-05, + "loss": 0.22768645286560057, + "step": 11720 + }, + { + "epoch": 0.05035934159346745, + "grad_norm": 3.8090126514434814, + "learning_rate": 9.537395548580152e-05, + "loss": 0.26749815940856936, + "step": 11730 + }, + { + "epoch": 0.05040227368348746, + "grad_norm": 1.128069519996643, + "learning_rate": 9.53696437656839e-05, + "loss": 0.3992989778518677, + "step": 11740 + }, + { + "epoch": 0.050445205773507466, + "grad_norm": 2.9438343048095703, + "learning_rate": 9.536533204556626e-05, + "loss": 0.38953499794006347, + "step": 11750 + }, + { + "epoch": 0.050488137863527474, + "grad_norm": 0.017695719376206398, + "learning_rate": 9.536102032544864e-05, + "loss": 0.19398535490036012, + "step": 11760 + }, + { + "epoch": 0.05053106995354748, + "grad_norm": 1.8694612979888916, + "learning_rate": 9.535670860533101e-05, + "loss": 0.2112741470336914, + "step": 11770 + }, + { + "epoch": 0.05057400204356748, + "grad_norm": 2.410950183868408, + "learning_rate": 9.535239688521339e-05, + "loss": 0.21994683742523194, + "step": 11780 + }, + { + "epoch": 0.05061693413358749, + "grad_norm": 1.0458391904830933, + "learning_rate": 9.534808516509577e-05, + "loss": 0.31306536197662355, + "step": 11790 + }, + { + "epoch": 0.0506598662236075, + "grad_norm": 0.09369885176420212, + "learning_rate": 9.534377344497815e-05, + "loss": 0.3466991901397705, + "step": 11800 + }, + { + "epoch": 0.050702798313627506, + "grad_norm": 0.013919656164944172, + "learning_rate": 9.533946172486052e-05, + "loss": 0.20056288242340087, + "step": 11810 + }, + { + "epoch": 0.05074573040364751, + "grad_norm": 0.1818535476922989, + "learning_rate": 9.53351500047429e-05, + "loss": 0.3760584592819214, + "step": 11820 + }, + { + "epoch": 0.050788662493667515, + "grad_norm": 0.05832016095519066, + "learning_rate": 9.533083828462527e-05, + "loss": 0.23105072975158691, + "step": 11830 + }, + { + "epoch": 0.05083159458368752, + "grad_norm": 2.716463565826416, + "learning_rate": 9.532652656450764e-05, + "loss": 0.3760308027267456, + "step": 11840 + }, + { + "epoch": 0.05087452667370753, + "grad_norm": 0.3611092269420624, + "learning_rate": 9.532221484439002e-05, + "loss": 0.2634706735610962, + "step": 11850 + }, + { + "epoch": 0.05091745876372754, + "grad_norm": 1.7555878162384033, + "learning_rate": 9.53179031242724e-05, + "loss": 0.3180107831954956, + "step": 11860 + }, + { + "epoch": 0.05096039085374754, + "grad_norm": 1.297004222869873, + "learning_rate": 9.531359140415477e-05, + "loss": 0.2752450227737427, + "step": 11870 + }, + { + "epoch": 0.05100332294376755, + "grad_norm": 2.4578864574432373, + "learning_rate": 9.530927968403715e-05, + "loss": 0.33813331127166746, + "step": 11880 + }, + { + "epoch": 0.051046255033787555, + "grad_norm": 2.5260801315307617, + "learning_rate": 9.530496796391953e-05, + "loss": 0.3503072738647461, + "step": 11890 + }, + { + "epoch": 0.05108918712380756, + "grad_norm": 0.04710591956973076, + "learning_rate": 9.53006562438019e-05, + "loss": 0.31316137313842773, + "step": 11900 + }, + { + "epoch": 0.05113211921382757, + "grad_norm": 3.3213343620300293, + "learning_rate": 9.529634452368428e-05, + "loss": 0.3701478004455566, + "step": 11910 + }, + { + "epoch": 0.05117505130384757, + "grad_norm": 0.3955182731151581, + "learning_rate": 9.529203280356666e-05, + "loss": 0.22598392963409425, + "step": 11920 + }, + { + "epoch": 0.05121798339386758, + "grad_norm": 2.304351806640625, + "learning_rate": 9.528772108344904e-05, + "loss": 0.3806648015975952, + "step": 11930 + }, + { + "epoch": 0.05126091548388759, + "grad_norm": 2.4194395542144775, + "learning_rate": 9.528340936333142e-05, + "loss": 0.2859928131103516, + "step": 11940 + }, + { + "epoch": 0.051303847573907595, + "grad_norm": 0.4967823028564453, + "learning_rate": 9.52790976432138e-05, + "loss": 0.5555664539337158, + "step": 11950 + }, + { + "epoch": 0.0513467796639276, + "grad_norm": 0.08415184170007706, + "learning_rate": 9.527478592309617e-05, + "loss": 0.34697282314300537, + "step": 11960 + }, + { + "epoch": 0.0513897117539476, + "grad_norm": 1.2147996425628662, + "learning_rate": 9.527047420297855e-05, + "loss": 0.23912529945373534, + "step": 11970 + }, + { + "epoch": 0.05143264384396761, + "grad_norm": 0.4464738070964813, + "learning_rate": 9.526616248286093e-05, + "loss": 0.29886488914489745, + "step": 11980 + }, + { + "epoch": 0.05147557593398762, + "grad_norm": 1.3995260000228882, + "learning_rate": 9.52618507627433e-05, + "loss": 0.44214572906494143, + "step": 11990 + }, + { + "epoch": 0.05151850802400763, + "grad_norm": 0.23975662887096405, + "learning_rate": 9.525753904262567e-05, + "loss": 0.33844738006591796, + "step": 12000 + }, + { + "epoch": 0.05151850802400763, + "eval_loss": 0.5358631610870361, + "eval_runtime": 27.6176, + "eval_samples_per_second": 3.621, + "eval_steps_per_second": 3.621, + "step": 12000 + }, + { + "epoch": 0.05156144011402763, + "grad_norm": 3.157212495803833, + "learning_rate": 9.525322732250804e-05, + "loss": 0.16573574542999267, + "step": 12010 + }, + { + "epoch": 0.051604372204047635, + "grad_norm": 1.0986665487289429, + "learning_rate": 9.524891560239042e-05, + "loss": 0.31950509548187256, + "step": 12020 + }, + { + "epoch": 0.05164730429406764, + "grad_norm": 0.1084139496088028, + "learning_rate": 9.52446038822728e-05, + "loss": 0.366551947593689, + "step": 12030 + }, + { + "epoch": 0.05169023638408765, + "grad_norm": 0.04859800636768341, + "learning_rate": 9.524029216215518e-05, + "loss": 0.42212815284729005, + "step": 12040 + }, + { + "epoch": 0.05173316847410766, + "grad_norm": 0.9786454439163208, + "learning_rate": 9.523598044203755e-05, + "loss": 0.29626033306121824, + "step": 12050 + }, + { + "epoch": 0.05177610056412766, + "grad_norm": 0.13624534010887146, + "learning_rate": 9.523166872191993e-05, + "loss": 0.24190120697021483, + "step": 12060 + }, + { + "epoch": 0.05181903265414767, + "grad_norm": 0.044610291719436646, + "learning_rate": 9.522735700180231e-05, + "loss": 0.2906489372253418, + "step": 12070 + }, + { + "epoch": 0.051861964744167675, + "grad_norm": 2.1956253051757812, + "learning_rate": 9.522304528168467e-05, + "loss": 0.21995768547058106, + "step": 12080 + }, + { + "epoch": 0.05190489683418768, + "grad_norm": 0.09326007962226868, + "learning_rate": 9.521873356156705e-05, + "loss": 0.37242443561553956, + "step": 12090 + }, + { + "epoch": 0.05194782892420769, + "grad_norm": 0.008499284274876118, + "learning_rate": 9.521442184144943e-05, + "loss": 0.2127228021621704, + "step": 12100 + }, + { + "epoch": 0.05199076101422769, + "grad_norm": 0.1813715696334839, + "learning_rate": 9.52101101213318e-05, + "loss": 0.27324044704437256, + "step": 12110 + }, + { + "epoch": 0.0520336931042477, + "grad_norm": 0.01605965569615364, + "learning_rate": 9.520579840121418e-05, + "loss": 0.3123704671859741, + "step": 12120 + }, + { + "epoch": 0.05207662519426771, + "grad_norm": 0.05326181650161743, + "learning_rate": 9.520148668109656e-05, + "loss": 0.4056252479553223, + "step": 12130 + }, + { + "epoch": 0.052119557284287715, + "grad_norm": 1.489559531211853, + "learning_rate": 9.519717496097894e-05, + "loss": 0.2229902982711792, + "step": 12140 + }, + { + "epoch": 0.05216248937430772, + "grad_norm": 2.0173451900482178, + "learning_rate": 9.519286324086131e-05, + "loss": 0.2639839887619019, + "step": 12150 + }, + { + "epoch": 0.052205421464327724, + "grad_norm": 0.535400927066803, + "learning_rate": 9.518855152074369e-05, + "loss": 0.2586674690246582, + "step": 12160 + }, + { + "epoch": 0.05224835355434773, + "grad_norm": 0.052150264382362366, + "learning_rate": 9.518423980062607e-05, + "loss": 0.1961987853050232, + "step": 12170 + }, + { + "epoch": 0.05229128564436774, + "grad_norm": 0.8536399006843567, + "learning_rate": 9.517992808050845e-05, + "loss": 0.5551873207092285, + "step": 12180 + }, + { + "epoch": 0.05233421773438775, + "grad_norm": 8.8615140914917, + "learning_rate": 9.517561636039082e-05, + "loss": 0.41590356826782227, + "step": 12190 + }, + { + "epoch": 0.052377149824407755, + "grad_norm": 0.9676482677459717, + "learning_rate": 9.51713046402732e-05, + "loss": 0.30098705291748046, + "step": 12200 + }, + { + "epoch": 0.052420081914427756, + "grad_norm": 3.19608736038208, + "learning_rate": 9.516699292015558e-05, + "loss": 0.2460148811340332, + "step": 12210 + }, + { + "epoch": 0.052463014004447764, + "grad_norm": 0.4055849611759186, + "learning_rate": 9.516268120003795e-05, + "loss": 0.09390591979026794, + "step": 12220 + }, + { + "epoch": 0.05250594609446777, + "grad_norm": 1.5819487571716309, + "learning_rate": 9.515836947992033e-05, + "loss": 0.2062903642654419, + "step": 12230 + }, + { + "epoch": 0.05254887818448778, + "grad_norm": 1.0232068300247192, + "learning_rate": 9.51540577598027e-05, + "loss": 0.28633387088775636, + "step": 12240 + }, + { + "epoch": 0.05259181027450778, + "grad_norm": 0.008014945313334465, + "learning_rate": 9.514974603968507e-05, + "loss": 0.1834414482116699, + "step": 12250 + }, + { + "epoch": 0.05263474236452779, + "grad_norm": 1.5555038452148438, + "learning_rate": 9.514543431956745e-05, + "loss": 0.44185829162597656, + "step": 12260 + }, + { + "epoch": 0.052677674454547796, + "grad_norm": 1.1476542949676514, + "learning_rate": 9.514112259944983e-05, + "loss": 0.1935230612754822, + "step": 12270 + }, + { + "epoch": 0.052720606544567804, + "grad_norm": 0.12156800925731659, + "learning_rate": 9.51368108793322e-05, + "loss": 0.29886319637298586, + "step": 12280 + }, + { + "epoch": 0.05276353863458781, + "grad_norm": 0.029563816264271736, + "learning_rate": 9.513249915921458e-05, + "loss": 0.33358020782470704, + "step": 12290 + }, + { + "epoch": 0.05280647072460781, + "grad_norm": 0.47683200240135193, + "learning_rate": 9.512818743909696e-05, + "loss": 0.2097261905670166, + "step": 12300 + }, + { + "epoch": 0.05284940281462782, + "grad_norm": 0.04604807123541832, + "learning_rate": 9.512387571897934e-05, + "loss": 0.2722134828567505, + "step": 12310 + }, + { + "epoch": 0.05289233490464783, + "grad_norm": 16.888795852661133, + "learning_rate": 9.511956399886171e-05, + "loss": 0.4344566822052002, + "step": 12320 + }, + { + "epoch": 0.052935266994667836, + "grad_norm": 0.18765738606452942, + "learning_rate": 9.511525227874408e-05, + "loss": 0.21546649932861328, + "step": 12330 + }, + { + "epoch": 0.052978199084687844, + "grad_norm": 0.8789750933647156, + "learning_rate": 9.511094055862646e-05, + "loss": 0.35911104679107664, + "step": 12340 + }, + { + "epoch": 0.053021131174707845, + "grad_norm": 0.7323353290557861, + "learning_rate": 9.510662883850883e-05, + "loss": 0.26637275218963624, + "step": 12350 + }, + { + "epoch": 0.05306406326472785, + "grad_norm": 0.9971916079521179, + "learning_rate": 9.510231711839121e-05, + "loss": 0.39113051891326905, + "step": 12360 + }, + { + "epoch": 0.05310699535474786, + "grad_norm": 2.9714555740356445, + "learning_rate": 9.509800539827359e-05, + "loss": 0.2813329458236694, + "step": 12370 + }, + { + "epoch": 0.05314992744476787, + "grad_norm": 0.020251838490366936, + "learning_rate": 9.509369367815596e-05, + "loss": 0.2521169900894165, + "step": 12380 + }, + { + "epoch": 0.053192859534787876, + "grad_norm": 2.174328565597534, + "learning_rate": 9.508938195803834e-05, + "loss": 0.31486124992370607, + "step": 12390 + }, + { + "epoch": 0.05323579162480788, + "grad_norm": 0.056546323001384735, + "learning_rate": 9.508507023792072e-05, + "loss": 0.09270382523536683, + "step": 12400 + }, + { + "epoch": 0.053278723714827884, + "grad_norm": 1.50390625, + "learning_rate": 9.50807585178031e-05, + "loss": 0.5031360626220703, + "step": 12410 + }, + { + "epoch": 0.05332165580484789, + "grad_norm": 0.010583514347672462, + "learning_rate": 9.507644679768547e-05, + "loss": 0.23066141605377197, + "step": 12420 + }, + { + "epoch": 0.0533645878948679, + "grad_norm": 2.49086332321167, + "learning_rate": 9.507213507756785e-05, + "loss": 0.30078697204589844, + "step": 12430 + }, + { + "epoch": 0.0534075199848879, + "grad_norm": 0.44729381799697876, + "learning_rate": 9.506782335745023e-05, + "loss": 0.17861661911010743, + "step": 12440 + }, + { + "epoch": 0.05345045207490791, + "grad_norm": 0.10300600528717041, + "learning_rate": 9.50635116373326e-05, + "loss": 0.332088565826416, + "step": 12450 + }, + { + "epoch": 0.05349338416492792, + "grad_norm": 1.658432960510254, + "learning_rate": 9.505919991721498e-05, + "loss": 0.3135262966156006, + "step": 12460 + }, + { + "epoch": 0.053536316254947924, + "grad_norm": 2.3481667041778564, + "learning_rate": 9.505488819709736e-05, + "loss": 0.3561201572418213, + "step": 12470 + }, + { + "epoch": 0.05357924834496793, + "grad_norm": 0.21150454878807068, + "learning_rate": 9.505057647697974e-05, + "loss": 0.539197587966919, + "step": 12480 + }, + { + "epoch": 0.05362218043498793, + "grad_norm": 0.907037079334259, + "learning_rate": 9.50462647568621e-05, + "loss": 0.3842069149017334, + "step": 12490 + }, + { + "epoch": 0.05366511252500794, + "grad_norm": 8.066739082336426, + "learning_rate": 9.504195303674448e-05, + "loss": 0.43160204887390136, + "step": 12500 + }, + { + "epoch": 0.05370804461502795, + "grad_norm": 0.7765117883682251, + "learning_rate": 9.503764131662686e-05, + "loss": 0.2860031843185425, + "step": 12510 + }, + { + "epoch": 0.053750976705047956, + "grad_norm": 0.39384740591049194, + "learning_rate": 9.503332959650923e-05, + "loss": 0.23746469020843505, + "step": 12520 + }, + { + "epoch": 0.053793908795067964, + "grad_norm": 14.003802299499512, + "learning_rate": 9.502901787639161e-05, + "loss": 0.14782247543334961, + "step": 12530 + }, + { + "epoch": 0.053836840885087965, + "grad_norm": 1.0125482082366943, + "learning_rate": 9.502470615627399e-05, + "loss": 0.21553480625152588, + "step": 12540 + }, + { + "epoch": 0.05387977297510797, + "grad_norm": 3.473666191101074, + "learning_rate": 9.502039443615637e-05, + "loss": 0.3330304384231567, + "step": 12550 + }, + { + "epoch": 0.05392270506512798, + "grad_norm": 0.6401351690292358, + "learning_rate": 9.501608271603874e-05, + "loss": 0.2649469614028931, + "step": 12560 + }, + { + "epoch": 0.05396563715514799, + "grad_norm": 0.1529102474451065, + "learning_rate": 9.501177099592111e-05, + "loss": 0.19821540117263795, + "step": 12570 + }, + { + "epoch": 0.054008569245167996, + "grad_norm": 0.03825334832072258, + "learning_rate": 9.500745927580348e-05, + "loss": 0.24926376342773438, + "step": 12580 + }, + { + "epoch": 0.054051501335188, + "grad_norm": 2.7774956226348877, + "learning_rate": 9.500314755568586e-05, + "loss": 0.34471452236175537, + "step": 12590 + }, + { + "epoch": 0.054094433425208005, + "grad_norm": 0.6242231130599976, + "learning_rate": 9.499883583556824e-05, + "loss": 0.15731912851333618, + "step": 12600 + }, + { + "epoch": 0.05413736551522801, + "grad_norm": 1.258853793144226, + "learning_rate": 9.499452411545062e-05, + "loss": 0.33891823291778567, + "step": 12610 + }, + { + "epoch": 0.05418029760524802, + "grad_norm": 1.258105993270874, + "learning_rate": 9.4990212395333e-05, + "loss": 0.2595653057098389, + "step": 12620 + }, + { + "epoch": 0.05422322969526803, + "grad_norm": 3.286238670349121, + "learning_rate": 9.498590067521537e-05, + "loss": 0.30257110595703124, + "step": 12630 + }, + { + "epoch": 0.05426616178528803, + "grad_norm": 0.2592448890209198, + "learning_rate": 9.498158895509776e-05, + "loss": 0.3336447477340698, + "step": 12640 + }, + { + "epoch": 0.05430909387530804, + "grad_norm": 2.1326916217803955, + "learning_rate": 9.497727723498013e-05, + "loss": 0.34888103008270266, + "step": 12650 + }, + { + "epoch": 0.054352025965328045, + "grad_norm": 0.9109242558479309, + "learning_rate": 9.49729655148625e-05, + "loss": 0.35174739360809326, + "step": 12660 + }, + { + "epoch": 0.05439495805534805, + "grad_norm": 0.9759131669998169, + "learning_rate": 9.496865379474488e-05, + "loss": 0.2183375358581543, + "step": 12670 + }, + { + "epoch": 0.054437890145368054, + "grad_norm": 1.1428704261779785, + "learning_rate": 9.496434207462726e-05, + "loss": 0.457882022857666, + "step": 12680 + }, + { + "epoch": 0.05448082223538806, + "grad_norm": 2.1968188285827637, + "learning_rate": 9.496003035450964e-05, + "loss": 0.2927252292633057, + "step": 12690 + }, + { + "epoch": 0.05452375432540807, + "grad_norm": 2.174100875854492, + "learning_rate": 9.495571863439201e-05, + "loss": 0.2782183885574341, + "step": 12700 + }, + { + "epoch": 0.05456668641542808, + "grad_norm": 0.5769919157028198, + "learning_rate": 9.495140691427439e-05, + "loss": 0.2357858180999756, + "step": 12710 + }, + { + "epoch": 0.054609618505448085, + "grad_norm": 0.8519782423973083, + "learning_rate": 9.494709519415677e-05, + "loss": 0.2905261516571045, + "step": 12720 + }, + { + "epoch": 0.054652550595468086, + "grad_norm": 0.6527574062347412, + "learning_rate": 9.494278347403914e-05, + "loss": 0.38331165313720705, + "step": 12730 + }, + { + "epoch": 0.054695482685488093, + "grad_norm": 3.5559322834014893, + "learning_rate": 9.493847175392151e-05, + "loss": 0.24324979782104492, + "step": 12740 + }, + { + "epoch": 0.0547384147755081, + "grad_norm": 12.116598129272461, + "learning_rate": 9.493416003380389e-05, + "loss": 0.31748697757720945, + "step": 12750 + }, + { + "epoch": 0.05478134686552811, + "grad_norm": 0.057032838463783264, + "learning_rate": 9.492984831368626e-05, + "loss": 0.2784019231796265, + "step": 12760 + }, + { + "epoch": 0.05482427895554812, + "grad_norm": 2.6145899295806885, + "learning_rate": 9.492553659356864e-05, + "loss": 0.21561193466186523, + "step": 12770 + }, + { + "epoch": 0.05486721104556812, + "grad_norm": 0.1213018000125885, + "learning_rate": 9.492122487345102e-05, + "loss": 0.15118281841278075, + "step": 12780 + }, + { + "epoch": 0.054910143135588126, + "grad_norm": 1.966002345085144, + "learning_rate": 9.49169131533334e-05, + "loss": 0.206246280670166, + "step": 12790 + }, + { + "epoch": 0.05495307522560813, + "grad_norm": 0.18352824449539185, + "learning_rate": 9.491260143321577e-05, + "loss": 0.26455848217010497, + "step": 12800 + }, + { + "epoch": 0.05499600731562814, + "grad_norm": 0.017357051372528076, + "learning_rate": 9.490828971309815e-05, + "loss": 0.23155784606933594, + "step": 12810 + }, + { + "epoch": 0.05503893940564815, + "grad_norm": 0.009231144562363625, + "learning_rate": 9.490397799298051e-05, + "loss": 0.27017381191253664, + "step": 12820 + }, + { + "epoch": 0.05508187149566815, + "grad_norm": 1.3265694379806519, + "learning_rate": 9.489966627286289e-05, + "loss": 0.404315185546875, + "step": 12830 + }, + { + "epoch": 0.05512480358568816, + "grad_norm": 2.65555477142334, + "learning_rate": 9.489535455274527e-05, + "loss": 0.30979869365692136, + "step": 12840 + }, + { + "epoch": 0.055167735675708165, + "grad_norm": 1.8362942934036255, + "learning_rate": 9.489104283262765e-05, + "loss": 0.19149515628814698, + "step": 12850 + }, + { + "epoch": 0.05521066776572817, + "grad_norm": 0.008390932343900204, + "learning_rate": 9.488673111251004e-05, + "loss": 0.11477944850921631, + "step": 12860 + }, + { + "epoch": 0.055253599855748174, + "grad_norm": 19.582433700561523, + "learning_rate": 9.488241939239241e-05, + "loss": 0.2537912607192993, + "step": 12870 + }, + { + "epoch": 0.05529653194576818, + "grad_norm": 0.001269947155378759, + "learning_rate": 9.487810767227479e-05, + "loss": 0.2521425008773804, + "step": 12880 + }, + { + "epoch": 0.05533946403578819, + "grad_norm": 0.5160092711448669, + "learning_rate": 9.487379595215717e-05, + "loss": 0.22307281494140624, + "step": 12890 + }, + { + "epoch": 0.0553823961258082, + "grad_norm": 2.3668174743652344, + "learning_rate": 9.486948423203953e-05, + "loss": 0.35943007469177246, + "step": 12900 + }, + { + "epoch": 0.055425328215828205, + "grad_norm": 0.07710514962673187, + "learning_rate": 9.486517251192191e-05, + "loss": 0.21076421737670897, + "step": 12910 + }, + { + "epoch": 0.055468260305848206, + "grad_norm": 0.017242876812815666, + "learning_rate": 9.486086079180429e-05, + "loss": 0.07508084177970886, + "step": 12920 + }, + { + "epoch": 0.055511192395868214, + "grad_norm": 0.10909157246351242, + "learning_rate": 9.485654907168666e-05, + "loss": 0.15512967109680176, + "step": 12930 + }, + { + "epoch": 0.05555412448588822, + "grad_norm": 0.8637058138847351, + "learning_rate": 9.485223735156904e-05, + "loss": 0.4028017044067383, + "step": 12940 + }, + { + "epoch": 0.05559705657590823, + "grad_norm": 1.707521915435791, + "learning_rate": 9.484792563145142e-05, + "loss": 0.27000603675842283, + "step": 12950 + }, + { + "epoch": 0.05563998866592824, + "grad_norm": 0.002830659504979849, + "learning_rate": 9.48436139113338e-05, + "loss": 0.1828344225883484, + "step": 12960 + }, + { + "epoch": 0.05568292075594824, + "grad_norm": 20.095304489135742, + "learning_rate": 9.483930219121617e-05, + "loss": 0.5967009544372559, + "step": 12970 + }, + { + "epoch": 0.055725852845968246, + "grad_norm": 3.065985679626465, + "learning_rate": 9.483499047109854e-05, + "loss": 0.1959398627281189, + "step": 12980 + }, + { + "epoch": 0.055768784935988254, + "grad_norm": 1.2307249307632446, + "learning_rate": 9.483067875098092e-05, + "loss": 0.3424776792526245, + "step": 12990 + }, + { + "epoch": 0.05581171702600826, + "grad_norm": 0.06579990684986115, + "learning_rate": 9.482636703086329e-05, + "loss": 0.23139922618865966, + "step": 13000 + }, + { + "epoch": 0.05581171702600826, + "eval_loss": 0.5209886431694031, + "eval_runtime": 27.4824, + "eval_samples_per_second": 3.639, + "eval_steps_per_second": 3.639, + "step": 13000 + }, + { + "epoch": 0.05585464911602827, + "grad_norm": 1.030726671218872, + "learning_rate": 9.482205531074567e-05, + "loss": 0.24501314163208007, + "step": 13010 + }, + { + "epoch": 0.05589758120604827, + "grad_norm": 0.21431680023670197, + "learning_rate": 9.481774359062805e-05, + "loss": 0.16148560047149657, + "step": 13020 + }, + { + "epoch": 0.05594051329606828, + "grad_norm": 4.152802467346191, + "learning_rate": 9.481343187051042e-05, + "loss": 0.0998810887336731, + "step": 13030 + }, + { + "epoch": 0.055983445386088286, + "grad_norm": 0.12929271161556244, + "learning_rate": 9.48091201503928e-05, + "loss": 0.3375421047210693, + "step": 13040 + }, + { + "epoch": 0.056026377476108294, + "grad_norm": 3.620896577835083, + "learning_rate": 9.480480843027518e-05, + "loss": 0.32858121395111084, + "step": 13050 + }, + { + "epoch": 0.0560693095661283, + "grad_norm": 4.256040573120117, + "learning_rate": 9.480049671015756e-05, + "loss": 0.33753039836883547, + "step": 13060 + }, + { + "epoch": 0.0561122416561483, + "grad_norm": 0.004805037286132574, + "learning_rate": 9.479618499003992e-05, + "loss": 0.2353214979171753, + "step": 13070 + }, + { + "epoch": 0.05615517374616831, + "grad_norm": 1.3188596963882446, + "learning_rate": 9.479187326992231e-05, + "loss": 0.14669665098190307, + "step": 13080 + }, + { + "epoch": 0.05619810583618832, + "grad_norm": 0.1885381042957306, + "learning_rate": 9.478756154980469e-05, + "loss": 0.4346306324005127, + "step": 13090 + }, + { + "epoch": 0.056241037926208326, + "grad_norm": 0.03456701338291168, + "learning_rate": 9.478324982968707e-05, + "loss": 0.05812343955039978, + "step": 13100 + }, + { + "epoch": 0.05628397001622833, + "grad_norm": 0.0404074490070343, + "learning_rate": 9.477893810956944e-05, + "loss": 0.1765100598335266, + "step": 13110 + }, + { + "epoch": 0.056326902106248335, + "grad_norm": 1.5887359380722046, + "learning_rate": 9.477462638945182e-05, + "loss": 0.3653215169906616, + "step": 13120 + }, + { + "epoch": 0.05636983419626834, + "grad_norm": 0.08524619042873383, + "learning_rate": 9.47703146693342e-05, + "loss": 0.4067636489868164, + "step": 13130 + }, + { + "epoch": 0.05641276628628835, + "grad_norm": 0.9142935276031494, + "learning_rate": 9.476600294921658e-05, + "loss": 0.17017019987106324, + "step": 13140 + }, + { + "epoch": 0.05645569837630836, + "grad_norm": 3.3787689208984375, + "learning_rate": 9.476169122909894e-05, + "loss": 0.15745049715042114, + "step": 13150 + }, + { + "epoch": 0.05649863046632836, + "grad_norm": 1.9701745510101318, + "learning_rate": 9.475737950898132e-05, + "loss": 0.23845574855804444, + "step": 13160 + }, + { + "epoch": 0.05654156255634837, + "grad_norm": 3.9363882541656494, + "learning_rate": 9.47530677888637e-05, + "loss": 0.2576152801513672, + "step": 13170 + }, + { + "epoch": 0.056584494646368375, + "grad_norm": 0.0014147092588245869, + "learning_rate": 9.474875606874607e-05, + "loss": 0.18658721446990967, + "step": 13180 + }, + { + "epoch": 0.05662742673638838, + "grad_norm": 0.4537833333015442, + "learning_rate": 9.474444434862845e-05, + "loss": 0.3402831554412842, + "step": 13190 + }, + { + "epoch": 0.05667035882640839, + "grad_norm": 1.0720131397247314, + "learning_rate": 9.474013262851083e-05, + "loss": 0.27713770866394044, + "step": 13200 + }, + { + "epoch": 0.05671329091642839, + "grad_norm": 2.2132678031921387, + "learning_rate": 9.47358209083932e-05, + "loss": 0.13246928453445433, + "step": 13210 + }, + { + "epoch": 0.0567562230064484, + "grad_norm": 0.004071402829140425, + "learning_rate": 9.473150918827558e-05, + "loss": 0.3325110912322998, + "step": 13220 + }, + { + "epoch": 0.05679915509646841, + "grad_norm": 1.9012278318405151, + "learning_rate": 9.472719746815794e-05, + "loss": 0.3323903799057007, + "step": 13230 + }, + { + "epoch": 0.056842087186488414, + "grad_norm": 0.011959442868828773, + "learning_rate": 9.472288574804032e-05, + "loss": 0.2820717811584473, + "step": 13240 + }, + { + "epoch": 0.05688501927650842, + "grad_norm": 0.05983627215027809, + "learning_rate": 9.47185740279227e-05, + "loss": 0.15745246410369873, + "step": 13250 + }, + { + "epoch": 0.05692795136652842, + "grad_norm": 1.0197789669036865, + "learning_rate": 9.471426230780508e-05, + "loss": 0.12266286611557006, + "step": 13260 + }, + { + "epoch": 0.05697088345654843, + "grad_norm": 2.0742173194885254, + "learning_rate": 9.470995058768745e-05, + "loss": 0.20585498809814454, + "step": 13270 + }, + { + "epoch": 0.05701381554656844, + "grad_norm": 4.345624923706055, + "learning_rate": 9.470563886756983e-05, + "loss": 0.24817166328430176, + "step": 13280 + }, + { + "epoch": 0.05705674763658845, + "grad_norm": 2.2066709995269775, + "learning_rate": 9.470132714745221e-05, + "loss": 0.26742217540740965, + "step": 13290 + }, + { + "epoch": 0.05709967972660845, + "grad_norm": 0.2013663947582245, + "learning_rate": 9.469701542733459e-05, + "loss": 0.33151028156280515, + "step": 13300 + }, + { + "epoch": 0.057142611816628455, + "grad_norm": 0.0009617611649446189, + "learning_rate": 9.469270370721696e-05, + "loss": 0.4479741096496582, + "step": 13310 + }, + { + "epoch": 0.05718554390664846, + "grad_norm": 0.02231052704155445, + "learning_rate": 9.468839198709934e-05, + "loss": 0.3095113277435303, + "step": 13320 + }, + { + "epoch": 0.05722847599666847, + "grad_norm": 0.9156794548034668, + "learning_rate": 9.468408026698172e-05, + "loss": 0.3460153341293335, + "step": 13330 + }, + { + "epoch": 0.05727140808668848, + "grad_norm": 0.059285569936037064, + "learning_rate": 9.46797685468641e-05, + "loss": 0.15585129261016845, + "step": 13340 + }, + { + "epoch": 0.05731434017670848, + "grad_norm": 8.328903198242188, + "learning_rate": 9.467545682674647e-05, + "loss": 0.21851041316986083, + "step": 13350 + }, + { + "epoch": 0.05735727226672849, + "grad_norm": 9.454680442810059, + "learning_rate": 9.467114510662885e-05, + "loss": 0.3026920795440674, + "step": 13360 + }, + { + "epoch": 0.057400204356748495, + "grad_norm": 0.2567834258079529, + "learning_rate": 9.466683338651123e-05, + "loss": 0.22752134799957274, + "step": 13370 + }, + { + "epoch": 0.0574431364467685, + "grad_norm": 0.7814629673957825, + "learning_rate": 9.46625216663936e-05, + "loss": 0.18589977025985718, + "step": 13380 + }, + { + "epoch": 0.05748606853678851, + "grad_norm": 0.10805714875459671, + "learning_rate": 9.465820994627597e-05, + "loss": 0.42492899894714353, + "step": 13390 + }, + { + "epoch": 0.05752900062680851, + "grad_norm": 4.603877067565918, + "learning_rate": 9.465389822615835e-05, + "loss": 0.5462126731872559, + "step": 13400 + }, + { + "epoch": 0.05757193271682852, + "grad_norm": 4.8273820877075195, + "learning_rate": 9.464958650604072e-05, + "loss": 0.16398582458496094, + "step": 13410 + }, + { + "epoch": 0.05761486480684853, + "grad_norm": 0.11256857961416245, + "learning_rate": 9.46452747859231e-05, + "loss": 0.2469719886779785, + "step": 13420 + }, + { + "epoch": 0.057657796896868535, + "grad_norm": 1.4045051336288452, + "learning_rate": 9.464096306580548e-05, + "loss": 0.14543576240539552, + "step": 13430 + }, + { + "epoch": 0.05770072898688854, + "grad_norm": 1.5675419569015503, + "learning_rate": 9.463665134568785e-05, + "loss": 0.1374026656150818, + "step": 13440 + }, + { + "epoch": 0.057743661076908544, + "grad_norm": 0.041778989136219025, + "learning_rate": 9.463233962557023e-05, + "loss": 0.2824827194213867, + "step": 13450 + }, + { + "epoch": 0.05778659316692855, + "grad_norm": 2.386615514755249, + "learning_rate": 9.462802790545261e-05, + "loss": 0.2739110946655273, + "step": 13460 + }, + { + "epoch": 0.05782952525694856, + "grad_norm": 0.09681998938322067, + "learning_rate": 9.462371618533499e-05, + "loss": 0.25143866539001464, + "step": 13470 + }, + { + "epoch": 0.05787245734696857, + "grad_norm": 0.08663687855005264, + "learning_rate": 9.461940446521735e-05, + "loss": 0.16926002502441406, + "step": 13480 + }, + { + "epoch": 0.057915389436988575, + "grad_norm": 0.49383774399757385, + "learning_rate": 9.461509274509973e-05, + "loss": 0.1362561821937561, + "step": 13490 + }, + { + "epoch": 0.057958321527008576, + "grad_norm": 0.04574750363826752, + "learning_rate": 9.46107810249821e-05, + "loss": 0.1413083553314209, + "step": 13500 + }, + { + "epoch": 0.058001253617028584, + "grad_norm": 1.0846718549728394, + "learning_rate": 9.460646930486448e-05, + "loss": 0.3449979305267334, + "step": 13510 + }, + { + "epoch": 0.05804418570704859, + "grad_norm": 0.017421284690499306, + "learning_rate": 9.460215758474686e-05, + "loss": 0.05157340168952942, + "step": 13520 + }, + { + "epoch": 0.0580871177970686, + "grad_norm": 0.024469420313835144, + "learning_rate": 9.459784586462924e-05, + "loss": 0.5607762336730957, + "step": 13530 + }, + { + "epoch": 0.0581300498870886, + "grad_norm": 0.11292649805545807, + "learning_rate": 9.459353414451161e-05, + "loss": 0.36970949172973633, + "step": 13540 + }, + { + "epoch": 0.05817298197710861, + "grad_norm": 5.711234092712402, + "learning_rate": 9.458922242439399e-05, + "loss": 0.3390767574310303, + "step": 13550 + }, + { + "epoch": 0.058215914067128616, + "grad_norm": 0.1786605715751648, + "learning_rate": 9.458491070427637e-05, + "loss": 0.3094156503677368, + "step": 13560 + }, + { + "epoch": 0.058258846157148624, + "grad_norm": 16.943851470947266, + "learning_rate": 9.458059898415875e-05, + "loss": 0.29798853397369385, + "step": 13570 + }, + { + "epoch": 0.05830177824716863, + "grad_norm": 1.147255778312683, + "learning_rate": 9.457628726404112e-05, + "loss": 0.18354703187942506, + "step": 13580 + }, + { + "epoch": 0.05834471033718863, + "grad_norm": 0.9971585869789124, + "learning_rate": 9.45719755439235e-05, + "loss": 0.16321992874145508, + "step": 13590 + }, + { + "epoch": 0.05838764242720864, + "grad_norm": 0.028112033382058144, + "learning_rate": 9.456766382380588e-05, + "loss": 0.21919541358947753, + "step": 13600 + }, + { + "epoch": 0.05843057451722865, + "grad_norm": 1.8403459787368774, + "learning_rate": 9.456335210368826e-05, + "loss": 0.3318035125732422, + "step": 13610 + }, + { + "epoch": 0.058473506607248656, + "grad_norm": 4.8288044929504395, + "learning_rate": 9.455904038357063e-05, + "loss": 0.1652234435081482, + "step": 13620 + }, + { + "epoch": 0.05851643869726866, + "grad_norm": 1.7937736511230469, + "learning_rate": 9.455472866345301e-05, + "loss": 0.5239839553833008, + "step": 13630 + }, + { + "epoch": 0.058559370787288664, + "grad_norm": 0.09594344347715378, + "learning_rate": 9.455041694333537e-05, + "loss": 0.39256093502044676, + "step": 13640 + }, + { + "epoch": 0.05860230287730867, + "grad_norm": 3.84417724609375, + "learning_rate": 9.454610522321775e-05, + "loss": 0.28167335987091063, + "step": 13650 + }, + { + "epoch": 0.05864523496732868, + "grad_norm": 0.060315635055303574, + "learning_rate": 9.454179350310013e-05, + "loss": 0.28380427360534666, + "step": 13660 + }, + { + "epoch": 0.05868816705734869, + "grad_norm": 0.10807570815086365, + "learning_rate": 9.45374817829825e-05, + "loss": 0.22851717472076416, + "step": 13670 + }, + { + "epoch": 0.058731099147368696, + "grad_norm": 0.21893347799777985, + "learning_rate": 9.453317006286488e-05, + "loss": 0.299111008644104, + "step": 13680 + }, + { + "epoch": 0.058774031237388696, + "grad_norm": 0.656207799911499, + "learning_rate": 9.452885834274726e-05, + "loss": 0.164165723323822, + "step": 13690 + }, + { + "epoch": 0.058816963327408704, + "grad_norm": 0.8610594272613525, + "learning_rate": 9.452454662262964e-05, + "loss": 0.2684111356735229, + "step": 13700 + }, + { + "epoch": 0.05885989541742871, + "grad_norm": 0.04181723669171333, + "learning_rate": 9.452023490251202e-05, + "loss": 0.21170082092285156, + "step": 13710 + }, + { + "epoch": 0.05890282750744872, + "grad_norm": 0.030319862067699432, + "learning_rate": 9.451592318239438e-05, + "loss": 0.0055593281984329225, + "step": 13720 + }, + { + "epoch": 0.05894575959746872, + "grad_norm": 2.068293333053589, + "learning_rate": 9.451161146227676e-05, + "loss": 0.36513285636901854, + "step": 13730 + }, + { + "epoch": 0.05898869168748873, + "grad_norm": 0.039667144417762756, + "learning_rate": 9.450729974215913e-05, + "loss": 0.3398525953292847, + "step": 13740 + }, + { + "epoch": 0.059031623777508736, + "grad_norm": 1.1753939390182495, + "learning_rate": 9.450298802204151e-05, + "loss": 0.2801861047744751, + "step": 13750 + }, + { + "epoch": 0.059074555867528744, + "grad_norm": 1.5181111097335815, + "learning_rate": 9.449867630192389e-05, + "loss": 0.3567522048950195, + "step": 13760 + }, + { + "epoch": 0.05911748795754875, + "grad_norm": 0.018359310925006866, + "learning_rate": 9.449436458180627e-05, + "loss": 0.2326261043548584, + "step": 13770 + }, + { + "epoch": 0.05916042004756875, + "grad_norm": 0.02728513814508915, + "learning_rate": 9.449005286168864e-05, + "loss": 0.1719570279121399, + "step": 13780 + }, + { + "epoch": 0.05920335213758876, + "grad_norm": 0.039325762540102005, + "learning_rate": 9.448574114157102e-05, + "loss": 0.2949488878250122, + "step": 13790 + }, + { + "epoch": 0.05924628422760877, + "grad_norm": 0.020210914313793182, + "learning_rate": 9.44814294214534e-05, + "loss": 0.2757899761199951, + "step": 13800 + }, + { + "epoch": 0.059289216317628776, + "grad_norm": 6.97475528717041, + "learning_rate": 9.447711770133578e-05, + "loss": 0.3151508092880249, + "step": 13810 + }, + { + "epoch": 0.059332148407648784, + "grad_norm": 0.18654370307922363, + "learning_rate": 9.447280598121815e-05, + "loss": 0.3797006368637085, + "step": 13820 + }, + { + "epoch": 0.059375080497668785, + "grad_norm": 0.02834288775920868, + "learning_rate": 9.446849426110053e-05, + "loss": 0.28402860164642335, + "step": 13830 + }, + { + "epoch": 0.05941801258768879, + "grad_norm": 0.07072301208972931, + "learning_rate": 9.446418254098291e-05, + "loss": 0.2836951971054077, + "step": 13840 + }, + { + "epoch": 0.0594609446777088, + "grad_norm": 0.10318069159984589, + "learning_rate": 9.445987082086529e-05, + "loss": 0.22994587421417237, + "step": 13850 + }, + { + "epoch": 0.05950387676772881, + "grad_norm": 0.228297621011734, + "learning_rate": 9.445555910074766e-05, + "loss": 0.2023998975753784, + "step": 13860 + }, + { + "epoch": 0.059546808857748816, + "grad_norm": 1.7108619213104248, + "learning_rate": 9.445124738063004e-05, + "loss": 0.273990535736084, + "step": 13870 + }, + { + "epoch": 0.05958974094776882, + "grad_norm": 0.20896852016448975, + "learning_rate": 9.444693566051242e-05, + "loss": 0.15025321245193482, + "step": 13880 + }, + { + "epoch": 0.059632673037788825, + "grad_norm": 0.3814859688282013, + "learning_rate": 9.444262394039478e-05, + "loss": 0.2851571798324585, + "step": 13890 + }, + { + "epoch": 0.05967560512780883, + "grad_norm": 0.16517090797424316, + "learning_rate": 9.443831222027716e-05, + "loss": 0.21811909675598146, + "step": 13900 + }, + { + "epoch": 0.05971853721782884, + "grad_norm": 0.06338504701852798, + "learning_rate": 9.443400050015954e-05, + "loss": 0.2580056667327881, + "step": 13910 + }, + { + "epoch": 0.05976146930784884, + "grad_norm": 0.006939787417650223, + "learning_rate": 9.442968878004191e-05, + "loss": 0.13981956243515015, + "step": 13920 + }, + { + "epoch": 0.05980440139786885, + "grad_norm": 0.019513536244630814, + "learning_rate": 9.442537705992429e-05, + "loss": 0.3545896768569946, + "step": 13930 + }, + { + "epoch": 0.05984733348788886, + "grad_norm": 0.02124965377151966, + "learning_rate": 9.442106533980667e-05, + "loss": 0.31715543270111085, + "step": 13940 + }, + { + "epoch": 0.059890265577908865, + "grad_norm": 0.054176412522792816, + "learning_rate": 9.441675361968905e-05, + "loss": 0.245257568359375, + "step": 13950 + }, + { + "epoch": 0.05993319766792887, + "grad_norm": 0.04804272949695587, + "learning_rate": 9.441244189957142e-05, + "loss": 0.06981720924377441, + "step": 13960 + }, + { + "epoch": 0.05997612975794887, + "grad_norm": 0.024836163967847824, + "learning_rate": 9.440813017945379e-05, + "loss": 0.31092960834503175, + "step": 13970 + }, + { + "epoch": 0.06001906184796888, + "grad_norm": 0.05360177159309387, + "learning_rate": 9.440381845933616e-05, + "loss": 0.27363669872283936, + "step": 13980 + }, + { + "epoch": 0.06006199393798889, + "grad_norm": 0.25896015763282776, + "learning_rate": 9.439950673921854e-05, + "loss": 0.37048931121826173, + "step": 13990 + }, + { + "epoch": 0.0601049260280089, + "grad_norm": 3.391026020050049, + "learning_rate": 9.439519501910092e-05, + "loss": 0.38292765617370605, + "step": 14000 + }, + { + "epoch": 0.0601049260280089, + "eval_loss": 0.5119574666023254, + "eval_runtime": 27.5231, + "eval_samples_per_second": 3.633, + "eval_steps_per_second": 3.633, + "step": 14000 + }, + { + "epoch": 0.060147858118028905, + "grad_norm": 0.10783776640892029, + "learning_rate": 9.43908832989833e-05, + "loss": 0.4476637363433838, + "step": 14010 + }, + { + "epoch": 0.060190790208048905, + "grad_norm": 0.0501650795340538, + "learning_rate": 9.438657157886567e-05, + "loss": 0.19077664613723755, + "step": 14020 + }, + { + "epoch": 0.06023372229806891, + "grad_norm": 3.299262046813965, + "learning_rate": 9.438225985874805e-05, + "loss": 0.4837735652923584, + "step": 14030 + }, + { + "epoch": 0.06027665438808892, + "grad_norm": 0.10058021545410156, + "learning_rate": 9.437794813863043e-05, + "loss": 0.39572231769561766, + "step": 14040 + }, + { + "epoch": 0.06031958647810893, + "grad_norm": 2.3452744483947754, + "learning_rate": 9.43736364185128e-05, + "loss": 0.2922865629196167, + "step": 14050 + }, + { + "epoch": 0.06036251856812894, + "grad_norm": 0.03752734139561653, + "learning_rate": 9.436932469839518e-05, + "loss": 0.26948659420013427, + "step": 14060 + }, + { + "epoch": 0.06040545065814894, + "grad_norm": 0.21016894280910492, + "learning_rate": 9.436501297827756e-05, + "loss": 0.07004184126853943, + "step": 14070 + }, + { + "epoch": 0.060448382748168945, + "grad_norm": 1.554970145225525, + "learning_rate": 9.436070125815994e-05, + "loss": 0.488129997253418, + "step": 14080 + }, + { + "epoch": 0.06049131483818895, + "grad_norm": 3.1139557361602783, + "learning_rate": 9.435638953804231e-05, + "loss": 0.2959815740585327, + "step": 14090 + }, + { + "epoch": 0.06053424692820896, + "grad_norm": 0.07403170317411423, + "learning_rate": 9.435207781792469e-05, + "loss": 0.14179257154464722, + "step": 14100 + }, + { + "epoch": 0.06057717901822897, + "grad_norm": 0.5391601324081421, + "learning_rate": 9.434776609780707e-05, + "loss": 0.11069589853286743, + "step": 14110 + }, + { + "epoch": 0.06062011110824897, + "grad_norm": 2.3623971939086914, + "learning_rate": 9.434345437768945e-05, + "loss": 0.3838085412979126, + "step": 14120 + }, + { + "epoch": 0.06066304319826898, + "grad_norm": 10.063488006591797, + "learning_rate": 9.433914265757181e-05, + "loss": 0.2548569440841675, + "step": 14130 + }, + { + "epoch": 0.060705975288288985, + "grad_norm": 1.8734171390533447, + "learning_rate": 9.433483093745419e-05, + "loss": 0.2592017650604248, + "step": 14140 + }, + { + "epoch": 0.06074890737830899, + "grad_norm": 0.2549486756324768, + "learning_rate": 9.433051921733656e-05, + "loss": 0.18551105260849, + "step": 14150 + }, + { + "epoch": 0.060791839468328994, + "grad_norm": 0.09773687273263931, + "learning_rate": 9.432620749721894e-05, + "loss": 0.17357507944107056, + "step": 14160 + }, + { + "epoch": 0.060834771558349, + "grad_norm": 0.09439878910779953, + "learning_rate": 9.432189577710132e-05, + "loss": 0.45656538009643555, + "step": 14170 + }, + { + "epoch": 0.06087770364836901, + "grad_norm": 3.9225845336914062, + "learning_rate": 9.43175840569837e-05, + "loss": 0.24273622035980225, + "step": 14180 + }, + { + "epoch": 0.06092063573838902, + "grad_norm": 2.183349370956421, + "learning_rate": 9.431327233686607e-05, + "loss": 0.25051212310791016, + "step": 14190 + }, + { + "epoch": 0.060963567828409025, + "grad_norm": 3.013693332672119, + "learning_rate": 9.430896061674845e-05, + "loss": 0.258715295791626, + "step": 14200 + }, + { + "epoch": 0.061006499918429026, + "grad_norm": 3.5759599208831787, + "learning_rate": 9.430464889663083e-05, + "loss": 0.27321045398712157, + "step": 14210 + }, + { + "epoch": 0.061049432008449034, + "grad_norm": 0.26453498005867004, + "learning_rate": 9.430033717651319e-05, + "loss": 0.2096024513244629, + "step": 14220 + }, + { + "epoch": 0.06109236409846904, + "grad_norm": 1.056121826171875, + "learning_rate": 9.429602545639557e-05, + "loss": 0.3136213541030884, + "step": 14230 + }, + { + "epoch": 0.06113529618848905, + "grad_norm": 0.04215465858578682, + "learning_rate": 9.429171373627795e-05, + "loss": 0.2560518026351929, + "step": 14240 + }, + { + "epoch": 0.06117822827850906, + "grad_norm": 0.3055509328842163, + "learning_rate": 9.428740201616032e-05, + "loss": 0.26824657917022704, + "step": 14250 + }, + { + "epoch": 0.06122116036852906, + "grad_norm": 2.0319509506225586, + "learning_rate": 9.42830902960427e-05, + "loss": 0.2057985782623291, + "step": 14260 + }, + { + "epoch": 0.061264092458549066, + "grad_norm": 0.09097740799188614, + "learning_rate": 9.427877857592509e-05, + "loss": 0.25459158420562744, + "step": 14270 + }, + { + "epoch": 0.061307024548569074, + "grad_norm": 1.385293960571289, + "learning_rate": 9.427446685580747e-05, + "loss": 0.2401709794998169, + "step": 14280 + }, + { + "epoch": 0.06134995663858908, + "grad_norm": 0.0381929837167263, + "learning_rate": 9.427015513568985e-05, + "loss": 0.28802106380462644, + "step": 14290 + }, + { + "epoch": 0.06139288872860909, + "grad_norm": 0.14590157568454742, + "learning_rate": 9.426584341557221e-05, + "loss": 0.48211469650268557, + "step": 14300 + }, + { + "epoch": 0.06143582081862909, + "grad_norm": 0.004426190629601479, + "learning_rate": 9.426153169545459e-05, + "loss": 0.18446681499481202, + "step": 14310 + }, + { + "epoch": 0.0614787529086491, + "grad_norm": 0.8361203670501709, + "learning_rate": 9.425721997533697e-05, + "loss": 0.1543604850769043, + "step": 14320 + }, + { + "epoch": 0.061521684998669106, + "grad_norm": 2.4105887413024902, + "learning_rate": 9.425290825521934e-05, + "loss": 0.4578250885009766, + "step": 14330 + }, + { + "epoch": 0.061564617088689114, + "grad_norm": 6.35371208190918, + "learning_rate": 9.424859653510172e-05, + "loss": 0.1941359043121338, + "step": 14340 + }, + { + "epoch": 0.061607549178709115, + "grad_norm": 0.030798856168985367, + "learning_rate": 9.42442848149841e-05, + "loss": 0.38810138702392577, + "step": 14350 + }, + { + "epoch": 0.06165048126872912, + "grad_norm": 0.5181298851966858, + "learning_rate": 9.423997309486648e-05, + "loss": 0.05099499821662903, + "step": 14360 + }, + { + "epoch": 0.06169341335874913, + "grad_norm": 0.10615668445825577, + "learning_rate": 9.423566137474885e-05, + "loss": 0.11544710397720337, + "step": 14370 + }, + { + "epoch": 0.06173634544876914, + "grad_norm": 0.006546362768858671, + "learning_rate": 9.423134965463122e-05, + "loss": 0.20839293003082277, + "step": 14380 + }, + { + "epoch": 0.061779277538789146, + "grad_norm": 4.777956962585449, + "learning_rate": 9.42270379345136e-05, + "loss": 0.27350683212280275, + "step": 14390 + }, + { + "epoch": 0.06182220962880915, + "grad_norm": 0.22227756679058075, + "learning_rate": 9.422272621439597e-05, + "loss": 0.23988797664642333, + "step": 14400 + }, + { + "epoch": 0.061865141718829154, + "grad_norm": 6.438022136688232, + "learning_rate": 9.421841449427835e-05, + "loss": 0.6111140251159668, + "step": 14410 + }, + { + "epoch": 0.06190807380884916, + "grad_norm": 0.06184747815132141, + "learning_rate": 9.421410277416073e-05, + "loss": 0.2695784330368042, + "step": 14420 + }, + { + "epoch": 0.06195100589886917, + "grad_norm": 2.856022834777832, + "learning_rate": 9.42097910540431e-05, + "loss": 0.16448986530303955, + "step": 14430 + }, + { + "epoch": 0.06199393798888918, + "grad_norm": 0.024922547861933708, + "learning_rate": 9.420547933392548e-05, + "loss": 0.24486238956451417, + "step": 14440 + }, + { + "epoch": 0.06203687007890918, + "grad_norm": 0.06679052859544754, + "learning_rate": 9.420116761380786e-05, + "loss": 0.32473771572113036, + "step": 14450 + }, + { + "epoch": 0.062079802168929187, + "grad_norm": 1.0972843170166016, + "learning_rate": 9.419685589369022e-05, + "loss": 0.23681797981262206, + "step": 14460 + }, + { + "epoch": 0.062122734258949194, + "grad_norm": 1.9927321672439575, + "learning_rate": 9.41925441735726e-05, + "loss": 0.24192793369293214, + "step": 14470 + }, + { + "epoch": 0.0621656663489692, + "grad_norm": 0.08671754598617554, + "learning_rate": 9.418823245345498e-05, + "loss": 0.1729517936706543, + "step": 14480 + }, + { + "epoch": 0.06220859843898921, + "grad_norm": 0.3051014542579651, + "learning_rate": 9.418392073333737e-05, + "loss": 0.28752970695495605, + "step": 14490 + }, + { + "epoch": 0.06225153052900921, + "grad_norm": 4.915781021118164, + "learning_rate": 9.417960901321974e-05, + "loss": 0.2712711811065674, + "step": 14500 + }, + { + "epoch": 0.06229446261902922, + "grad_norm": 1.4001567363739014, + "learning_rate": 9.417529729310212e-05, + "loss": 0.5032999038696289, + "step": 14510 + }, + { + "epoch": 0.062337394709049226, + "grad_norm": 1.549265742301941, + "learning_rate": 9.41709855729845e-05, + "loss": 0.21653423309326172, + "step": 14520 + }, + { + "epoch": 0.062380326799069234, + "grad_norm": 6.566656589508057, + "learning_rate": 9.416667385286688e-05, + "loss": 0.30635671615600585, + "step": 14530 + }, + { + "epoch": 0.06242325888908924, + "grad_norm": 0.2426643669605255, + "learning_rate": 9.416236213274925e-05, + "loss": 0.2337181568145752, + "step": 14540 + }, + { + "epoch": 0.06246619097910924, + "grad_norm": 0.9807308316230774, + "learning_rate": 9.415805041263162e-05, + "loss": 0.2038658857345581, + "step": 14550 + }, + { + "epoch": 0.06250912306912926, + "grad_norm": 0.10739678144454956, + "learning_rate": 9.4153738692514e-05, + "loss": 0.20511481761932374, + "step": 14560 + }, + { + "epoch": 0.06255205515914926, + "grad_norm": 2.38814640045166, + "learning_rate": 9.414942697239637e-05, + "loss": 0.15794265270233154, + "step": 14570 + }, + { + "epoch": 0.06259498724916926, + "grad_norm": 8.256712913513184, + "learning_rate": 9.414511525227875e-05, + "loss": 0.3933896541595459, + "step": 14580 + }, + { + "epoch": 0.06263791933918927, + "grad_norm": 2.060042142868042, + "learning_rate": 9.414080353216113e-05, + "loss": 0.18279197216033935, + "step": 14590 + }, + { + "epoch": 0.06268085142920927, + "grad_norm": 0.012487399391829967, + "learning_rate": 9.41364918120435e-05, + "loss": 0.14272109270095826, + "step": 14600 + }, + { + "epoch": 0.06272378351922929, + "grad_norm": 2.3638439178466797, + "learning_rate": 9.413218009192588e-05, + "loss": 0.1730712652206421, + "step": 14610 + }, + { + "epoch": 0.06276671560924929, + "grad_norm": 1.4593660831451416, + "learning_rate": 9.412786837180826e-05, + "loss": 0.23349535465240479, + "step": 14620 + }, + { + "epoch": 0.06280964769926929, + "grad_norm": 0.04631822183728218, + "learning_rate": 9.412355665169062e-05, + "loss": 0.3255072832107544, + "step": 14630 + }, + { + "epoch": 0.0628525797892893, + "grad_norm": 1.6218816041946411, + "learning_rate": 9.4119244931573e-05, + "loss": 0.3747283935546875, + "step": 14640 + }, + { + "epoch": 0.06289551187930931, + "grad_norm": 0.73520827293396, + "learning_rate": 9.411493321145538e-05, + "loss": 0.17614601850509642, + "step": 14650 + }, + { + "epoch": 0.06293844396932931, + "grad_norm": 0.7931138277053833, + "learning_rate": 9.411062149133776e-05, + "loss": 0.24755749702453614, + "step": 14660 + }, + { + "epoch": 0.06298137605934932, + "grad_norm": 0.17208456993103027, + "learning_rate": 9.410630977122013e-05, + "loss": 0.44512248039245605, + "step": 14670 + }, + { + "epoch": 0.06302430814936932, + "grad_norm": 0.07860125601291656, + "learning_rate": 9.410199805110251e-05, + "loss": 0.25961253643035886, + "step": 14680 + }, + { + "epoch": 0.06306724023938934, + "grad_norm": 1.2539564371109009, + "learning_rate": 9.409768633098489e-05, + "loss": 0.24830968379974366, + "step": 14690 + }, + { + "epoch": 0.06311017232940934, + "grad_norm": 0.6141181588172913, + "learning_rate": 9.409337461086726e-05, + "loss": 0.4124382495880127, + "step": 14700 + }, + { + "epoch": 0.06315310441942934, + "grad_norm": 2.1591837406158447, + "learning_rate": 9.408906289074964e-05, + "loss": 0.3804943561553955, + "step": 14710 + }, + { + "epoch": 0.06319603650944935, + "grad_norm": 0.14408475160598755, + "learning_rate": 9.408475117063202e-05, + "loss": 0.06406650543212891, + "step": 14720 + }, + { + "epoch": 0.06323896859946936, + "grad_norm": 0.11168843507766724, + "learning_rate": 9.40804394505144e-05, + "loss": 0.1355807065963745, + "step": 14730 + }, + { + "epoch": 0.06328190068948937, + "grad_norm": 8.794105529785156, + "learning_rate": 9.407612773039677e-05, + "loss": 0.3579146385192871, + "step": 14740 + }, + { + "epoch": 0.06332483277950937, + "grad_norm": 2.165438175201416, + "learning_rate": 9.407181601027915e-05, + "loss": 0.24737703800201416, + "step": 14750 + }, + { + "epoch": 0.06336776486952937, + "grad_norm": 3.3912713527679443, + "learning_rate": 9.406750429016153e-05, + "loss": 0.3478167295455933, + "step": 14760 + }, + { + "epoch": 0.06341069695954939, + "grad_norm": 0.035082586109638214, + "learning_rate": 9.40631925700439e-05, + "loss": 0.2180727243423462, + "step": 14770 + }, + { + "epoch": 0.06345362904956939, + "grad_norm": 1.1248410940170288, + "learning_rate": 9.405888084992628e-05, + "loss": 0.20930137634277343, + "step": 14780 + }, + { + "epoch": 0.0634965611395894, + "grad_norm": 0.03349543362855911, + "learning_rate": 9.405456912980865e-05, + "loss": 0.2702275276184082, + "step": 14790 + }, + { + "epoch": 0.0635394932296094, + "grad_norm": 0.04150047525763512, + "learning_rate": 9.405025740969102e-05, + "loss": 0.18669117689132692, + "step": 14800 + }, + { + "epoch": 0.0635824253196294, + "grad_norm": 0.07978838682174683, + "learning_rate": 9.40459456895734e-05, + "loss": 0.12569122314453124, + "step": 14810 + }, + { + "epoch": 0.06362535740964942, + "grad_norm": 1.4422789812088013, + "learning_rate": 9.404163396945578e-05, + "loss": 0.2750370979309082, + "step": 14820 + }, + { + "epoch": 0.06366828949966942, + "grad_norm": 0.08112714439630508, + "learning_rate": 9.403732224933816e-05, + "loss": 0.35209102630615235, + "step": 14830 + }, + { + "epoch": 0.06371122158968943, + "grad_norm": 0.05240992456674576, + "learning_rate": 9.403301052922053e-05, + "loss": 0.1633044719696045, + "step": 14840 + }, + { + "epoch": 0.06375415367970944, + "grad_norm": 0.5782513618469238, + "learning_rate": 9.402869880910291e-05, + "loss": 0.28111331462860106, + "step": 14850 + }, + { + "epoch": 0.06379708576972944, + "grad_norm": 1.0291950702667236, + "learning_rate": 9.402438708898529e-05, + "loss": 0.18657200336456298, + "step": 14860 + }, + { + "epoch": 0.06384001785974945, + "grad_norm": 1.3704755306243896, + "learning_rate": 9.402007536886767e-05, + "loss": 0.33126370906829833, + "step": 14870 + }, + { + "epoch": 0.06388294994976945, + "grad_norm": 0.014527720399200916, + "learning_rate": 9.401576364875003e-05, + "loss": 0.27899258136749266, + "step": 14880 + }, + { + "epoch": 0.06392588203978947, + "grad_norm": 2.0788886547088623, + "learning_rate": 9.401145192863241e-05, + "loss": 0.248209810256958, + "step": 14890 + }, + { + "epoch": 0.06396881412980947, + "grad_norm": 1.4462379217147827, + "learning_rate": 9.400714020851478e-05, + "loss": 0.20047402381896973, + "step": 14900 + }, + { + "epoch": 0.06401174621982947, + "grad_norm": 1.2522448301315308, + "learning_rate": 9.400282848839716e-05, + "loss": 0.40030665397644044, + "step": 14910 + }, + { + "epoch": 0.06405467830984948, + "grad_norm": 0.7433568239212036, + "learning_rate": 9.399851676827954e-05, + "loss": 0.1965320587158203, + "step": 14920 + }, + { + "epoch": 0.06409761039986948, + "grad_norm": 0.19484727084636688, + "learning_rate": 9.399420504816192e-05, + "loss": 0.3624082565307617, + "step": 14930 + }, + { + "epoch": 0.0641405424898895, + "grad_norm": 2.0891435146331787, + "learning_rate": 9.39898933280443e-05, + "loss": 0.4050153732299805, + "step": 14940 + }, + { + "epoch": 0.0641834745799095, + "grad_norm": 0.0240493081510067, + "learning_rate": 9.398558160792667e-05, + "loss": 0.09888641238212585, + "step": 14950 + }, + { + "epoch": 0.0642264066699295, + "grad_norm": 0.9053465127944946, + "learning_rate": 9.398126988780905e-05, + "loss": 0.48057260513305666, + "step": 14960 + }, + { + "epoch": 0.06426933875994952, + "grad_norm": 0.017222393304109573, + "learning_rate": 9.397695816769143e-05, + "loss": 0.26117160320281985, + "step": 14970 + }, + { + "epoch": 0.06431227084996952, + "grad_norm": 0.4678298234939575, + "learning_rate": 9.39726464475738e-05, + "loss": 0.19400867223739623, + "step": 14980 + }, + { + "epoch": 0.06435520293998953, + "grad_norm": 3.2162866592407227, + "learning_rate": 9.396833472745618e-05, + "loss": 0.24385993480682372, + "step": 14990 + }, + { + "epoch": 0.06439813503000953, + "grad_norm": 0.5383427739143372, + "learning_rate": 9.396402300733856e-05, + "loss": 0.19195520877838135, + "step": 15000 + }, + { + "epoch": 0.06439813503000953, + "eval_loss": 0.5142996311187744, + "eval_runtime": 27.4677, + "eval_samples_per_second": 3.641, + "eval_steps_per_second": 3.641, + "step": 15000 + }, + { + "epoch": 0.06444106712002953, + "grad_norm": 0.5139740109443665, + "learning_rate": 9.395971128722094e-05, + "loss": 0.0931144118309021, + "step": 15010 + }, + { + "epoch": 0.06448399921004955, + "grad_norm": 1.9202089309692383, + "learning_rate": 9.395539956710331e-05, + "loss": 0.5149777889251709, + "step": 15020 + }, + { + "epoch": 0.06452693130006955, + "grad_norm": 1.7345205545425415, + "learning_rate": 9.395108784698569e-05, + "loss": 0.2525042533874512, + "step": 15030 + }, + { + "epoch": 0.06456986339008956, + "grad_norm": 0.01726091280579567, + "learning_rate": 9.394677612686805e-05, + "loss": 0.45943574905395507, + "step": 15040 + }, + { + "epoch": 0.06461279548010956, + "grad_norm": 0.04527002200484276, + "learning_rate": 9.394246440675043e-05, + "loss": 0.3036803722381592, + "step": 15050 + }, + { + "epoch": 0.06465572757012956, + "grad_norm": 0.010524548590183258, + "learning_rate": 9.393815268663281e-05, + "loss": 0.27003719806671145, + "step": 15060 + }, + { + "epoch": 0.06469865966014958, + "grad_norm": 0.6304553151130676, + "learning_rate": 9.393384096651519e-05, + "loss": 0.3148674964904785, + "step": 15070 + }, + { + "epoch": 0.06474159175016958, + "grad_norm": 1.2246551513671875, + "learning_rate": 9.392952924639756e-05, + "loss": 0.3761624813079834, + "step": 15080 + }, + { + "epoch": 0.06478452384018958, + "grad_norm": 3.657578229904175, + "learning_rate": 9.392521752627994e-05, + "loss": 0.16718716621398927, + "step": 15090 + }, + { + "epoch": 0.0648274559302096, + "grad_norm": 0.03699451684951782, + "learning_rate": 9.392090580616232e-05, + "loss": 0.2341052532196045, + "step": 15100 + }, + { + "epoch": 0.0648703880202296, + "grad_norm": 0.09733054786920547, + "learning_rate": 9.39165940860447e-05, + "loss": 0.21350882053375245, + "step": 15110 + }, + { + "epoch": 0.06491332011024961, + "grad_norm": 0.023660734295845032, + "learning_rate": 9.391228236592706e-05, + "loss": 0.27405989170074463, + "step": 15120 + }, + { + "epoch": 0.06495625220026961, + "grad_norm": 0.02325832098722458, + "learning_rate": 9.390797064580944e-05, + "loss": 0.06409944891929627, + "step": 15130 + }, + { + "epoch": 0.06499918429028961, + "grad_norm": 0.010780767537653446, + "learning_rate": 9.390365892569181e-05, + "loss": 0.29595441818237306, + "step": 15140 + }, + { + "epoch": 0.06504211638030963, + "grad_norm": 0.06848180294036865, + "learning_rate": 9.389934720557419e-05, + "loss": 0.3280792236328125, + "step": 15150 + }, + { + "epoch": 0.06508504847032963, + "grad_norm": 1.5644136667251587, + "learning_rate": 9.389503548545657e-05, + "loss": 0.2550457715988159, + "step": 15160 + }, + { + "epoch": 0.06512798056034964, + "grad_norm": 0.478125661611557, + "learning_rate": 9.389072376533895e-05, + "loss": 0.27299840450286866, + "step": 15170 + }, + { + "epoch": 0.06517091265036964, + "grad_norm": 38.812252044677734, + "learning_rate": 9.388641204522132e-05, + "loss": 0.17638283967971802, + "step": 15180 + }, + { + "epoch": 0.06521384474038965, + "grad_norm": 0.23070688545703888, + "learning_rate": 9.38821003251037e-05, + "loss": 0.21997244358062745, + "step": 15190 + }, + { + "epoch": 0.06525677683040966, + "grad_norm": 0.012052039615809917, + "learning_rate": 9.387778860498608e-05, + "loss": 0.07902588844299316, + "step": 15200 + }, + { + "epoch": 0.06529970892042966, + "grad_norm": 0.14975719153881073, + "learning_rate": 9.387347688486845e-05, + "loss": 0.4755707263946533, + "step": 15210 + }, + { + "epoch": 0.06534264101044968, + "grad_norm": 29.956811904907227, + "learning_rate": 9.386916516475083e-05, + "loss": 0.29420592784881594, + "step": 15220 + }, + { + "epoch": 0.06538557310046968, + "grad_norm": 0.009674137458205223, + "learning_rate": 9.386485344463321e-05, + "loss": 0.25805816650390623, + "step": 15230 + }, + { + "epoch": 0.06542850519048968, + "grad_norm": 0.31382089853286743, + "learning_rate": 9.386054172451559e-05, + "loss": 0.20677318572998046, + "step": 15240 + }, + { + "epoch": 0.06547143728050969, + "grad_norm": 0.0067361705005168915, + "learning_rate": 9.385623000439796e-05, + "loss": 0.10377358198165894, + "step": 15250 + }, + { + "epoch": 0.0655143693705297, + "grad_norm": 0.010374417528510094, + "learning_rate": 9.385191828428034e-05, + "loss": 0.24403734207153321, + "step": 15260 + }, + { + "epoch": 0.06555730146054971, + "grad_norm": 0.05472126975655556, + "learning_rate": 9.384760656416272e-05, + "loss": 0.3620487928390503, + "step": 15270 + }, + { + "epoch": 0.06560023355056971, + "grad_norm": 0.21027326583862305, + "learning_rate": 9.38432948440451e-05, + "loss": 0.182614266872406, + "step": 15280 + }, + { + "epoch": 0.06564316564058971, + "grad_norm": 0.31767529249191284, + "learning_rate": 9.383898312392746e-05, + "loss": 0.0882587492465973, + "step": 15290 + }, + { + "epoch": 0.06568609773060972, + "grad_norm": 0.03704505041241646, + "learning_rate": 9.383467140380984e-05, + "loss": 0.09384621977806092, + "step": 15300 + }, + { + "epoch": 0.06572902982062973, + "grad_norm": 0.025900261476635933, + "learning_rate": 9.383035968369221e-05, + "loss": 0.2080439329147339, + "step": 15310 + }, + { + "epoch": 0.06577196191064974, + "grad_norm": 0.07200033217668533, + "learning_rate": 9.382604796357459e-05, + "loss": 0.3437242031097412, + "step": 15320 + }, + { + "epoch": 0.06581489400066974, + "grad_norm": 0.7287998199462891, + "learning_rate": 9.382173624345697e-05, + "loss": 0.37477612495422363, + "step": 15330 + }, + { + "epoch": 0.06585782609068974, + "grad_norm": 0.013580391183495522, + "learning_rate": 9.381742452333935e-05, + "loss": 0.19882876873016359, + "step": 15340 + }, + { + "epoch": 0.06590075818070976, + "grad_norm": 1.0913523435592651, + "learning_rate": 9.381311280322172e-05, + "loss": 0.4215532302856445, + "step": 15350 + }, + { + "epoch": 0.06594369027072976, + "grad_norm": 3.035409688949585, + "learning_rate": 9.38088010831041e-05, + "loss": 0.31964831352233886, + "step": 15360 + }, + { + "epoch": 0.06598662236074977, + "grad_norm": 0.4063565731048584, + "learning_rate": 9.380448936298647e-05, + "loss": 0.22180414199829102, + "step": 15370 + }, + { + "epoch": 0.06602955445076977, + "grad_norm": 2.7936208248138428, + "learning_rate": 9.380017764286884e-05, + "loss": 0.28003358840942383, + "step": 15380 + }, + { + "epoch": 0.06607248654078977, + "grad_norm": 0.5528353452682495, + "learning_rate": 9.379586592275122e-05, + "loss": 0.45966687202453616, + "step": 15390 + }, + { + "epoch": 0.06611541863080979, + "grad_norm": 0.123682901263237, + "learning_rate": 9.37915542026336e-05, + "loss": 0.21645750999450683, + "step": 15400 + }, + { + "epoch": 0.06615835072082979, + "grad_norm": 1.5124139785766602, + "learning_rate": 9.378724248251597e-05, + "loss": 0.41568670272827146, + "step": 15410 + }, + { + "epoch": 0.0662012828108498, + "grad_norm": 0.14867492020130157, + "learning_rate": 9.378293076239835e-05, + "loss": 0.29629995822906496, + "step": 15420 + }, + { + "epoch": 0.0662442149008698, + "grad_norm": 0.3669738173484802, + "learning_rate": 9.377861904228073e-05, + "loss": 0.18541309833526612, + "step": 15430 + }, + { + "epoch": 0.0662871469908898, + "grad_norm": 0.015653476119041443, + "learning_rate": 9.37743073221631e-05, + "loss": 0.10632809400558471, + "step": 15440 + }, + { + "epoch": 0.06633007908090982, + "grad_norm": 0.02738998457789421, + "learning_rate": 9.376999560204548e-05, + "loss": 0.26311161518096926, + "step": 15450 + }, + { + "epoch": 0.06637301117092982, + "grad_norm": 0.12375674396753311, + "learning_rate": 9.376568388192786e-05, + "loss": 0.2682521820068359, + "step": 15460 + }, + { + "epoch": 0.06641594326094984, + "grad_norm": 0.06628740578889847, + "learning_rate": 9.376137216181024e-05, + "loss": 0.08452145457267761, + "step": 15470 + }, + { + "epoch": 0.06645887535096984, + "grad_norm": 0.08715321123600006, + "learning_rate": 9.375706044169262e-05, + "loss": 0.26986777782440186, + "step": 15480 + }, + { + "epoch": 0.06650180744098984, + "grad_norm": 0.11993555724620819, + "learning_rate": 9.3752748721575e-05, + "loss": 0.4222938060760498, + "step": 15490 + }, + { + "epoch": 0.06654473953100985, + "grad_norm": 0.10585108399391174, + "learning_rate": 9.374843700145737e-05, + "loss": 0.23077239990234374, + "step": 15500 + }, + { + "epoch": 0.06658767162102985, + "grad_norm": 0.08384846150875092, + "learning_rate": 9.374412528133975e-05, + "loss": 0.19676105976104735, + "step": 15510 + }, + { + "epoch": 0.06663060371104985, + "grad_norm": 1.7130482196807861, + "learning_rate": 9.373981356122213e-05, + "loss": 0.30971174240112304, + "step": 15520 + }, + { + "epoch": 0.06667353580106987, + "grad_norm": 1.3083285093307495, + "learning_rate": 9.373550184110449e-05, + "loss": 0.18229281902313232, + "step": 15530 + }, + { + "epoch": 0.06671646789108987, + "grad_norm": 0.05157879367470741, + "learning_rate": 9.373119012098687e-05, + "loss": 0.22775630950927733, + "step": 15540 + }, + { + "epoch": 0.06675939998110988, + "grad_norm": 0.1158638447523117, + "learning_rate": 9.372687840086924e-05, + "loss": 0.15219898223876954, + "step": 15550 + }, + { + "epoch": 0.06680233207112989, + "grad_norm": 1.0765706300735474, + "learning_rate": 9.372256668075162e-05, + "loss": 0.3147680997848511, + "step": 15560 + }, + { + "epoch": 0.06684526416114989, + "grad_norm": 1.6944249868392944, + "learning_rate": 9.3718254960634e-05, + "loss": 0.2515150785446167, + "step": 15570 + }, + { + "epoch": 0.0668881962511699, + "grad_norm": 2.6989057064056396, + "learning_rate": 9.371394324051638e-05, + "loss": 0.4220762252807617, + "step": 15580 + }, + { + "epoch": 0.0669311283411899, + "grad_norm": 0.8688747882843018, + "learning_rate": 9.370963152039875e-05, + "loss": 0.17261714935302735, + "step": 15590 + }, + { + "epoch": 0.06697406043120992, + "grad_norm": 0.10762748122215271, + "learning_rate": 9.370531980028113e-05, + "loss": 0.3298125982284546, + "step": 15600 + }, + { + "epoch": 0.06701699252122992, + "grad_norm": 0.31058886647224426, + "learning_rate": 9.370100808016351e-05, + "loss": 0.19337745904922485, + "step": 15610 + }, + { + "epoch": 0.06705992461124992, + "grad_norm": 4.391843795776367, + "learning_rate": 9.369669636004587e-05, + "loss": 0.16032843589782714, + "step": 15620 + }, + { + "epoch": 0.06710285670126993, + "grad_norm": 1.146741509437561, + "learning_rate": 9.369238463992825e-05, + "loss": 0.21080875396728516, + "step": 15630 + }, + { + "epoch": 0.06714578879128993, + "grad_norm": 3.3124451637268066, + "learning_rate": 9.368807291981063e-05, + "loss": 0.17837634086608886, + "step": 15640 + }, + { + "epoch": 0.06718872088130995, + "grad_norm": 0.04370800778269768, + "learning_rate": 9.3683761199693e-05, + "loss": 0.17239662408828735, + "step": 15650 + }, + { + "epoch": 0.06723165297132995, + "grad_norm": 0.13442166149616241, + "learning_rate": 9.367944947957538e-05, + "loss": 0.09528826475143433, + "step": 15660 + }, + { + "epoch": 0.06727458506134995, + "grad_norm": 2.838956832885742, + "learning_rate": 9.367513775945776e-05, + "loss": 0.347308874130249, + "step": 15670 + }, + { + "epoch": 0.06731751715136997, + "grad_norm": 0.13439913094043732, + "learning_rate": 9.367082603934015e-05, + "loss": 0.3867277860641479, + "step": 15680 + }, + { + "epoch": 0.06736044924138997, + "grad_norm": 1.1007906198501587, + "learning_rate": 9.366651431922253e-05, + "loss": 0.4312880516052246, + "step": 15690 + }, + { + "epoch": 0.06740338133140998, + "grad_norm": 9.591843605041504, + "learning_rate": 9.366220259910489e-05, + "loss": 0.25187973976135253, + "step": 15700 + }, + { + "epoch": 0.06744631342142998, + "grad_norm": 0.6064546704292297, + "learning_rate": 9.365789087898727e-05, + "loss": 0.18571261167526246, + "step": 15710 + }, + { + "epoch": 0.06748924551144998, + "grad_norm": 1.0531588792800903, + "learning_rate": 9.365357915886965e-05, + "loss": 0.3080170154571533, + "step": 15720 + }, + { + "epoch": 0.06753217760147, + "grad_norm": 3.3235676288604736, + "learning_rate": 9.364926743875202e-05, + "loss": 0.34673519134521485, + "step": 15730 + }, + { + "epoch": 0.06757510969149, + "grad_norm": 0.06666219234466553, + "learning_rate": 9.36449557186344e-05, + "loss": 0.17069684267044066, + "step": 15740 + }, + { + "epoch": 0.06761804178151001, + "grad_norm": 0.08851531147956848, + "learning_rate": 9.364064399851678e-05, + "loss": 0.33871867656707766, + "step": 15750 + }, + { + "epoch": 0.06766097387153001, + "grad_norm": 0.5347506403923035, + "learning_rate": 9.363633227839915e-05, + "loss": 0.2940122365951538, + "step": 15760 + }, + { + "epoch": 0.06770390596155001, + "grad_norm": 0.8870237469673157, + "learning_rate": 9.363202055828153e-05, + "loss": 0.3184309720993042, + "step": 15770 + }, + { + "epoch": 0.06774683805157003, + "grad_norm": 2.2229931354522705, + "learning_rate": 9.36277088381639e-05, + "loss": 0.280017352104187, + "step": 15780 + }, + { + "epoch": 0.06778977014159003, + "grad_norm": 0.06036687269806862, + "learning_rate": 9.362339711804627e-05, + "loss": 0.3001197576522827, + "step": 15790 + }, + { + "epoch": 0.06783270223161005, + "grad_norm": 0.059763338416814804, + "learning_rate": 9.361908539792865e-05, + "loss": 0.29081311225891116, + "step": 15800 + }, + { + "epoch": 0.06787563432163005, + "grad_norm": 0.09039962291717529, + "learning_rate": 9.361477367781103e-05, + "loss": 0.30297653675079345, + "step": 15810 + }, + { + "epoch": 0.06791856641165005, + "grad_norm": 0.24067221581935883, + "learning_rate": 9.36104619576934e-05, + "loss": 0.063690185546875, + "step": 15820 + }, + { + "epoch": 0.06796149850167006, + "grad_norm": 1.0791633129119873, + "learning_rate": 9.360615023757578e-05, + "loss": 0.3398705005645752, + "step": 15830 + }, + { + "epoch": 0.06800443059169006, + "grad_norm": 4.034187316894531, + "learning_rate": 9.360183851745816e-05, + "loss": 0.353275990486145, + "step": 15840 + }, + { + "epoch": 0.06804736268171008, + "grad_norm": 0.6785535216331482, + "learning_rate": 9.359752679734054e-05, + "loss": 0.13289799690246581, + "step": 15850 + }, + { + "epoch": 0.06809029477173008, + "grad_norm": 0.7840905785560608, + "learning_rate": 9.35932150772229e-05, + "loss": 0.12579288482666015, + "step": 15860 + }, + { + "epoch": 0.06813322686175008, + "grad_norm": 0.13649722933769226, + "learning_rate": 9.358890335710528e-05, + "loss": 0.25913591384887696, + "step": 15870 + }, + { + "epoch": 0.0681761589517701, + "grad_norm": 0.06511491537094116, + "learning_rate": 9.358459163698766e-05, + "loss": 0.21581885814666749, + "step": 15880 + }, + { + "epoch": 0.0682190910417901, + "grad_norm": 0.08669072389602661, + "learning_rate": 9.358027991687003e-05, + "loss": 0.3530642032623291, + "step": 15890 + }, + { + "epoch": 0.06826202313181011, + "grad_norm": 0.0684560239315033, + "learning_rate": 9.357596819675242e-05, + "loss": 0.24950690269470216, + "step": 15900 + }, + { + "epoch": 0.06830495522183011, + "grad_norm": 0.1603812426328659, + "learning_rate": 9.35716564766348e-05, + "loss": 0.2178800582885742, + "step": 15910 + }, + { + "epoch": 0.06834788731185011, + "grad_norm": 2.852123260498047, + "learning_rate": 9.356734475651718e-05, + "loss": 0.16984164714813232, + "step": 15920 + }, + { + "epoch": 0.06839081940187013, + "grad_norm": 1.6295524835586548, + "learning_rate": 9.356303303639956e-05, + "loss": 0.4048158168792725, + "step": 15930 + }, + { + "epoch": 0.06843375149189013, + "grad_norm": 3.4276375770568848, + "learning_rate": 9.355872131628192e-05, + "loss": 0.09007681012153626, + "step": 15940 + }, + { + "epoch": 0.06847668358191013, + "grad_norm": 27.015365600585938, + "learning_rate": 9.35544095961643e-05, + "loss": 0.12301526069641114, + "step": 15950 + }, + { + "epoch": 0.06851961567193014, + "grad_norm": 0.044064607471227646, + "learning_rate": 9.355009787604667e-05, + "loss": 0.3360439300537109, + "step": 15960 + }, + { + "epoch": 0.06856254776195014, + "grad_norm": 2.858654737472534, + "learning_rate": 9.354578615592905e-05, + "loss": 0.6317211151123047, + "step": 15970 + }, + { + "epoch": 0.06860547985197016, + "grad_norm": 0.2168719321489334, + "learning_rate": 9.354147443581143e-05, + "loss": 0.10691696405410767, + "step": 15980 + }, + { + "epoch": 0.06864841194199016, + "grad_norm": 0.17285951972007751, + "learning_rate": 9.35371627156938e-05, + "loss": 0.18733786344528197, + "step": 15990 + }, + { + "epoch": 0.06869134403201016, + "grad_norm": 0.08299694955348969, + "learning_rate": 9.353285099557618e-05, + "loss": 0.09542076587677002, + "step": 16000 + }, + { + "epoch": 0.06869134403201016, + "eval_loss": 0.526293933391571, + "eval_runtime": 27.5007, + "eval_samples_per_second": 3.636, + "eval_steps_per_second": 3.636, + "step": 16000 + }, + { + "epoch": 0.06873427612203017, + "grad_norm": 0.05496923252940178, + "learning_rate": 9.352853927545856e-05, + "loss": 0.1974226117134094, + "step": 16010 + }, + { + "epoch": 0.06877720821205018, + "grad_norm": 0.11612247675657272, + "learning_rate": 9.352422755534094e-05, + "loss": 0.3134061574935913, + "step": 16020 + }, + { + "epoch": 0.06882014030207019, + "grad_norm": 0.06240341067314148, + "learning_rate": 9.35199158352233e-05, + "loss": 0.09108485579490662, + "step": 16030 + }, + { + "epoch": 0.06886307239209019, + "grad_norm": 0.018856368958950043, + "learning_rate": 9.351560411510568e-05, + "loss": 0.1379598021507263, + "step": 16040 + }, + { + "epoch": 0.06890600448211019, + "grad_norm": 0.02072647213935852, + "learning_rate": 9.351129239498806e-05, + "loss": 0.29755070209503176, + "step": 16050 + }, + { + "epoch": 0.0689489365721302, + "grad_norm": 0.02678876370191574, + "learning_rate": 9.350698067487043e-05, + "loss": 0.19866015911102294, + "step": 16060 + }, + { + "epoch": 0.06899186866215021, + "grad_norm": 0.02596438117325306, + "learning_rate": 9.350266895475281e-05, + "loss": 0.23744065761566163, + "step": 16070 + }, + { + "epoch": 0.06903480075217022, + "grad_norm": 0.0036663708742707968, + "learning_rate": 9.349835723463519e-05, + "loss": 0.22252767086029052, + "step": 16080 + }, + { + "epoch": 0.06907773284219022, + "grad_norm": 1.597440242767334, + "learning_rate": 9.349404551451757e-05, + "loss": 0.3193185329437256, + "step": 16090 + }, + { + "epoch": 0.06912066493221022, + "grad_norm": 0.05906902626156807, + "learning_rate": 9.348973379439994e-05, + "loss": 0.2511482238769531, + "step": 16100 + }, + { + "epoch": 0.06916359702223024, + "grad_norm": 0.03994489088654518, + "learning_rate": 9.348542207428231e-05, + "loss": 0.24306204319000244, + "step": 16110 + }, + { + "epoch": 0.06920652911225024, + "grad_norm": 0.9707558155059814, + "learning_rate": 9.34811103541647e-05, + "loss": 0.3085232496261597, + "step": 16120 + }, + { + "epoch": 0.06924946120227025, + "grad_norm": 7.231551647186279, + "learning_rate": 9.347679863404708e-05, + "loss": 0.2969422101974487, + "step": 16130 + }, + { + "epoch": 0.06929239329229026, + "grad_norm": 0.19959791004657745, + "learning_rate": 9.347248691392945e-05, + "loss": 0.20120530128479003, + "step": 16140 + }, + { + "epoch": 0.06933532538231026, + "grad_norm": 3.905804395675659, + "learning_rate": 9.346817519381183e-05, + "loss": 0.31837916374206543, + "step": 16150 + }, + { + "epoch": 0.06937825747233027, + "grad_norm": 1.6950525045394897, + "learning_rate": 9.346386347369421e-05, + "loss": 0.21888651847839355, + "step": 16160 + }, + { + "epoch": 0.06942118956235027, + "grad_norm": 4.336323261260986, + "learning_rate": 9.345955175357658e-05, + "loss": 0.5113966464996338, + "step": 16170 + }, + { + "epoch": 0.06946412165237029, + "grad_norm": 0.053269702941179276, + "learning_rate": 9.345524003345896e-05, + "loss": 0.18296149969100953, + "step": 16180 + }, + { + "epoch": 0.06950705374239029, + "grad_norm": 0.09012717008590698, + "learning_rate": 9.345092831334133e-05, + "loss": 0.1996088743209839, + "step": 16190 + }, + { + "epoch": 0.06954998583241029, + "grad_norm": 2.09253191947937, + "learning_rate": 9.34466165932237e-05, + "loss": 0.20683207511901855, + "step": 16200 + }, + { + "epoch": 0.0695929179224303, + "grad_norm": 0.13705863058567047, + "learning_rate": 9.344230487310608e-05, + "loss": 0.30519049167633056, + "step": 16210 + }, + { + "epoch": 0.0696358500124503, + "grad_norm": 0.018574651330709457, + "learning_rate": 9.343799315298846e-05, + "loss": 0.3084603548049927, + "step": 16220 + }, + { + "epoch": 0.06967878210247032, + "grad_norm": 0.9541419744491577, + "learning_rate": 9.343368143287084e-05, + "loss": 0.3294121742248535, + "step": 16230 + }, + { + "epoch": 0.06972171419249032, + "grad_norm": 1.0622129440307617, + "learning_rate": 9.342936971275321e-05, + "loss": 0.48809447288513186, + "step": 16240 + }, + { + "epoch": 0.06976464628251032, + "grad_norm": 0.009066279046237469, + "learning_rate": 9.342505799263559e-05, + "loss": 0.04652267992496491, + "step": 16250 + }, + { + "epoch": 0.06980757837253034, + "grad_norm": 0.5664111375808716, + "learning_rate": 9.342074627251797e-05, + "loss": 0.31479432582855227, + "step": 16260 + }, + { + "epoch": 0.06985051046255034, + "grad_norm": 0.040703702718019485, + "learning_rate": 9.341643455240033e-05, + "loss": 0.10909019708633423, + "step": 16270 + }, + { + "epoch": 0.06989344255257035, + "grad_norm": 0.0748838409781456, + "learning_rate": 9.341212283228271e-05, + "loss": 0.26613683700561525, + "step": 16280 + }, + { + "epoch": 0.06993637464259035, + "grad_norm": 1.402508020401001, + "learning_rate": 9.340781111216509e-05, + "loss": 0.328706693649292, + "step": 16290 + }, + { + "epoch": 0.06997930673261035, + "grad_norm": 1.6031125783920288, + "learning_rate": 9.340349939204746e-05, + "loss": 0.20419700145721437, + "step": 16300 + }, + { + "epoch": 0.07002223882263037, + "grad_norm": 3.960529327392578, + "learning_rate": 9.339918767192984e-05, + "loss": 0.16980836391448975, + "step": 16310 + }, + { + "epoch": 0.07006517091265037, + "grad_norm": 0.040454789996147156, + "learning_rate": 9.339487595181222e-05, + "loss": 0.33831195831298827, + "step": 16320 + }, + { + "epoch": 0.07010810300267038, + "grad_norm": 0.18430018424987793, + "learning_rate": 9.33905642316946e-05, + "loss": 0.1638139009475708, + "step": 16330 + }, + { + "epoch": 0.07015103509269038, + "grad_norm": 2.9964406490325928, + "learning_rate": 9.338625251157697e-05, + "loss": 0.3662814378738403, + "step": 16340 + }, + { + "epoch": 0.07019396718271038, + "grad_norm": 0.0014150225324556231, + "learning_rate": 9.338194079145935e-05, + "loss": 0.2916199445724487, + "step": 16350 + }, + { + "epoch": 0.0702368992727304, + "grad_norm": 1.233773112297058, + "learning_rate": 9.337762907134173e-05, + "loss": 0.2907113552093506, + "step": 16360 + }, + { + "epoch": 0.0702798313627504, + "grad_norm": 0.06687705218791962, + "learning_rate": 9.33733173512241e-05, + "loss": 0.21224551200866698, + "step": 16370 + }, + { + "epoch": 0.0703227634527704, + "grad_norm": 3.684509038925171, + "learning_rate": 9.336900563110648e-05, + "loss": 0.20599827766418458, + "step": 16380 + }, + { + "epoch": 0.07036569554279042, + "grad_norm": 1.5613230466842651, + "learning_rate": 9.336469391098886e-05, + "loss": 0.26720128059387205, + "step": 16390 + }, + { + "epoch": 0.07040862763281042, + "grad_norm": 3.201432466506958, + "learning_rate": 9.336038219087124e-05, + "loss": 0.11232433319091797, + "step": 16400 + }, + { + "epoch": 0.07045155972283043, + "grad_norm": 0.6785762906074524, + "learning_rate": 9.335607047075361e-05, + "loss": 0.2559498310089111, + "step": 16410 + }, + { + "epoch": 0.07049449181285043, + "grad_norm": 0.44419634342193604, + "learning_rate": 9.335175875063599e-05, + "loss": 0.6221245765686035, + "step": 16420 + }, + { + "epoch": 0.07053742390287043, + "grad_norm": 0.08672723919153214, + "learning_rate": 9.334744703051837e-05, + "loss": 0.22470765113830565, + "step": 16430 + }, + { + "epoch": 0.07058035599289045, + "grad_norm": 6.374612331390381, + "learning_rate": 9.334313531040073e-05, + "loss": 0.2854200839996338, + "step": 16440 + }, + { + "epoch": 0.07062328808291045, + "grad_norm": 0.025272591039538383, + "learning_rate": 9.333882359028311e-05, + "loss": 0.3004850149154663, + "step": 16450 + }, + { + "epoch": 0.07066622017293046, + "grad_norm": 1.0149154663085938, + "learning_rate": 9.333451187016549e-05, + "loss": 0.160288405418396, + "step": 16460 + }, + { + "epoch": 0.07070915226295046, + "grad_norm": 0.10873989015817642, + "learning_rate": 9.333020015004786e-05, + "loss": 0.2299208402633667, + "step": 16470 + }, + { + "epoch": 0.07075208435297047, + "grad_norm": 0.05491790547966957, + "learning_rate": 9.332588842993024e-05, + "loss": 0.19463187456130981, + "step": 16480 + }, + { + "epoch": 0.07079501644299048, + "grad_norm": 1.5342626571655273, + "learning_rate": 9.332157670981262e-05, + "loss": 0.24398891925811766, + "step": 16490 + }, + { + "epoch": 0.07083794853301048, + "grad_norm": 0.13651759922504425, + "learning_rate": 9.3317264989695e-05, + "loss": 0.47367658615112307, + "step": 16500 + }, + { + "epoch": 0.0708808806230305, + "grad_norm": 0.634511411190033, + "learning_rate": 9.331295326957737e-05, + "loss": 0.47757797241210936, + "step": 16510 + }, + { + "epoch": 0.0709238127130505, + "grad_norm": 0.09785524755716324, + "learning_rate": 9.330864154945974e-05, + "loss": 0.17711851596832276, + "step": 16520 + }, + { + "epoch": 0.0709667448030705, + "grad_norm": 0.026648370549082756, + "learning_rate": 9.330432982934212e-05, + "loss": 0.14128479957580567, + "step": 16530 + }, + { + "epoch": 0.07100967689309051, + "grad_norm": 0.26906535029411316, + "learning_rate": 9.330001810922449e-05, + "loss": 0.142492938041687, + "step": 16540 + }, + { + "epoch": 0.07105260898311051, + "grad_norm": 1.0589438676834106, + "learning_rate": 9.329570638910687e-05, + "loss": 0.22687864303588867, + "step": 16550 + }, + { + "epoch": 0.07109554107313053, + "grad_norm": 1.6760090589523315, + "learning_rate": 9.329139466898925e-05, + "loss": 0.2446916103363037, + "step": 16560 + }, + { + "epoch": 0.07113847316315053, + "grad_norm": 0.03415419161319733, + "learning_rate": 9.328708294887162e-05, + "loss": 0.20921945571899414, + "step": 16570 + }, + { + "epoch": 0.07118140525317053, + "grad_norm": 0.9828307032585144, + "learning_rate": 9.3282771228754e-05, + "loss": 0.23197565078735352, + "step": 16580 + }, + { + "epoch": 0.07122433734319054, + "grad_norm": 0.10277032107114792, + "learning_rate": 9.327845950863638e-05, + "loss": 0.1265937328338623, + "step": 16590 + }, + { + "epoch": 0.07126726943321054, + "grad_norm": 0.5460087060928345, + "learning_rate": 9.327414778851876e-05, + "loss": 0.30683131217956544, + "step": 16600 + }, + { + "epoch": 0.07131020152323056, + "grad_norm": 1.562453031539917, + "learning_rate": 9.326983606840113e-05, + "loss": 0.13357644081115722, + "step": 16610 + }, + { + "epoch": 0.07135313361325056, + "grad_norm": 0.13830183446407318, + "learning_rate": 9.326552434828351e-05, + "loss": 0.28795995712280276, + "step": 16620 + }, + { + "epoch": 0.07139606570327056, + "grad_norm": 0.06272502988576889, + "learning_rate": 9.326121262816589e-05, + "loss": 0.2059838056564331, + "step": 16630 + }, + { + "epoch": 0.07143899779329058, + "grad_norm": 0.02094428613781929, + "learning_rate": 9.325690090804827e-05, + "loss": 0.05481228232383728, + "step": 16640 + }, + { + "epoch": 0.07148192988331058, + "grad_norm": 0.11069640517234802, + "learning_rate": 9.325258918793064e-05, + "loss": 0.1461290240287781, + "step": 16650 + }, + { + "epoch": 0.07152486197333059, + "grad_norm": 3.0805158615112305, + "learning_rate": 9.324827746781302e-05, + "loss": 0.33733808994293213, + "step": 16660 + }, + { + "epoch": 0.07156779406335059, + "grad_norm": 9.202255249023438, + "learning_rate": 9.32439657476954e-05, + "loss": 0.339400577545166, + "step": 16670 + }, + { + "epoch": 0.0716107261533706, + "grad_norm": 0.3002989590167999, + "learning_rate": 9.323965402757776e-05, + "loss": 0.38298094272613525, + "step": 16680 + }, + { + "epoch": 0.07165365824339061, + "grad_norm": 1.8727099895477295, + "learning_rate": 9.323534230746014e-05, + "loss": 0.48968868255615233, + "step": 16690 + }, + { + "epoch": 0.07169659033341061, + "grad_norm": 0.329200804233551, + "learning_rate": 9.323103058734252e-05, + "loss": 0.3012394905090332, + "step": 16700 + }, + { + "epoch": 0.07173952242343062, + "grad_norm": 0.5954709053039551, + "learning_rate": 9.32267188672249e-05, + "loss": 0.0898634910583496, + "step": 16710 + }, + { + "epoch": 0.07178245451345062, + "grad_norm": 0.6996222734451294, + "learning_rate": 9.322240714710727e-05, + "loss": 0.27539935111999514, + "step": 16720 + }, + { + "epoch": 0.07182538660347063, + "grad_norm": 0.015904199331998825, + "learning_rate": 9.321809542698965e-05, + "loss": 0.3184134244918823, + "step": 16730 + }, + { + "epoch": 0.07186831869349064, + "grad_norm": 1.6481454372406006, + "learning_rate": 9.321378370687203e-05, + "loss": 0.34613637924194335, + "step": 16740 + }, + { + "epoch": 0.07191125078351064, + "grad_norm": 0.08801378309726715, + "learning_rate": 9.32094719867544e-05, + "loss": 0.3641823768615723, + "step": 16750 + }, + { + "epoch": 0.07195418287353066, + "grad_norm": 0.03433467075228691, + "learning_rate": 9.320516026663678e-05, + "loss": 0.2840471029281616, + "step": 16760 + }, + { + "epoch": 0.07199711496355066, + "grad_norm": 0.033972445875406265, + "learning_rate": 9.320084854651914e-05, + "loss": 0.1771819233894348, + "step": 16770 + }, + { + "epoch": 0.07204004705357066, + "grad_norm": 1.2626923322677612, + "learning_rate": 9.319653682640152e-05, + "loss": 0.28968324661254885, + "step": 16780 + }, + { + "epoch": 0.07208297914359067, + "grad_norm": 0.09417697042226791, + "learning_rate": 9.31922251062839e-05, + "loss": 0.15749802589416503, + "step": 16790 + }, + { + "epoch": 0.07212591123361067, + "grad_norm": 0.062441833317279816, + "learning_rate": 9.318791338616628e-05, + "loss": 0.26803529262542725, + "step": 16800 + }, + { + "epoch": 0.07216884332363067, + "grad_norm": 2.0154871940612793, + "learning_rate": 9.318360166604865e-05, + "loss": 0.4345219135284424, + "step": 16810 + }, + { + "epoch": 0.07221177541365069, + "grad_norm": 2.229771614074707, + "learning_rate": 9.317928994593103e-05, + "loss": 0.1976357340812683, + "step": 16820 + }, + { + "epoch": 0.07225470750367069, + "grad_norm": 0.26853519678115845, + "learning_rate": 9.317497822581341e-05, + "loss": 0.2845271587371826, + "step": 16830 + }, + { + "epoch": 0.0722976395936907, + "grad_norm": 0.0032490019220858812, + "learning_rate": 9.317066650569579e-05, + "loss": 0.42137975692749025, + "step": 16840 + }, + { + "epoch": 0.0723405716837107, + "grad_norm": 0.004822635091841221, + "learning_rate": 9.316635478557816e-05, + "loss": 0.377595853805542, + "step": 16850 + }, + { + "epoch": 0.0723835037737307, + "grad_norm": 0.10645035654306412, + "learning_rate": 9.316204306546054e-05, + "loss": 0.15568927526474, + "step": 16860 + }, + { + "epoch": 0.07242643586375072, + "grad_norm": 0.015432666055858135, + "learning_rate": 9.315773134534292e-05, + "loss": 0.36362059116363527, + "step": 16870 + }, + { + "epoch": 0.07246936795377072, + "grad_norm": 0.017693661153316498, + "learning_rate": 9.31534196252253e-05, + "loss": 0.2413787603378296, + "step": 16880 + }, + { + "epoch": 0.07251230004379074, + "grad_norm": 0.15146492421627045, + "learning_rate": 9.314910790510767e-05, + "loss": 0.237416672706604, + "step": 16890 + }, + { + "epoch": 0.07255523213381074, + "grad_norm": 4.291798114776611, + "learning_rate": 9.314479618499005e-05, + "loss": 0.4450693130493164, + "step": 16900 + }, + { + "epoch": 0.07259816422383074, + "grad_norm": 1.8319154977798462, + "learning_rate": 9.314048446487243e-05, + "loss": 0.33154332637786865, + "step": 16910 + }, + { + "epoch": 0.07264109631385075, + "grad_norm": 0.23402829468250275, + "learning_rate": 9.31361727447548e-05, + "loss": 0.21158668994903565, + "step": 16920 + }, + { + "epoch": 0.07268402840387075, + "grad_norm": 0.9586695432662964, + "learning_rate": 9.313186102463717e-05, + "loss": 0.2537838935852051, + "step": 16930 + }, + { + "epoch": 0.07272696049389077, + "grad_norm": 0.7257668375968933, + "learning_rate": 9.312754930451955e-05, + "loss": 0.4206377983093262, + "step": 16940 + }, + { + "epoch": 0.07276989258391077, + "grad_norm": 1.5806889533996582, + "learning_rate": 9.312323758440192e-05, + "loss": 0.20530943870544432, + "step": 16950 + }, + { + "epoch": 0.07281282467393077, + "grad_norm": 0.2541026771068573, + "learning_rate": 9.31189258642843e-05, + "loss": 0.40438618659973147, + "step": 16960 + }, + { + "epoch": 0.07285575676395079, + "grad_norm": 2.763205051422119, + "learning_rate": 9.311461414416668e-05, + "loss": 0.41117844581604, + "step": 16970 + }, + { + "epoch": 0.07289868885397079, + "grad_norm": 0.3387240767478943, + "learning_rate": 9.311030242404905e-05, + "loss": 0.32136220932006837, + "step": 16980 + }, + { + "epoch": 0.0729416209439908, + "grad_norm": 0.4019925594329834, + "learning_rate": 9.310599070393143e-05, + "loss": 0.2839751958847046, + "step": 16990 + }, + { + "epoch": 0.0729845530340108, + "grad_norm": 0.6113116145133972, + "learning_rate": 9.310167898381381e-05, + "loss": 0.2559647798538208, + "step": 17000 + }, + { + "epoch": 0.0729845530340108, + "eval_loss": 0.5012018084526062, + "eval_runtime": 27.5411, + "eval_samples_per_second": 3.631, + "eval_steps_per_second": 3.631, + "step": 17000 + }, + { + "epoch": 0.0730274851240308, + "grad_norm": 0.06289585679769516, + "learning_rate": 9.309736726369617e-05, + "loss": 0.27092506885528567, + "step": 17010 + }, + { + "epoch": 0.07307041721405082, + "grad_norm": 3.7301859855651855, + "learning_rate": 9.309305554357855e-05, + "loss": 0.289726996421814, + "step": 17020 + }, + { + "epoch": 0.07311334930407082, + "grad_norm": 0.07134075462818146, + "learning_rate": 9.308874382346093e-05, + "loss": 0.15513920783996582, + "step": 17030 + }, + { + "epoch": 0.07315628139409083, + "grad_norm": 1.0717676877975464, + "learning_rate": 9.30844321033433e-05, + "loss": 0.2019169569015503, + "step": 17040 + }, + { + "epoch": 0.07319921348411083, + "grad_norm": 1.1874752044677734, + "learning_rate": 9.308012038322568e-05, + "loss": 0.346480655670166, + "step": 17050 + }, + { + "epoch": 0.07324214557413083, + "grad_norm": 0.29725155234336853, + "learning_rate": 9.307580866310806e-05, + "loss": 0.2214757204055786, + "step": 17060 + }, + { + "epoch": 0.07328507766415085, + "grad_norm": 0.0174336489289999, + "learning_rate": 9.307149694299044e-05, + "loss": 0.19202833175659179, + "step": 17070 + }, + { + "epoch": 0.07332800975417085, + "grad_norm": 2.515242338180542, + "learning_rate": 9.306718522287281e-05, + "loss": 0.42656803131103516, + "step": 17080 + }, + { + "epoch": 0.07337094184419087, + "grad_norm": 0.02410704828798771, + "learning_rate": 9.30628735027552e-05, + "loss": 0.15156646966934204, + "step": 17090 + }, + { + "epoch": 0.07341387393421087, + "grad_norm": 7.142855167388916, + "learning_rate": 9.305856178263757e-05, + "loss": 0.31276555061340333, + "step": 17100 + }, + { + "epoch": 0.07345680602423087, + "grad_norm": 8.62886905670166, + "learning_rate": 9.305425006251995e-05, + "loss": 0.3314593553543091, + "step": 17110 + }, + { + "epoch": 0.07349973811425088, + "grad_norm": 0.042989179491996765, + "learning_rate": 9.304993834240232e-05, + "loss": 0.1726578950881958, + "step": 17120 + }, + { + "epoch": 0.07354267020427088, + "grad_norm": 0.3412798345088959, + "learning_rate": 9.30456266222847e-05, + "loss": 0.2019892692565918, + "step": 17130 + }, + { + "epoch": 0.0735856022942909, + "grad_norm": 0.10631978511810303, + "learning_rate": 9.304131490216708e-05, + "loss": 0.32094345092773435, + "step": 17140 + }, + { + "epoch": 0.0736285343843109, + "grad_norm": 1.4753174781799316, + "learning_rate": 9.303700318204946e-05, + "loss": 0.3164651393890381, + "step": 17150 + }, + { + "epoch": 0.0736714664743309, + "grad_norm": 1.849280595779419, + "learning_rate": 9.303269146193183e-05, + "loss": 0.3136913537979126, + "step": 17160 + }, + { + "epoch": 0.07371439856435091, + "grad_norm": 0.006545449141412973, + "learning_rate": 9.302837974181421e-05, + "loss": 0.2594448566436768, + "step": 17170 + }, + { + "epoch": 0.07375733065437091, + "grad_norm": 0.0032706542406231165, + "learning_rate": 9.302406802169657e-05, + "loss": 0.13361896276474, + "step": 17180 + }, + { + "epoch": 0.07380026274439093, + "grad_norm": 0.02022142894566059, + "learning_rate": 9.301975630157895e-05, + "loss": 0.030848246812820435, + "step": 17190 + }, + { + "epoch": 0.07384319483441093, + "grad_norm": 0.004030085168778896, + "learning_rate": 9.301544458146133e-05, + "loss": 0.2393047332763672, + "step": 17200 + }, + { + "epoch": 0.07388612692443093, + "grad_norm": 0.04448021203279495, + "learning_rate": 9.30111328613437e-05, + "loss": 0.12692539691925048, + "step": 17210 + }, + { + "epoch": 0.07392905901445095, + "grad_norm": 0.10861719399690628, + "learning_rate": 9.300682114122608e-05, + "loss": 0.31166269779205324, + "step": 17220 + }, + { + "epoch": 0.07397199110447095, + "grad_norm": 0.00539032556116581, + "learning_rate": 9.300250942110846e-05, + "loss": 0.08787255883216857, + "step": 17230 + }, + { + "epoch": 0.07401492319449095, + "grad_norm": 1.3360871076583862, + "learning_rate": 9.299819770099084e-05, + "loss": 0.3009865045547485, + "step": 17240 + }, + { + "epoch": 0.07405785528451096, + "grad_norm": 0.24125048518180847, + "learning_rate": 9.299388598087322e-05, + "loss": 0.18031530380249022, + "step": 17250 + }, + { + "epoch": 0.07410078737453096, + "grad_norm": 0.004599709529429674, + "learning_rate": 9.298957426075558e-05, + "loss": 0.34679040908813474, + "step": 17260 + }, + { + "epoch": 0.07414371946455098, + "grad_norm": 5.64448881149292, + "learning_rate": 9.298526254063796e-05, + "loss": 0.20970757007598878, + "step": 17270 + }, + { + "epoch": 0.07418665155457098, + "grad_norm": 0.8933638334274292, + "learning_rate": 9.298095082052033e-05, + "loss": 0.2926939487457275, + "step": 17280 + }, + { + "epoch": 0.07422958364459098, + "grad_norm": 1.7648670673370361, + "learning_rate": 9.297663910040271e-05, + "loss": 0.3770651578903198, + "step": 17290 + }, + { + "epoch": 0.074272515734611, + "grad_norm": 6.239072322845459, + "learning_rate": 9.297232738028509e-05, + "loss": 0.2613657474517822, + "step": 17300 + }, + { + "epoch": 0.074315447824631, + "grad_norm": 0.22732043266296387, + "learning_rate": 9.296801566016748e-05, + "loss": 0.33427302837371825, + "step": 17310 + }, + { + "epoch": 0.07435837991465101, + "grad_norm": 0.988491415977478, + "learning_rate": 9.296370394004986e-05, + "loss": 0.11933293342590331, + "step": 17320 + }, + { + "epoch": 0.07440131200467101, + "grad_norm": 0.3913913369178772, + "learning_rate": 9.295939221993223e-05, + "loss": 0.2547286033630371, + "step": 17330 + }, + { + "epoch": 0.07444424409469101, + "grad_norm": 0.3429929316043854, + "learning_rate": 9.29550804998146e-05, + "loss": 0.30628602504730223, + "step": 17340 + }, + { + "epoch": 0.07448717618471103, + "grad_norm": 5.126238822937012, + "learning_rate": 9.295076877969698e-05, + "loss": 0.3914073944091797, + "step": 17350 + }, + { + "epoch": 0.07453010827473103, + "grad_norm": 0.3128284215927124, + "learning_rate": 9.294645705957935e-05, + "loss": 0.22532930374145507, + "step": 17360 + }, + { + "epoch": 0.07457304036475104, + "grad_norm": 0.21137411892414093, + "learning_rate": 9.294214533946173e-05, + "loss": 0.27229340076446534, + "step": 17370 + }, + { + "epoch": 0.07461597245477104, + "grad_norm": 1.3947068452835083, + "learning_rate": 9.293783361934411e-05, + "loss": 0.19315674304962158, + "step": 17380 + }, + { + "epoch": 0.07465890454479104, + "grad_norm": 1.3445826768875122, + "learning_rate": 9.293352189922649e-05, + "loss": 0.44831433296203616, + "step": 17390 + }, + { + "epoch": 0.07470183663481106, + "grad_norm": 1.0823842287063599, + "learning_rate": 9.292921017910886e-05, + "loss": 0.1613788366317749, + "step": 17400 + }, + { + "epoch": 0.07474476872483106, + "grad_norm": 0.2984132170677185, + "learning_rate": 9.292489845899124e-05, + "loss": 0.18345820903778076, + "step": 17410 + }, + { + "epoch": 0.07478770081485107, + "grad_norm": 1.5152897834777832, + "learning_rate": 9.292058673887362e-05, + "loss": 0.29555258750915525, + "step": 17420 + }, + { + "epoch": 0.07483063290487107, + "grad_norm": 0.08002516627311707, + "learning_rate": 9.291627501875598e-05, + "loss": 0.29400632381439207, + "step": 17430 + }, + { + "epoch": 0.07487356499489108, + "grad_norm": 0.10808016359806061, + "learning_rate": 9.291196329863836e-05, + "loss": 0.2884323835372925, + "step": 17440 + }, + { + "epoch": 0.07491649708491109, + "grad_norm": 0.11248364299535751, + "learning_rate": 9.290765157852074e-05, + "loss": 0.0785040020942688, + "step": 17450 + }, + { + "epoch": 0.07495942917493109, + "grad_norm": 0.34658434987068176, + "learning_rate": 9.290333985840311e-05, + "loss": 0.33166520595550536, + "step": 17460 + }, + { + "epoch": 0.0750023612649511, + "grad_norm": 0.7928297519683838, + "learning_rate": 9.289902813828549e-05, + "loss": 0.3397735834121704, + "step": 17470 + }, + { + "epoch": 0.07504529335497111, + "grad_norm": 0.6116107106208801, + "learning_rate": 9.289471641816787e-05, + "loss": 0.04122519195079803, + "step": 17480 + }, + { + "epoch": 0.07508822544499111, + "grad_norm": 0.1864428073167801, + "learning_rate": 9.289040469805025e-05, + "loss": 0.18004008531570434, + "step": 17490 + }, + { + "epoch": 0.07513115753501112, + "grad_norm": 0.0013322310987859964, + "learning_rate": 9.288609297793262e-05, + "loss": 0.10974700450897217, + "step": 17500 + }, + { + "epoch": 0.07517408962503112, + "grad_norm": 0.008626694791018963, + "learning_rate": 9.288178125781499e-05, + "loss": 0.2750270128250122, + "step": 17510 + }, + { + "epoch": 0.07521702171505114, + "grad_norm": 0.013454783707857132, + "learning_rate": 9.287746953769736e-05, + "loss": 0.1500556468963623, + "step": 17520 + }, + { + "epoch": 0.07525995380507114, + "grad_norm": 1.009264349937439, + "learning_rate": 9.287315781757975e-05, + "loss": 0.15861928462982178, + "step": 17530 + }, + { + "epoch": 0.07530288589509114, + "grad_norm": 3.1030020713806152, + "learning_rate": 9.286884609746213e-05, + "loss": 0.32444114685058595, + "step": 17540 + }, + { + "epoch": 0.07534581798511115, + "grad_norm": 0.42069998383522034, + "learning_rate": 9.286453437734451e-05, + "loss": 0.23469743728637696, + "step": 17550 + }, + { + "epoch": 0.07538875007513116, + "grad_norm": 0.8916969299316406, + "learning_rate": 9.286022265722689e-05, + "loss": 0.251232647895813, + "step": 17560 + }, + { + "epoch": 0.07543168216515117, + "grad_norm": 0.0047593554481863976, + "learning_rate": 9.285591093710926e-05, + "loss": 0.39376943111419677, + "step": 17570 + }, + { + "epoch": 0.07547461425517117, + "grad_norm": 2.4464917182922363, + "learning_rate": 9.285159921699164e-05, + "loss": 0.27484569549560545, + "step": 17580 + }, + { + "epoch": 0.07551754634519117, + "grad_norm": 2.3647587299346924, + "learning_rate": 9.2847287496874e-05, + "loss": 0.35099794864654543, + "step": 17590 + }, + { + "epoch": 0.07556047843521119, + "grad_norm": 4.215408802032471, + "learning_rate": 9.284297577675638e-05, + "loss": 0.21538822650909423, + "step": 17600 + }, + { + "epoch": 0.07560341052523119, + "grad_norm": 0.03410341590642929, + "learning_rate": 9.283866405663876e-05, + "loss": 0.18266010284423828, + "step": 17610 + }, + { + "epoch": 0.0756463426152512, + "grad_norm": 6.214269638061523, + "learning_rate": 9.283435233652114e-05, + "loss": 0.42656545639038085, + "step": 17620 + }, + { + "epoch": 0.0756892747052712, + "grad_norm": 3.918022871017456, + "learning_rate": 9.283004061640351e-05, + "loss": 0.14886451959609986, + "step": 17630 + }, + { + "epoch": 0.0757322067952912, + "grad_norm": 0.05858919024467468, + "learning_rate": 9.282572889628589e-05, + "loss": 0.48270864486694337, + "step": 17640 + }, + { + "epoch": 0.07577513888531122, + "grad_norm": 22.52327537536621, + "learning_rate": 9.282141717616827e-05, + "loss": 0.35639214515686035, + "step": 17650 + }, + { + "epoch": 0.07581807097533122, + "grad_norm": 0.14872047305107117, + "learning_rate": 9.281710545605065e-05, + "loss": 0.480879545211792, + "step": 17660 + }, + { + "epoch": 0.07586100306535122, + "grad_norm": 2.3057806491851807, + "learning_rate": 9.281279373593301e-05, + "loss": 0.2604722023010254, + "step": 17670 + }, + { + "epoch": 0.07590393515537124, + "grad_norm": 0.18835517764091492, + "learning_rate": 9.280848201581539e-05, + "loss": 0.17890266180038453, + "step": 17680 + }, + { + "epoch": 0.07594686724539124, + "grad_norm": 0.07281363755464554, + "learning_rate": 9.280417029569776e-05, + "loss": 0.4239158630371094, + "step": 17690 + }, + { + "epoch": 0.07598979933541125, + "grad_norm": 9.378323554992676, + "learning_rate": 9.279985857558014e-05, + "loss": 0.19697673320770265, + "step": 17700 + }, + { + "epoch": 0.07603273142543125, + "grad_norm": 0.8428283333778381, + "learning_rate": 9.279554685546252e-05, + "loss": 0.234600830078125, + "step": 17710 + }, + { + "epoch": 0.07607566351545125, + "grad_norm": 0.11949367076158524, + "learning_rate": 9.27912351353449e-05, + "loss": 0.4152794361114502, + "step": 17720 + }, + { + "epoch": 0.07611859560547127, + "grad_norm": 0.05498175323009491, + "learning_rate": 9.278692341522727e-05, + "loss": 0.29344398975372316, + "step": 17730 + }, + { + "epoch": 0.07616152769549127, + "grad_norm": 0.007519569247961044, + "learning_rate": 9.278261169510965e-05, + "loss": 0.14354888200759888, + "step": 17740 + }, + { + "epoch": 0.07620445978551128, + "grad_norm": 2.0580191612243652, + "learning_rate": 9.277829997499203e-05, + "loss": 0.3214442729949951, + "step": 17750 + }, + { + "epoch": 0.07624739187553128, + "grad_norm": 0.21962420642375946, + "learning_rate": 9.27739882548744e-05, + "loss": 0.1612050414085388, + "step": 17760 + }, + { + "epoch": 0.07629032396555128, + "grad_norm": 0.6961410641670227, + "learning_rate": 9.276967653475678e-05, + "loss": 0.3876644134521484, + "step": 17770 + }, + { + "epoch": 0.0763332560555713, + "grad_norm": 0.548738420009613, + "learning_rate": 9.276536481463916e-05, + "loss": 0.28008925914764404, + "step": 17780 + }, + { + "epoch": 0.0763761881455913, + "grad_norm": 1.0258835554122925, + "learning_rate": 9.276105309452154e-05, + "loss": 0.2834671974182129, + "step": 17790 + }, + { + "epoch": 0.07641912023561132, + "grad_norm": 2.0303940773010254, + "learning_rate": 9.275674137440392e-05, + "loss": 0.37418713569641116, + "step": 17800 + }, + { + "epoch": 0.07646205232563132, + "grad_norm": 1.1370363235473633, + "learning_rate": 9.275242965428629e-05, + "loss": 0.3102121353149414, + "step": 17810 + }, + { + "epoch": 0.07650498441565132, + "grad_norm": 1.8728364706039429, + "learning_rate": 9.274811793416867e-05, + "loss": 0.13144179582595825, + "step": 17820 + }, + { + "epoch": 0.07654791650567133, + "grad_norm": 1.3970445394515991, + "learning_rate": 9.274380621405105e-05, + "loss": 0.27070481777191163, + "step": 17830 + }, + { + "epoch": 0.07659084859569133, + "grad_norm": 1.4733984470367432, + "learning_rate": 9.273949449393341e-05, + "loss": 0.15226986408233642, + "step": 17840 + }, + { + "epoch": 0.07663378068571135, + "grad_norm": 0.010169444605708122, + "learning_rate": 9.273518277381579e-05, + "loss": 0.2343198299407959, + "step": 17850 + }, + { + "epoch": 0.07667671277573135, + "grad_norm": 0.03013860061764717, + "learning_rate": 9.273087105369817e-05, + "loss": 0.1993506908416748, + "step": 17860 + }, + { + "epoch": 0.07671964486575135, + "grad_norm": 0.0857686772942543, + "learning_rate": 9.272655933358054e-05, + "loss": 0.13797582387924195, + "step": 17870 + }, + { + "epoch": 0.07676257695577136, + "grad_norm": 1.5063636302947998, + "learning_rate": 9.272224761346292e-05, + "loss": 0.19858624935150146, + "step": 17880 + }, + { + "epoch": 0.07680550904579136, + "grad_norm": 0.0022428890224546194, + "learning_rate": 9.27179358933453e-05, + "loss": 0.39183568954467773, + "step": 17890 + }, + { + "epoch": 0.07684844113581138, + "grad_norm": 1.1863030195236206, + "learning_rate": 9.271362417322768e-05, + "loss": 0.47901058197021484, + "step": 17900 + }, + { + "epoch": 0.07689137322583138, + "grad_norm": 0.13905732333660126, + "learning_rate": 9.270931245311005e-05, + "loss": 0.2854343891143799, + "step": 17910 + }, + { + "epoch": 0.07693430531585138, + "grad_norm": 0.03768673911690712, + "learning_rate": 9.270500073299242e-05, + "loss": 0.24338369369506835, + "step": 17920 + }, + { + "epoch": 0.0769772374058714, + "grad_norm": 0.04136132448911667, + "learning_rate": 9.27006890128748e-05, + "loss": 0.15599859952926637, + "step": 17930 + }, + { + "epoch": 0.0770201694958914, + "grad_norm": 0.2402133345603943, + "learning_rate": 9.269637729275717e-05, + "loss": 0.2983910799026489, + "step": 17940 + }, + { + "epoch": 0.07706310158591141, + "grad_norm": 0.00554937357082963, + "learning_rate": 9.269206557263955e-05, + "loss": 0.11619760990142822, + "step": 17950 + }, + { + "epoch": 0.07710603367593141, + "grad_norm": 1.5368311405181885, + "learning_rate": 9.268775385252193e-05, + "loss": 0.21799736022949218, + "step": 17960 + }, + { + "epoch": 0.07714896576595141, + "grad_norm": 1.8025269508361816, + "learning_rate": 9.26834421324043e-05, + "loss": 0.33750646114349364, + "step": 17970 + }, + { + "epoch": 0.07719189785597143, + "grad_norm": 0.08740323036909103, + "learning_rate": 9.267913041228668e-05, + "loss": 0.23000924587249755, + "step": 17980 + }, + { + "epoch": 0.07723482994599143, + "grad_norm": 2.418018341064453, + "learning_rate": 9.267481869216906e-05, + "loss": 0.21157240867614746, + "step": 17990 + }, + { + "epoch": 0.07727776203601144, + "grad_norm": 1.756937026977539, + "learning_rate": 9.267050697205144e-05, + "loss": 0.42026352882385254, + "step": 18000 + }, + { + "epoch": 0.07727776203601144, + "eval_loss": 0.49356770515441895, + "eval_runtime": 27.51, + "eval_samples_per_second": 3.635, + "eval_steps_per_second": 3.635, + "step": 18000 + }, + { + "epoch": 0.07732069412603144, + "grad_norm": 0.012592796236276627, + "learning_rate": 9.266619525193381e-05, + "loss": 0.015883392095565795, + "step": 18010 + }, + { + "epoch": 0.07736362621605145, + "grad_norm": 0.0440199077129364, + "learning_rate": 9.266188353181619e-05, + "loss": 0.3735236167907715, + "step": 18020 + }, + { + "epoch": 0.07740655830607146, + "grad_norm": 0.01626587100327015, + "learning_rate": 9.265757181169857e-05, + "loss": 0.1956225872039795, + "step": 18030 + }, + { + "epoch": 0.07744949039609146, + "grad_norm": 1.1902430057525635, + "learning_rate": 9.265326009158094e-05, + "loss": 0.4065211772918701, + "step": 18040 + }, + { + "epoch": 0.07749242248611148, + "grad_norm": 0.0007542611565440893, + "learning_rate": 9.264894837146332e-05, + "loss": 0.32700634002685547, + "step": 18050 + }, + { + "epoch": 0.07753535457613148, + "grad_norm": 2.0431017875671387, + "learning_rate": 9.26446366513457e-05, + "loss": 0.3908370494842529, + "step": 18060 + }, + { + "epoch": 0.07757828666615148, + "grad_norm": 1.1069626808166504, + "learning_rate": 9.264032493122808e-05, + "loss": 0.555892038345337, + "step": 18070 + }, + { + "epoch": 0.07762121875617149, + "grad_norm": 0.035903044044971466, + "learning_rate": 9.263601321111044e-05, + "loss": 0.0531062126159668, + "step": 18080 + }, + { + "epoch": 0.0776641508461915, + "grad_norm": 0.05693231150507927, + "learning_rate": 9.263170149099282e-05, + "loss": 0.09009885191917419, + "step": 18090 + }, + { + "epoch": 0.0777070829362115, + "grad_norm": 0.026545345783233643, + "learning_rate": 9.26273897708752e-05, + "loss": 0.29533431529998777, + "step": 18100 + }, + { + "epoch": 0.07775001502623151, + "grad_norm": 0.12718848884105682, + "learning_rate": 9.262307805075757e-05, + "loss": 0.13758124113082887, + "step": 18110 + }, + { + "epoch": 0.07779294711625151, + "grad_norm": 1.6255548000335693, + "learning_rate": 9.261876633063995e-05, + "loss": 0.2296905994415283, + "step": 18120 + }, + { + "epoch": 0.07783587920627152, + "grad_norm": 0.0008108015172183514, + "learning_rate": 9.261445461052233e-05, + "loss": 0.2890913963317871, + "step": 18130 + }, + { + "epoch": 0.07787881129629153, + "grad_norm": 1.9627443552017212, + "learning_rate": 9.26101428904047e-05, + "loss": 0.3923152446746826, + "step": 18140 + }, + { + "epoch": 0.07792174338631153, + "grad_norm": 0.3513317406177521, + "learning_rate": 9.260583117028708e-05, + "loss": 0.10352857112884521, + "step": 18150 + }, + { + "epoch": 0.07796467547633154, + "grad_norm": 0.09808113425970078, + "learning_rate": 9.260151945016946e-05, + "loss": 0.17033973932266236, + "step": 18160 + }, + { + "epoch": 0.07800760756635154, + "grad_norm": 1.067866325378418, + "learning_rate": 9.259720773005182e-05, + "loss": 0.21410031318664552, + "step": 18170 + }, + { + "epoch": 0.07805053965637156, + "grad_norm": 1.5204136371612549, + "learning_rate": 9.25928960099342e-05, + "loss": 0.1276358723640442, + "step": 18180 + }, + { + "epoch": 0.07809347174639156, + "grad_norm": 0.0007020228658802807, + "learning_rate": 9.258858428981658e-05, + "loss": 0.22528154850006105, + "step": 18190 + }, + { + "epoch": 0.07813640383641156, + "grad_norm": 0.009184078313410282, + "learning_rate": 9.258427256969896e-05, + "loss": 0.2669683456420898, + "step": 18200 + }, + { + "epoch": 0.07817933592643157, + "grad_norm": 0.16662456095218658, + "learning_rate": 9.257996084958133e-05, + "loss": 0.2813804864883423, + "step": 18210 + }, + { + "epoch": 0.07822226801645157, + "grad_norm": 31.754150390625, + "learning_rate": 9.257564912946371e-05, + "loss": 0.3360316514968872, + "step": 18220 + }, + { + "epoch": 0.07826520010647159, + "grad_norm": 2.000892400741577, + "learning_rate": 9.257133740934609e-05, + "loss": 0.28811111450195315, + "step": 18230 + }, + { + "epoch": 0.07830813219649159, + "grad_norm": 0.47206950187683105, + "learning_rate": 9.256702568922846e-05, + "loss": 0.13355579376220703, + "step": 18240 + }, + { + "epoch": 0.07835106428651159, + "grad_norm": 0.0003992223646491766, + "learning_rate": 9.256271396911084e-05, + "loss": 0.22010478973388672, + "step": 18250 + }, + { + "epoch": 0.0783939963765316, + "grad_norm": 0.0260959193110466, + "learning_rate": 9.255840224899322e-05, + "loss": 0.13738093376159669, + "step": 18260 + }, + { + "epoch": 0.0784369284665516, + "grad_norm": 1.5283880233764648, + "learning_rate": 9.25540905288756e-05, + "loss": 0.3932394027709961, + "step": 18270 + }, + { + "epoch": 0.07847986055657162, + "grad_norm": 0.7194616794586182, + "learning_rate": 9.254977880875797e-05, + "loss": 0.22366058826446533, + "step": 18280 + }, + { + "epoch": 0.07852279264659162, + "grad_norm": 6.801660537719727, + "learning_rate": 9.254546708864035e-05, + "loss": 0.3743177652359009, + "step": 18290 + }, + { + "epoch": 0.07856572473661162, + "grad_norm": 0.004187764599919319, + "learning_rate": 9.254115536852273e-05, + "loss": 0.09611039161682129, + "step": 18300 + }, + { + "epoch": 0.07860865682663164, + "grad_norm": 0.01413058489561081, + "learning_rate": 9.25368436484051e-05, + "loss": 0.21443462371826172, + "step": 18310 + }, + { + "epoch": 0.07865158891665164, + "grad_norm": 0.06668446213006973, + "learning_rate": 9.253253192828748e-05, + "loss": 0.3364569902420044, + "step": 18320 + }, + { + "epoch": 0.07869452100667165, + "grad_norm": 0.010097499936819077, + "learning_rate": 9.252822020816985e-05, + "loss": 0.2553986072540283, + "step": 18330 + }, + { + "epoch": 0.07873745309669165, + "grad_norm": 0.10130496323108673, + "learning_rate": 9.252390848805222e-05, + "loss": 0.44827828407287595, + "step": 18340 + }, + { + "epoch": 0.07878038518671165, + "grad_norm": 0.970496654510498, + "learning_rate": 9.25195967679346e-05, + "loss": 0.2125465154647827, + "step": 18350 + }, + { + "epoch": 0.07882331727673167, + "grad_norm": 0.12793606519699097, + "learning_rate": 9.251528504781698e-05, + "loss": 0.3075400829315186, + "step": 18360 + }, + { + "epoch": 0.07886624936675167, + "grad_norm": 0.03173859789967537, + "learning_rate": 9.251097332769936e-05, + "loss": 0.13301374912261962, + "step": 18370 + }, + { + "epoch": 0.07890918145677168, + "grad_norm": 0.9702137112617493, + "learning_rate": 9.250666160758173e-05, + "loss": 0.501566219329834, + "step": 18380 + }, + { + "epoch": 0.07895211354679169, + "grad_norm": 1.6415718793869019, + "learning_rate": 9.250234988746411e-05, + "loss": 0.4016451358795166, + "step": 18390 + }, + { + "epoch": 0.07899504563681169, + "grad_norm": 0.0025606367271393538, + "learning_rate": 9.249803816734649e-05, + "loss": 0.18235890865325927, + "step": 18400 + }, + { + "epoch": 0.0790379777268317, + "grad_norm": 1.0034633874893188, + "learning_rate": 9.249372644722885e-05, + "loss": 0.37052345275878906, + "step": 18410 + }, + { + "epoch": 0.0790809098168517, + "grad_norm": 1.0105383396148682, + "learning_rate": 9.248941472711123e-05, + "loss": 0.1658101797103882, + "step": 18420 + }, + { + "epoch": 0.07912384190687172, + "grad_norm": 0.20500674843788147, + "learning_rate": 9.248510300699361e-05, + "loss": 0.07586517333984374, + "step": 18430 + }, + { + "epoch": 0.07916677399689172, + "grad_norm": 2.6038026809692383, + "learning_rate": 9.248079128687598e-05, + "loss": 0.3654672384262085, + "step": 18440 + }, + { + "epoch": 0.07920970608691172, + "grad_norm": 1.0341541767120361, + "learning_rate": 9.247647956675836e-05, + "loss": 0.18789979219436645, + "step": 18450 + }, + { + "epoch": 0.07925263817693173, + "grad_norm": 0.005500131286680698, + "learning_rate": 9.247216784664074e-05, + "loss": 0.25736103057861326, + "step": 18460 + }, + { + "epoch": 0.07929557026695173, + "grad_norm": 3.110564708709717, + "learning_rate": 9.246785612652312e-05, + "loss": 0.3830663442611694, + "step": 18470 + }, + { + "epoch": 0.07933850235697175, + "grad_norm": 27.515901565551758, + "learning_rate": 9.24635444064055e-05, + "loss": 0.2711588621139526, + "step": 18480 + }, + { + "epoch": 0.07938143444699175, + "grad_norm": 0.05192455276846886, + "learning_rate": 9.245923268628787e-05, + "loss": 0.14970123767852783, + "step": 18490 + }, + { + "epoch": 0.07942436653701175, + "grad_norm": 1.9534826278686523, + "learning_rate": 9.245492096617025e-05, + "loss": 0.41820321083068845, + "step": 18500 + }, + { + "epoch": 0.07946729862703177, + "grad_norm": 0.18882907927036285, + "learning_rate": 9.245060924605263e-05, + "loss": 0.10666049718856811, + "step": 18510 + }, + { + "epoch": 0.07951023071705177, + "grad_norm": 0.5181287527084351, + "learning_rate": 9.2446297525935e-05, + "loss": 0.2748467445373535, + "step": 18520 + }, + { + "epoch": 0.07955316280707177, + "grad_norm": 0.07303071022033691, + "learning_rate": 9.244198580581738e-05, + "loss": 0.11560168266296386, + "step": 18530 + }, + { + "epoch": 0.07959609489709178, + "grad_norm": 0.2730436623096466, + "learning_rate": 9.243767408569976e-05, + "loss": 0.14908556938171386, + "step": 18540 + }, + { + "epoch": 0.07963902698711178, + "grad_norm": 6.777319431304932, + "learning_rate": 9.243336236558214e-05, + "loss": 0.21021676063537598, + "step": 18550 + }, + { + "epoch": 0.0796819590771318, + "grad_norm": 2.380194664001465, + "learning_rate": 9.242905064546451e-05, + "loss": 0.30106852054595945, + "step": 18560 + }, + { + "epoch": 0.0797248911671518, + "grad_norm": 1.9388470649719238, + "learning_rate": 9.242473892534689e-05, + "loss": 0.20236554145812988, + "step": 18570 + }, + { + "epoch": 0.0797678232571718, + "grad_norm": 3.202451467514038, + "learning_rate": 9.242042720522925e-05, + "loss": 0.2923529863357544, + "step": 18580 + }, + { + "epoch": 0.07981075534719181, + "grad_norm": 1.1814517974853516, + "learning_rate": 9.241611548511163e-05, + "loss": 0.35280370712280273, + "step": 18590 + }, + { + "epoch": 0.07985368743721181, + "grad_norm": 2.723605155944824, + "learning_rate": 9.241180376499401e-05, + "loss": 0.31783127784729004, + "step": 18600 + }, + { + "epoch": 0.07989661952723183, + "grad_norm": 0.038898617029190063, + "learning_rate": 9.240749204487639e-05, + "loss": 0.35399770736694336, + "step": 18610 + }, + { + "epoch": 0.07993955161725183, + "grad_norm": 0.9012385010719299, + "learning_rate": 9.240318032475876e-05, + "loss": 0.29789721965789795, + "step": 18620 + }, + { + "epoch": 0.07998248370727183, + "grad_norm": 0.15400391817092896, + "learning_rate": 9.239886860464114e-05, + "loss": 0.1943342924118042, + "step": 18630 + }, + { + "epoch": 0.08002541579729185, + "grad_norm": 0.08272214233875275, + "learning_rate": 9.239455688452352e-05, + "loss": 0.1672539234161377, + "step": 18640 + }, + { + "epoch": 0.08006834788731185, + "grad_norm": 0.004393594805151224, + "learning_rate": 9.23902451644059e-05, + "loss": 0.10813627243041993, + "step": 18650 + }, + { + "epoch": 0.08011127997733186, + "grad_norm": 1.657952904701233, + "learning_rate": 9.238593344428826e-05, + "loss": 0.4301816463470459, + "step": 18660 + }, + { + "epoch": 0.08015421206735186, + "grad_norm": 2.1112592220306396, + "learning_rate": 9.238162172417064e-05, + "loss": 0.2593135595321655, + "step": 18670 + }, + { + "epoch": 0.08019714415737186, + "grad_norm": 1.2312036752700806, + "learning_rate": 9.237731000405301e-05, + "loss": 0.1060512900352478, + "step": 18680 + }, + { + "epoch": 0.08024007624739188, + "grad_norm": 0.550669252872467, + "learning_rate": 9.237299828393539e-05, + "loss": 0.16692020893096923, + "step": 18690 + }, + { + "epoch": 0.08028300833741188, + "grad_norm": 0.06942453235387802, + "learning_rate": 9.236868656381777e-05, + "loss": 0.23918561935424804, + "step": 18700 + }, + { + "epoch": 0.0803259404274319, + "grad_norm": 0.8593615293502808, + "learning_rate": 9.236437484370015e-05, + "loss": 0.34322171211242675, + "step": 18710 + }, + { + "epoch": 0.0803688725174519, + "grad_norm": 0.001945764059200883, + "learning_rate": 9.236006312358254e-05, + "loss": 0.1272782564163208, + "step": 18720 + }, + { + "epoch": 0.0804118046074719, + "grad_norm": 0.14265984296798706, + "learning_rate": 9.235575140346491e-05, + "loss": 0.17112067937850953, + "step": 18730 + }, + { + "epoch": 0.08045473669749191, + "grad_norm": 0.7494179010391235, + "learning_rate": 9.235143968334728e-05, + "loss": 0.32529516220092775, + "step": 18740 + }, + { + "epoch": 0.08049766878751191, + "grad_norm": 12.334822654724121, + "learning_rate": 9.234712796322965e-05, + "loss": 0.3763798952102661, + "step": 18750 + }, + { + "epoch": 0.08054060087753193, + "grad_norm": 10.992851257324219, + "learning_rate": 9.234281624311203e-05, + "loss": 0.1684706449508667, + "step": 18760 + }, + { + "epoch": 0.08058353296755193, + "grad_norm": 0.1401337832212448, + "learning_rate": 9.233850452299441e-05, + "loss": 0.190047287940979, + "step": 18770 + }, + { + "epoch": 0.08062646505757193, + "grad_norm": 1.823208212852478, + "learning_rate": 9.233419280287679e-05, + "loss": 0.2534889459609985, + "step": 18780 + }, + { + "epoch": 0.08066939714759194, + "grad_norm": 0.029730668291449547, + "learning_rate": 9.232988108275916e-05, + "loss": 0.18909434080123902, + "step": 18790 + }, + { + "epoch": 0.08071232923761194, + "grad_norm": 2.487135171890259, + "learning_rate": 9.232556936264154e-05, + "loss": 0.2689740896224976, + "step": 18800 + }, + { + "epoch": 0.08075526132763196, + "grad_norm": 0.8337386250495911, + "learning_rate": 9.232125764252392e-05, + "loss": 0.22457048892974854, + "step": 18810 + }, + { + "epoch": 0.08079819341765196, + "grad_norm": 8.491974830627441, + "learning_rate": 9.231694592240628e-05, + "loss": 0.1307414174079895, + "step": 18820 + }, + { + "epoch": 0.08084112550767196, + "grad_norm": 0.0009186516981571913, + "learning_rate": 9.231263420228866e-05, + "loss": 0.16259074211120605, + "step": 18830 + }, + { + "epoch": 0.08088405759769197, + "grad_norm": 0.029363462701439857, + "learning_rate": 9.230832248217104e-05, + "loss": 0.30105087757110593, + "step": 18840 + }, + { + "epoch": 0.08092698968771198, + "grad_norm": 0.021574202924966812, + "learning_rate": 9.230401076205341e-05, + "loss": 0.17228692770004272, + "step": 18850 + }, + { + "epoch": 0.08096992177773199, + "grad_norm": 0.005163417663425207, + "learning_rate": 9.229969904193579e-05, + "loss": 0.40862216949462893, + "step": 18860 + }, + { + "epoch": 0.08101285386775199, + "grad_norm": 0.28390929102897644, + "learning_rate": 9.229538732181817e-05, + "loss": 0.1912772536277771, + "step": 18870 + }, + { + "epoch": 0.08105578595777199, + "grad_norm": 2.644794225692749, + "learning_rate": 9.229107560170055e-05, + "loss": 0.5370799541473389, + "step": 18880 + }, + { + "epoch": 0.081098718047792, + "grad_norm": 0.1604711264371872, + "learning_rate": 9.228676388158292e-05, + "loss": 0.3262667179107666, + "step": 18890 + }, + { + "epoch": 0.08114165013781201, + "grad_norm": 0.0013925611274316907, + "learning_rate": 9.22824521614653e-05, + "loss": 0.1252922773361206, + "step": 18900 + }, + { + "epoch": 0.08118458222783201, + "grad_norm": 0.019793476909399033, + "learning_rate": 9.227814044134767e-05, + "loss": 0.25236876010894777, + "step": 18910 + }, + { + "epoch": 0.08122751431785202, + "grad_norm": 0.09192702174186707, + "learning_rate": 9.227382872123004e-05, + "loss": 0.1882225513458252, + "step": 18920 + }, + { + "epoch": 0.08127044640787202, + "grad_norm": 0.003928286023437977, + "learning_rate": 9.226951700111242e-05, + "loss": 0.21287171840667723, + "step": 18930 + }, + { + "epoch": 0.08131337849789204, + "grad_norm": 0.1621057242155075, + "learning_rate": 9.226520528099481e-05, + "loss": 0.0935452401638031, + "step": 18940 + }, + { + "epoch": 0.08135631058791204, + "grad_norm": 0.14026261866092682, + "learning_rate": 9.226089356087719e-05, + "loss": 0.2502132415771484, + "step": 18950 + }, + { + "epoch": 0.08139924267793204, + "grad_norm": 0.028061900287866592, + "learning_rate": 9.225658184075957e-05, + "loss": 0.07642927169799804, + "step": 18960 + }, + { + "epoch": 0.08144217476795206, + "grad_norm": 0.06425946205854416, + "learning_rate": 9.225227012064194e-05, + "loss": 0.17422356605529785, + "step": 18970 + }, + { + "epoch": 0.08148510685797206, + "grad_norm": 1.6507774591445923, + "learning_rate": 9.224795840052432e-05, + "loss": 0.2827779293060303, + "step": 18980 + }, + { + "epoch": 0.08152803894799207, + "grad_norm": 0.08310697227716446, + "learning_rate": 9.224364668040668e-05, + "loss": 0.3493364334106445, + "step": 18990 + }, + { + "epoch": 0.08157097103801207, + "grad_norm": 1.1928002834320068, + "learning_rate": 9.223933496028906e-05, + "loss": 0.48784561157226564, + "step": 19000 + }, + { + "epoch": 0.08157097103801207, + "eval_loss": 0.5041004419326782, + "eval_runtime": 27.5172, + "eval_samples_per_second": 3.634, + "eval_steps_per_second": 3.634, + "step": 19000 + }, + { + "epoch": 0.08161390312803207, + "grad_norm": 0.13684290647506714, + "learning_rate": 9.223502324017144e-05, + "loss": 0.4936811447143555, + "step": 19010 + }, + { + "epoch": 0.08165683521805209, + "grad_norm": 0.04186616465449333, + "learning_rate": 9.223071152005382e-05, + "loss": 0.23352341651916503, + "step": 19020 + }, + { + "epoch": 0.08169976730807209, + "grad_norm": 0.44165217876434326, + "learning_rate": 9.22263997999362e-05, + "loss": 0.2601867437362671, + "step": 19030 + }, + { + "epoch": 0.0817426993980921, + "grad_norm": 0.014664696529507637, + "learning_rate": 9.222208807981857e-05, + "loss": 0.06263558268547058, + "step": 19040 + }, + { + "epoch": 0.0817856314881121, + "grad_norm": 2.0226991176605225, + "learning_rate": 9.221777635970095e-05, + "loss": 0.3118800163269043, + "step": 19050 + }, + { + "epoch": 0.0818285635781321, + "grad_norm": 1.483991265296936, + "learning_rate": 9.221346463958333e-05, + "loss": 0.20840089321136473, + "step": 19060 + }, + { + "epoch": 0.08187149566815212, + "grad_norm": 0.1026201993227005, + "learning_rate": 9.220915291946569e-05, + "loss": 0.20031397342681884, + "step": 19070 + }, + { + "epoch": 0.08191442775817212, + "grad_norm": 0.11774060875177383, + "learning_rate": 9.220484119934807e-05, + "loss": 0.19844886064529418, + "step": 19080 + }, + { + "epoch": 0.08195735984819214, + "grad_norm": 0.018591005355119705, + "learning_rate": 9.220052947923044e-05, + "loss": 0.2688950538635254, + "step": 19090 + }, + { + "epoch": 0.08200029193821214, + "grad_norm": 0.03999682515859604, + "learning_rate": 9.219621775911282e-05, + "loss": 0.3359368324279785, + "step": 19100 + }, + { + "epoch": 0.08204322402823214, + "grad_norm": 0.0394248366355896, + "learning_rate": 9.21919060389952e-05, + "loss": 0.2790042877197266, + "step": 19110 + }, + { + "epoch": 0.08208615611825215, + "grad_norm": 2.473489284515381, + "learning_rate": 9.218759431887758e-05, + "loss": 0.2525103807449341, + "step": 19120 + }, + { + "epoch": 0.08212908820827215, + "grad_norm": 3.0899598598480225, + "learning_rate": 9.218328259875995e-05, + "loss": 0.22865710258483887, + "step": 19130 + }, + { + "epoch": 0.08217202029829217, + "grad_norm": 1.9980636835098267, + "learning_rate": 9.217897087864233e-05, + "loss": 0.3162590980529785, + "step": 19140 + }, + { + "epoch": 0.08221495238831217, + "grad_norm": 0.26501959562301636, + "learning_rate": 9.21746591585247e-05, + "loss": 0.3742363691329956, + "step": 19150 + }, + { + "epoch": 0.08225788447833217, + "grad_norm": 2.546135425567627, + "learning_rate": 9.217034743840709e-05, + "loss": 0.13382744789123535, + "step": 19160 + }, + { + "epoch": 0.08230081656835218, + "grad_norm": 0.9495781064033508, + "learning_rate": 9.216603571828946e-05, + "loss": 0.3787665843963623, + "step": 19170 + }, + { + "epoch": 0.08234374865837218, + "grad_norm": 0.04306304082274437, + "learning_rate": 9.216172399817184e-05, + "loss": 0.2250833511352539, + "step": 19180 + }, + { + "epoch": 0.0823866807483922, + "grad_norm": 3.428227424621582, + "learning_rate": 9.215741227805422e-05, + "loss": 0.6014655590057373, + "step": 19190 + }, + { + "epoch": 0.0824296128384122, + "grad_norm": 0.19456183910369873, + "learning_rate": 9.21531005579366e-05, + "loss": 0.13260576725006104, + "step": 19200 + }, + { + "epoch": 0.0824725449284322, + "grad_norm": 0.9759875535964966, + "learning_rate": 9.214878883781897e-05, + "loss": 0.23652286529541017, + "step": 19210 + }, + { + "epoch": 0.08251547701845222, + "grad_norm": 1.5591341257095337, + "learning_rate": 9.214447711770135e-05, + "loss": 0.3411963701248169, + "step": 19220 + }, + { + "epoch": 0.08255840910847222, + "grad_norm": 0.06276403367519379, + "learning_rate": 9.214016539758371e-05, + "loss": 0.46163148880004884, + "step": 19230 + }, + { + "epoch": 0.08260134119849223, + "grad_norm": 0.541723370552063, + "learning_rate": 9.213585367746609e-05, + "loss": 0.33110618591308594, + "step": 19240 + }, + { + "epoch": 0.08264427328851223, + "grad_norm": 1.5728428363800049, + "learning_rate": 9.213154195734847e-05, + "loss": 0.28666555881500244, + "step": 19250 + }, + { + "epoch": 0.08268720537853223, + "grad_norm": 0.13925987482070923, + "learning_rate": 9.212723023723085e-05, + "loss": 0.09053775668144226, + "step": 19260 + }, + { + "epoch": 0.08273013746855225, + "grad_norm": 0.09296605736017227, + "learning_rate": 9.212291851711322e-05, + "loss": 0.2339235782623291, + "step": 19270 + }, + { + "epoch": 0.08277306955857225, + "grad_norm": 1.1923575401306152, + "learning_rate": 9.21186067969956e-05, + "loss": 0.2979546546936035, + "step": 19280 + }, + { + "epoch": 0.08281600164859226, + "grad_norm": 0.02540683001279831, + "learning_rate": 9.211429507687798e-05, + "loss": 0.3172282695770264, + "step": 19290 + }, + { + "epoch": 0.08285893373861226, + "grad_norm": 0.009578239172697067, + "learning_rate": 9.210998335676035e-05, + "loss": 0.16685105562210084, + "step": 19300 + }, + { + "epoch": 0.08290186582863227, + "grad_norm": 0.07192831486463547, + "learning_rate": 9.210567163664273e-05, + "loss": 0.0985795497894287, + "step": 19310 + }, + { + "epoch": 0.08294479791865228, + "grad_norm": 0.5271665453910828, + "learning_rate": 9.21013599165251e-05, + "loss": 0.2767664432525635, + "step": 19320 + }, + { + "epoch": 0.08298773000867228, + "grad_norm": 0.2542556822299957, + "learning_rate": 9.209704819640747e-05, + "loss": 0.27911207675933836, + "step": 19330 + }, + { + "epoch": 0.08303066209869228, + "grad_norm": 0.19310764968395233, + "learning_rate": 9.209273647628985e-05, + "loss": 0.2386990785598755, + "step": 19340 + }, + { + "epoch": 0.0830735941887123, + "grad_norm": 1.4710135459899902, + "learning_rate": 9.208842475617223e-05, + "loss": 0.4310801029205322, + "step": 19350 + }, + { + "epoch": 0.0831165262787323, + "grad_norm": 1.1130155324935913, + "learning_rate": 9.20841130360546e-05, + "loss": 0.15598387718200685, + "step": 19360 + }, + { + "epoch": 0.08315945836875231, + "grad_norm": 0.05505505949258804, + "learning_rate": 9.207980131593698e-05, + "loss": 0.33764450550079345, + "step": 19370 + }, + { + "epoch": 0.08320239045877231, + "grad_norm": 0.062077395617961884, + "learning_rate": 9.207548959581936e-05, + "loss": 0.12927793264389037, + "step": 19380 + }, + { + "epoch": 0.08324532254879231, + "grad_norm": 0.887370228767395, + "learning_rate": 9.207117787570174e-05, + "loss": 0.42006869316101075, + "step": 19390 + }, + { + "epoch": 0.08328825463881233, + "grad_norm": 3.715780258178711, + "learning_rate": 9.206686615558411e-05, + "loss": 0.28740389347076417, + "step": 19400 + }, + { + "epoch": 0.08333118672883233, + "grad_norm": 0.3408714532852173, + "learning_rate": 9.206255443546649e-05, + "loss": 0.2963599681854248, + "step": 19410 + }, + { + "epoch": 0.08337411881885234, + "grad_norm": 0.04663284495472908, + "learning_rate": 9.205824271534887e-05, + "loss": 0.33824911117553713, + "step": 19420 + }, + { + "epoch": 0.08341705090887234, + "grad_norm": 0.008326475508511066, + "learning_rate": 9.205393099523125e-05, + "loss": 0.1826784133911133, + "step": 19430 + }, + { + "epoch": 0.08345998299889235, + "grad_norm": 0.11303720623254776, + "learning_rate": 9.204961927511362e-05, + "loss": 0.27020950317382814, + "step": 19440 + }, + { + "epoch": 0.08350291508891236, + "grad_norm": 0.08869299292564392, + "learning_rate": 9.2045307554996e-05, + "loss": 0.10644828081130982, + "step": 19450 + }, + { + "epoch": 0.08354584717893236, + "grad_norm": 0.030560927465558052, + "learning_rate": 9.204099583487838e-05, + "loss": 0.04898174703121185, + "step": 19460 + }, + { + "epoch": 0.08358877926895238, + "grad_norm": 1.178162932395935, + "learning_rate": 9.203668411476076e-05, + "loss": 0.2633205413818359, + "step": 19470 + }, + { + "epoch": 0.08363171135897238, + "grad_norm": 0.07836475223302841, + "learning_rate": 9.203237239464312e-05, + "loss": 0.21295971870422364, + "step": 19480 + }, + { + "epoch": 0.08367464344899238, + "grad_norm": 0.024082181975245476, + "learning_rate": 9.20280606745255e-05, + "loss": 0.13825159072875975, + "step": 19490 + }, + { + "epoch": 0.08371757553901239, + "grad_norm": 2.2076027393341064, + "learning_rate": 9.202374895440787e-05, + "loss": 0.2961868762969971, + "step": 19500 + }, + { + "epoch": 0.0837605076290324, + "grad_norm": 8.747289657592773, + "learning_rate": 9.201943723429025e-05, + "loss": 0.13476651906967163, + "step": 19510 + }, + { + "epoch": 0.08380343971905241, + "grad_norm": 0.07265298813581467, + "learning_rate": 9.201512551417263e-05, + "loss": 0.3324207067489624, + "step": 19520 + }, + { + "epoch": 0.08384637180907241, + "grad_norm": 1.4036122560501099, + "learning_rate": 9.2010813794055e-05, + "loss": 0.20782277584075928, + "step": 19530 + }, + { + "epoch": 0.08388930389909241, + "grad_norm": 0.006723584607243538, + "learning_rate": 9.200650207393738e-05, + "loss": 0.14426236152648925, + "step": 19540 + }, + { + "epoch": 0.08393223598911242, + "grad_norm": 1.0080065727233887, + "learning_rate": 9.200219035381976e-05, + "loss": 0.268358039855957, + "step": 19550 + }, + { + "epoch": 0.08397516807913243, + "grad_norm": 1.6201801300048828, + "learning_rate": 9.199787863370212e-05, + "loss": 0.42645840644836425, + "step": 19560 + }, + { + "epoch": 0.08401810016915244, + "grad_norm": 0.13518543541431427, + "learning_rate": 9.19935669135845e-05, + "loss": 0.12359261512756348, + "step": 19570 + }, + { + "epoch": 0.08406103225917244, + "grad_norm": 1.122020959854126, + "learning_rate": 9.198925519346688e-05, + "loss": 0.5048614025115967, + "step": 19580 + }, + { + "epoch": 0.08410396434919244, + "grad_norm": 1.1860013008117676, + "learning_rate": 9.198494347334926e-05, + "loss": 0.17101879119873048, + "step": 19590 + }, + { + "epoch": 0.08414689643921246, + "grad_norm": 1.223297119140625, + "learning_rate": 9.198063175323163e-05, + "loss": 0.25258498191833495, + "step": 19600 + }, + { + "epoch": 0.08418982852923246, + "grad_norm": 4.542642116546631, + "learning_rate": 9.197632003311401e-05, + "loss": 0.2318887710571289, + "step": 19610 + }, + { + "epoch": 0.08423276061925247, + "grad_norm": 0.05279264226555824, + "learning_rate": 9.197200831299639e-05, + "loss": 0.19677789211273194, + "step": 19620 + }, + { + "epoch": 0.08427569270927247, + "grad_norm": 0.022767324000597, + "learning_rate": 9.196769659287877e-05, + "loss": 0.1827967047691345, + "step": 19630 + }, + { + "epoch": 0.08431862479929247, + "grad_norm": 0.9736915230751038, + "learning_rate": 9.196338487276114e-05, + "loss": 0.39751832485198973, + "step": 19640 + }, + { + "epoch": 0.08436155688931249, + "grad_norm": 7.58955717086792, + "learning_rate": 9.195907315264352e-05, + "loss": 0.40766420364379885, + "step": 19650 + }, + { + "epoch": 0.08440448897933249, + "grad_norm": 0.9952966570854187, + "learning_rate": 9.19547614325259e-05, + "loss": 0.21652648448944092, + "step": 19660 + }, + { + "epoch": 0.0844474210693525, + "grad_norm": 3.723085641860962, + "learning_rate": 9.195044971240828e-05, + "loss": 0.3138508081436157, + "step": 19670 + }, + { + "epoch": 0.0844903531593725, + "grad_norm": 0.11735602468252182, + "learning_rate": 9.194613799229065e-05, + "loss": 0.2222294807434082, + "step": 19680 + }, + { + "epoch": 0.0845332852493925, + "grad_norm": 0.11522156000137329, + "learning_rate": 9.194182627217303e-05, + "loss": 0.29770517349243164, + "step": 19690 + }, + { + "epoch": 0.08457621733941252, + "grad_norm": 0.09208790957927704, + "learning_rate": 9.193751455205541e-05, + "loss": 0.1982753038406372, + "step": 19700 + }, + { + "epoch": 0.08461914942943252, + "grad_norm": 1.2311041355133057, + "learning_rate": 9.193320283193778e-05, + "loss": 0.3996951818466187, + "step": 19710 + }, + { + "epoch": 0.08466208151945254, + "grad_norm": 12.743633270263672, + "learning_rate": 9.192889111182016e-05, + "loss": 0.21535904407501222, + "step": 19720 + }, + { + "epoch": 0.08470501360947254, + "grad_norm": 0.0034743843134492636, + "learning_rate": 9.192457939170253e-05, + "loss": 0.18522260189056397, + "step": 19730 + }, + { + "epoch": 0.08474794569949254, + "grad_norm": 1.9996086359024048, + "learning_rate": 9.19202676715849e-05, + "loss": 0.29746198654174805, + "step": 19740 + }, + { + "epoch": 0.08479087778951255, + "grad_norm": 1.7059917449951172, + "learning_rate": 9.191595595146728e-05, + "loss": 0.24819631576538087, + "step": 19750 + }, + { + "epoch": 0.08483380987953255, + "grad_norm": 0.013688395731151104, + "learning_rate": 9.191164423134966e-05, + "loss": 0.13482595682144166, + "step": 19760 + }, + { + "epoch": 0.08487674196955255, + "grad_norm": 0.7230508327484131, + "learning_rate": 9.190733251123204e-05, + "loss": 0.22907421588897706, + "step": 19770 + }, + { + "epoch": 0.08491967405957257, + "grad_norm": 0.7261434197425842, + "learning_rate": 9.190302079111441e-05, + "loss": 0.2959503173828125, + "step": 19780 + }, + { + "epoch": 0.08496260614959257, + "grad_norm": 0.03716635704040527, + "learning_rate": 9.189870907099679e-05, + "loss": 0.15986857414245606, + "step": 19790 + }, + { + "epoch": 0.08500553823961259, + "grad_norm": 0.005048257298767567, + "learning_rate": 9.189439735087917e-05, + "loss": 0.1239315152168274, + "step": 19800 + }, + { + "epoch": 0.08504847032963259, + "grad_norm": 19.985151290893555, + "learning_rate": 9.189008563076153e-05, + "loss": 0.24769895076751708, + "step": 19810 + }, + { + "epoch": 0.08509140241965259, + "grad_norm": 0.040633995085954666, + "learning_rate": 9.188577391064391e-05, + "loss": 0.15116959810256958, + "step": 19820 + }, + { + "epoch": 0.0851343345096726, + "grad_norm": 0.0350426509976387, + "learning_rate": 9.188146219052629e-05, + "loss": 0.23688271045684814, + "step": 19830 + }, + { + "epoch": 0.0851772665996926, + "grad_norm": 0.7044867873191833, + "learning_rate": 9.187715047040866e-05, + "loss": 0.29199187755584716, + "step": 19840 + }, + { + "epoch": 0.08522019868971262, + "grad_norm": 0.028381457552313805, + "learning_rate": 9.187283875029104e-05, + "loss": 0.16368452310562134, + "step": 19850 + }, + { + "epoch": 0.08526313077973262, + "grad_norm": 0.10046995431184769, + "learning_rate": 9.186852703017342e-05, + "loss": 0.4022871971130371, + "step": 19860 + }, + { + "epoch": 0.08530606286975262, + "grad_norm": 2.002215623855591, + "learning_rate": 9.18642153100558e-05, + "loss": 0.2921769618988037, + "step": 19870 + }, + { + "epoch": 0.08534899495977263, + "grad_norm": 1.6824100017547607, + "learning_rate": 9.185990358993817e-05, + "loss": 0.2920323610305786, + "step": 19880 + }, + { + "epoch": 0.08539192704979263, + "grad_norm": 0.01267918385565281, + "learning_rate": 9.185559186982055e-05, + "loss": 0.45812501907348635, + "step": 19890 + }, + { + "epoch": 0.08543485913981265, + "grad_norm": 0.0032960656099021435, + "learning_rate": 9.185128014970293e-05, + "loss": 0.1701305627822876, + "step": 19900 + }, + { + "epoch": 0.08547779122983265, + "grad_norm": 0.017295779660344124, + "learning_rate": 9.18469684295853e-05, + "loss": 0.3156233310699463, + "step": 19910 + }, + { + "epoch": 0.08552072331985265, + "grad_norm": 0.10581665486097336, + "learning_rate": 9.184265670946768e-05, + "loss": 0.1895419478416443, + "step": 19920 + }, + { + "epoch": 0.08556365540987267, + "grad_norm": 2.9387333393096924, + "learning_rate": 9.183834498935006e-05, + "loss": 0.20859913825988768, + "step": 19930 + }, + { + "epoch": 0.08560658749989267, + "grad_norm": 0.010136442258954048, + "learning_rate": 9.183403326923244e-05, + "loss": 0.26719396114349364, + "step": 19940 + }, + { + "epoch": 0.08564951958991268, + "grad_norm": 3.116539716720581, + "learning_rate": 9.182972154911481e-05, + "loss": 0.15150291919708253, + "step": 19950 + }, + { + "epoch": 0.08569245167993268, + "grad_norm": 1.7744215726852417, + "learning_rate": 9.182540982899719e-05, + "loss": 0.3903425931930542, + "step": 19960 + }, + { + "epoch": 0.08573538376995268, + "grad_norm": 0.006902491673827171, + "learning_rate": 9.182109810887957e-05, + "loss": 0.09915790557861329, + "step": 19970 + }, + { + "epoch": 0.0857783158599727, + "grad_norm": 0.19755761325359344, + "learning_rate": 9.181678638876193e-05, + "loss": 0.21800611019134522, + "step": 19980 + }, + { + "epoch": 0.0858212479499927, + "grad_norm": 0.004093521274626255, + "learning_rate": 9.181247466864431e-05, + "loss": 0.43779420852661133, + "step": 19990 + }, + { + "epoch": 0.08586418004001271, + "grad_norm": 0.04544459655880928, + "learning_rate": 9.180816294852669e-05, + "loss": 0.19077495336532593, + "step": 20000 + }, + { + "epoch": 0.08586418004001271, + "eval_loss": 0.5211971402168274, + "eval_runtime": 27.4516, + "eval_samples_per_second": 3.643, + "eval_steps_per_second": 3.643, + "step": 20000 + }, + { + "epoch": 0.08590711213003271, + "grad_norm": 2.292238712310791, + "learning_rate": 9.180385122840906e-05, + "loss": 0.22309489250183107, + "step": 20010 + }, + { + "epoch": 0.08595004422005272, + "grad_norm": 0.13180018961429596, + "learning_rate": 9.179953950829144e-05, + "loss": 0.5180996894836426, + "step": 20020 + }, + { + "epoch": 0.08599297631007273, + "grad_norm": 0.03309754282236099, + "learning_rate": 9.179522778817382e-05, + "loss": 0.28703348636627196, + "step": 20030 + }, + { + "epoch": 0.08603590840009273, + "grad_norm": 1.801064133644104, + "learning_rate": 9.17909160680562e-05, + "loss": 0.34079132080078123, + "step": 20040 + }, + { + "epoch": 0.08607884049011275, + "grad_norm": 0.1331978440284729, + "learning_rate": 9.178660434793857e-05, + "loss": 0.09699448347091674, + "step": 20050 + }, + { + "epoch": 0.08612177258013275, + "grad_norm": 0.053721651434898376, + "learning_rate": 9.178229262782094e-05, + "loss": 0.06542769670486451, + "step": 20060 + }, + { + "epoch": 0.08616470467015275, + "grad_norm": 0.33402219414711, + "learning_rate": 9.177798090770331e-05, + "loss": 0.282747745513916, + "step": 20070 + }, + { + "epoch": 0.08620763676017276, + "grad_norm": 0.03361477330327034, + "learning_rate": 9.177366918758569e-05, + "loss": 0.15889936685562134, + "step": 20080 + }, + { + "epoch": 0.08625056885019276, + "grad_norm": 0.0654355138540268, + "learning_rate": 9.176935746746807e-05, + "loss": 0.34210996627807616, + "step": 20090 + }, + { + "epoch": 0.08629350094021278, + "grad_norm": 0.4426393210887909, + "learning_rate": 9.176504574735045e-05, + "loss": 0.16542912721633912, + "step": 20100 + }, + { + "epoch": 0.08633643303023278, + "grad_norm": 0.10327360033988953, + "learning_rate": 9.176073402723282e-05, + "loss": 0.21687026023864747, + "step": 20110 + }, + { + "epoch": 0.08637936512025278, + "grad_norm": 0.19463887810707092, + "learning_rate": 9.175642230711522e-05, + "loss": 0.31909153461456297, + "step": 20120 + }, + { + "epoch": 0.0864222972102728, + "grad_norm": 0.1267862170934677, + "learning_rate": 9.175211058699759e-05, + "loss": 0.20828518867492676, + "step": 20130 + }, + { + "epoch": 0.0864652293002928, + "grad_norm": 0.4136047959327698, + "learning_rate": 9.174779886687996e-05, + "loss": 0.08652875423431397, + "step": 20140 + }, + { + "epoch": 0.08650816139031281, + "grad_norm": 0.04703577607870102, + "learning_rate": 9.174348714676233e-05, + "loss": 0.17158129215240478, + "step": 20150 + }, + { + "epoch": 0.08655109348033281, + "grad_norm": 3.910243511199951, + "learning_rate": 9.173917542664471e-05, + "loss": 0.2885754108428955, + "step": 20160 + }, + { + "epoch": 0.08659402557035281, + "grad_norm": 0.11007514595985413, + "learning_rate": 9.173486370652709e-05, + "loss": 0.2936608076095581, + "step": 20170 + }, + { + "epoch": 0.08663695766037283, + "grad_norm": 0.2206745147705078, + "learning_rate": 9.173055198640947e-05, + "loss": 0.2527461528778076, + "step": 20180 + }, + { + "epoch": 0.08667988975039283, + "grad_norm": 0.09681833535432816, + "learning_rate": 9.172624026629184e-05, + "loss": 0.13297202587127685, + "step": 20190 + }, + { + "epoch": 0.08672282184041283, + "grad_norm": 1.4935474395751953, + "learning_rate": 9.172192854617422e-05, + "loss": 0.10676318407058716, + "step": 20200 + }, + { + "epoch": 0.08676575393043284, + "grad_norm": 0.0713653564453125, + "learning_rate": 9.17176168260566e-05, + "loss": 0.43478784561157224, + "step": 20210 + }, + { + "epoch": 0.08680868602045284, + "grad_norm": 0.05121508985757828, + "learning_rate": 9.171330510593896e-05, + "loss": 0.11575621366500854, + "step": 20220 + }, + { + "epoch": 0.08685161811047286, + "grad_norm": 0.10139039158821106, + "learning_rate": 9.170899338582134e-05, + "loss": 0.26717281341552734, + "step": 20230 + }, + { + "epoch": 0.08689455020049286, + "grad_norm": 0.2904765009880066, + "learning_rate": 9.170468166570372e-05, + "loss": 0.2693187236785889, + "step": 20240 + }, + { + "epoch": 0.08693748229051286, + "grad_norm": 0.34784597158432007, + "learning_rate": 9.17003699455861e-05, + "loss": 0.10214605331420898, + "step": 20250 + }, + { + "epoch": 0.08698041438053287, + "grad_norm": 1.9216970205307007, + "learning_rate": 9.169605822546847e-05, + "loss": 0.15444643497467042, + "step": 20260 + }, + { + "epoch": 0.08702334647055288, + "grad_norm": 0.014310669153928757, + "learning_rate": 9.169174650535085e-05, + "loss": 0.3466238260269165, + "step": 20270 + }, + { + "epoch": 0.08706627856057289, + "grad_norm": 0.15292063355445862, + "learning_rate": 9.168743478523323e-05, + "loss": 0.18252546787261964, + "step": 20280 + }, + { + "epoch": 0.08710921065059289, + "grad_norm": 0.7003419995307922, + "learning_rate": 9.16831230651156e-05, + "loss": 0.18673573732376098, + "step": 20290 + }, + { + "epoch": 0.08715214274061289, + "grad_norm": 0.32834747433662415, + "learning_rate": 9.167881134499797e-05, + "loss": 0.25514419078826905, + "step": 20300 + }, + { + "epoch": 0.08719507483063291, + "grad_norm": 0.07238604873418808, + "learning_rate": 9.167449962488034e-05, + "loss": 0.28299875259399415, + "step": 20310 + }, + { + "epoch": 0.08723800692065291, + "grad_norm": 0.18142496049404144, + "learning_rate": 9.167018790476272e-05, + "loss": 0.10401992797851563, + "step": 20320 + }, + { + "epoch": 0.08728093901067292, + "grad_norm": 1.2397515773773193, + "learning_rate": 9.16658761846451e-05, + "loss": 0.4503783702850342, + "step": 20330 + }, + { + "epoch": 0.08732387110069292, + "grad_norm": 0.0753837451338768, + "learning_rate": 9.166156446452749e-05, + "loss": 0.20443589687347413, + "step": 20340 + }, + { + "epoch": 0.08736680319071292, + "grad_norm": 0.766108512878418, + "learning_rate": 9.165725274440987e-05, + "loss": 0.2131603479385376, + "step": 20350 + }, + { + "epoch": 0.08740973528073294, + "grad_norm": 0.26683467626571655, + "learning_rate": 9.165294102429224e-05, + "loss": 0.3456669092178345, + "step": 20360 + }, + { + "epoch": 0.08745266737075294, + "grad_norm": 7.378421783447266, + "learning_rate": 9.164862930417462e-05, + "loss": 0.23649468421936035, + "step": 20370 + }, + { + "epoch": 0.08749559946077295, + "grad_norm": 6.665241241455078, + "learning_rate": 9.1644317584057e-05, + "loss": 0.19338738918304443, + "step": 20380 + }, + { + "epoch": 0.08753853155079296, + "grad_norm": 1.4870343208312988, + "learning_rate": 9.164000586393936e-05, + "loss": 0.2298222303390503, + "step": 20390 + }, + { + "epoch": 0.08758146364081296, + "grad_norm": 14.525195121765137, + "learning_rate": 9.163569414382174e-05, + "loss": 0.19451080560684203, + "step": 20400 + }, + { + "epoch": 0.08762439573083297, + "grad_norm": 5.415810585021973, + "learning_rate": 9.163138242370412e-05, + "loss": 0.13900411128997803, + "step": 20410 + }, + { + "epoch": 0.08766732782085297, + "grad_norm": 1.1069865226745605, + "learning_rate": 9.16270707035865e-05, + "loss": 0.5359042644500732, + "step": 20420 + }, + { + "epoch": 0.08771025991087299, + "grad_norm": 0.052934516221284866, + "learning_rate": 9.162275898346887e-05, + "loss": 0.27098309993743896, + "step": 20430 + }, + { + "epoch": 0.08775319200089299, + "grad_norm": 0.051988791674375534, + "learning_rate": 9.161844726335125e-05, + "loss": 0.14990001916885376, + "step": 20440 + }, + { + "epoch": 0.08779612409091299, + "grad_norm": 1.257656216621399, + "learning_rate": 9.161413554323363e-05, + "loss": 0.28091883659362793, + "step": 20450 + }, + { + "epoch": 0.087839056180933, + "grad_norm": 0.08454004675149918, + "learning_rate": 9.1609823823116e-05, + "loss": 0.23843903541564943, + "step": 20460 + }, + { + "epoch": 0.087881988270953, + "grad_norm": 0.5705059766769409, + "learning_rate": 9.160551210299837e-05, + "loss": 0.30336081981658936, + "step": 20470 + }, + { + "epoch": 0.08792492036097302, + "grad_norm": 2.9336891174316406, + "learning_rate": 9.160120038288075e-05, + "loss": 0.503492784500122, + "step": 20480 + }, + { + "epoch": 0.08796785245099302, + "grad_norm": 0.20187604427337646, + "learning_rate": 9.159688866276312e-05, + "loss": 0.27225394248962403, + "step": 20490 + }, + { + "epoch": 0.08801078454101302, + "grad_norm": 0.04096986725926399, + "learning_rate": 9.15925769426455e-05, + "loss": 0.23516056537628174, + "step": 20500 + }, + { + "epoch": 0.08805371663103304, + "grad_norm": 0.0220347847789526, + "learning_rate": 9.158826522252788e-05, + "loss": 0.2556183099746704, + "step": 20510 + }, + { + "epoch": 0.08809664872105304, + "grad_norm": 0.33302149176597595, + "learning_rate": 9.158395350241025e-05, + "loss": 0.17670719623565673, + "step": 20520 + }, + { + "epoch": 0.08813958081107305, + "grad_norm": 2.4029204845428467, + "learning_rate": 9.157964178229263e-05, + "loss": 0.20500121116638184, + "step": 20530 + }, + { + "epoch": 0.08818251290109305, + "grad_norm": 4.5730180740356445, + "learning_rate": 9.157533006217501e-05, + "loss": 0.2645660161972046, + "step": 20540 + }, + { + "epoch": 0.08822544499111305, + "grad_norm": 0.03342090919613838, + "learning_rate": 9.157101834205737e-05, + "loss": 0.1793131113052368, + "step": 20550 + }, + { + "epoch": 0.08826837708113307, + "grad_norm": 1.0915799140930176, + "learning_rate": 9.156670662193976e-05, + "loss": 0.24776463508605956, + "step": 20560 + }, + { + "epoch": 0.08831130917115307, + "grad_norm": 0.20644307136535645, + "learning_rate": 9.156239490182214e-05, + "loss": 0.06601312756538391, + "step": 20570 + }, + { + "epoch": 0.08835424126117308, + "grad_norm": 1.0235928297042847, + "learning_rate": 9.155808318170452e-05, + "loss": 0.3110778331756592, + "step": 20580 + }, + { + "epoch": 0.08839717335119308, + "grad_norm": 1.791678786277771, + "learning_rate": 9.15537714615869e-05, + "loss": 0.16456762552261353, + "step": 20590 + }, + { + "epoch": 0.08844010544121308, + "grad_norm": 0.8701760172843933, + "learning_rate": 9.154945974146927e-05, + "loss": 0.17454179525375366, + "step": 20600 + }, + { + "epoch": 0.0884830375312331, + "grad_norm": 0.022485675290226936, + "learning_rate": 9.154514802135165e-05, + "loss": 0.004853111878037453, + "step": 20610 + }, + { + "epoch": 0.0885259696212531, + "grad_norm": 4.354240417480469, + "learning_rate": 9.154083630123403e-05, + "loss": 0.42849555015563967, + "step": 20620 + }, + { + "epoch": 0.0885689017112731, + "grad_norm": 0.08281465619802475, + "learning_rate": 9.153652458111639e-05, + "loss": 0.24617836475372315, + "step": 20630 + }, + { + "epoch": 0.08861183380129312, + "grad_norm": 0.5946925282478333, + "learning_rate": 9.153221286099877e-05, + "loss": 0.2575437068939209, + "step": 20640 + }, + { + "epoch": 0.08865476589131312, + "grad_norm": 0.037425171583890915, + "learning_rate": 9.152790114088115e-05, + "loss": 0.3338440418243408, + "step": 20650 + }, + { + "epoch": 0.08869769798133313, + "grad_norm": 2.2693583965301514, + "learning_rate": 9.152358942076352e-05, + "loss": 0.2294626235961914, + "step": 20660 + }, + { + "epoch": 0.08874063007135313, + "grad_norm": 3.0021560192108154, + "learning_rate": 9.15192777006459e-05, + "loss": 0.19063858985900878, + "step": 20670 + }, + { + "epoch": 0.08878356216137313, + "grad_norm": 0.05422629788517952, + "learning_rate": 9.151496598052828e-05, + "loss": 0.3820706129074097, + "step": 20680 + }, + { + "epoch": 0.08882649425139315, + "grad_norm": 1.2230654954910278, + "learning_rate": 9.151065426041066e-05, + "loss": 0.27535200119018555, + "step": 20690 + }, + { + "epoch": 0.08886942634141315, + "grad_norm": 0.09059865027666092, + "learning_rate": 9.150634254029303e-05, + "loss": 0.3081681728363037, + "step": 20700 + }, + { + "epoch": 0.08891235843143316, + "grad_norm": 0.75705486536026, + "learning_rate": 9.150203082017541e-05, + "loss": 0.3169992446899414, + "step": 20710 + }, + { + "epoch": 0.08895529052145316, + "grad_norm": 0.0281693022698164, + "learning_rate": 9.149771910005777e-05, + "loss": 0.19922350645065307, + "step": 20720 + }, + { + "epoch": 0.08899822261147317, + "grad_norm": 0.5898590087890625, + "learning_rate": 9.149340737994015e-05, + "loss": 0.3216936349868774, + "step": 20730 + }, + { + "epoch": 0.08904115470149318, + "grad_norm": 0.8585788607597351, + "learning_rate": 9.148909565982253e-05, + "loss": 0.15437256097793578, + "step": 20740 + }, + { + "epoch": 0.08908408679151318, + "grad_norm": 0.8398600220680237, + "learning_rate": 9.14847839397049e-05, + "loss": 0.232130765914917, + "step": 20750 + }, + { + "epoch": 0.0891270188815332, + "grad_norm": 0.27168065309524536, + "learning_rate": 9.148047221958728e-05, + "loss": 0.27273604869842527, + "step": 20760 + }, + { + "epoch": 0.0891699509715532, + "grad_norm": 0.03227852284908295, + "learning_rate": 9.147616049946966e-05, + "loss": 0.08514662384986878, + "step": 20770 + }, + { + "epoch": 0.0892128830615732, + "grad_norm": 1.47744882106781, + "learning_rate": 9.147184877935204e-05, + "loss": 0.4660014629364014, + "step": 20780 + }, + { + "epoch": 0.08925581515159321, + "grad_norm": 0.07243969291448593, + "learning_rate": 9.146753705923442e-05, + "loss": 0.2737978458404541, + "step": 20790 + }, + { + "epoch": 0.08929874724161321, + "grad_norm": 2.0715065002441406, + "learning_rate": 9.14632253391168e-05, + "loss": 0.11428431272506714, + "step": 20800 + }, + { + "epoch": 0.08934167933163323, + "grad_norm": 0.15402251482009888, + "learning_rate": 9.145891361899917e-05, + "loss": 0.16179636716842652, + "step": 20810 + }, + { + "epoch": 0.08938461142165323, + "grad_norm": 0.38303110003471375, + "learning_rate": 9.145460189888155e-05, + "loss": 0.2799507141113281, + "step": 20820 + }, + { + "epoch": 0.08942754351167323, + "grad_norm": 4.249147891998291, + "learning_rate": 9.145029017876393e-05, + "loss": 0.4097945213317871, + "step": 20830 + }, + { + "epoch": 0.08947047560169324, + "grad_norm": 0.028834078460931778, + "learning_rate": 9.14459784586463e-05, + "loss": 0.2732088565826416, + "step": 20840 + }, + { + "epoch": 0.08951340769171325, + "grad_norm": 0.13410897552967072, + "learning_rate": 9.144166673852868e-05, + "loss": 0.09479534029960632, + "step": 20850 + }, + { + "epoch": 0.08955633978173326, + "grad_norm": 0.16793487966060638, + "learning_rate": 9.143735501841106e-05, + "loss": 0.26109633445739744, + "step": 20860 + }, + { + "epoch": 0.08959927187175326, + "grad_norm": 3.912290573120117, + "learning_rate": 9.143304329829343e-05, + "loss": 0.31730501651763915, + "step": 20870 + }, + { + "epoch": 0.08964220396177326, + "grad_norm": 2.9360220432281494, + "learning_rate": 9.14287315781758e-05, + "loss": 0.3117853879928589, + "step": 20880 + }, + { + "epoch": 0.08968513605179328, + "grad_norm": 0.0618007592856884, + "learning_rate": 9.142441985805818e-05, + "loss": 0.09674944281578064, + "step": 20890 + }, + { + "epoch": 0.08972806814181328, + "grad_norm": 0.8049418330192566, + "learning_rate": 9.142010813794055e-05, + "loss": 0.3263385772705078, + "step": 20900 + }, + { + "epoch": 0.08977100023183329, + "grad_norm": 0.254452109336853, + "learning_rate": 9.141579641782293e-05, + "loss": 0.28532981872558594, + "step": 20910 + }, + { + "epoch": 0.0898139323218533, + "grad_norm": 0.3107840120792389, + "learning_rate": 9.141148469770531e-05, + "loss": 0.2058807373046875, + "step": 20920 + }, + { + "epoch": 0.0898568644118733, + "grad_norm": 0.18128502368927002, + "learning_rate": 9.140717297758769e-05, + "loss": 0.22402665615081788, + "step": 20930 + }, + { + "epoch": 0.08989979650189331, + "grad_norm": 0.6478937268257141, + "learning_rate": 9.140286125747006e-05, + "loss": 0.17081331014633178, + "step": 20940 + }, + { + "epoch": 0.08994272859191331, + "grad_norm": 3.6722099781036377, + "learning_rate": 9.139854953735244e-05, + "loss": 0.14721962213516235, + "step": 20950 + }, + { + "epoch": 0.08998566068193332, + "grad_norm": 0.052497293800115585, + "learning_rate": 9.13942378172348e-05, + "loss": 0.24483840465545653, + "step": 20960 + }, + { + "epoch": 0.09002859277195333, + "grad_norm": 0.8103175759315491, + "learning_rate": 9.138992609711718e-05, + "loss": 0.32439870834350587, + "step": 20970 + }, + { + "epoch": 0.09007152486197333, + "grad_norm": 2.8278698921203613, + "learning_rate": 9.138561437699956e-05, + "loss": 0.40981359481811525, + "step": 20980 + }, + { + "epoch": 0.09011445695199334, + "grad_norm": 0.04258492588996887, + "learning_rate": 9.138130265688194e-05, + "loss": 0.3865856409072876, + "step": 20990 + }, + { + "epoch": 0.09015738904201334, + "grad_norm": 0.7423449158668518, + "learning_rate": 9.137699093676431e-05, + "loss": 0.3491669178009033, + "step": 21000 + }, + { + "epoch": 0.09015738904201334, + "eval_loss": 0.4936206042766571, + "eval_runtime": 27.4382, + "eval_samples_per_second": 3.645, + "eval_steps_per_second": 3.645, + "step": 21000 + }, + { + "epoch": 0.09020032113203336, + "grad_norm": 0.06699751317501068, + "learning_rate": 9.137267921664669e-05, + "loss": 0.2962368011474609, + "step": 21010 + }, + { + "epoch": 0.09024325322205336, + "grad_norm": 0.44763314723968506, + "learning_rate": 9.136836749652907e-05, + "loss": 0.32719204425811765, + "step": 21020 + }, + { + "epoch": 0.09028618531207336, + "grad_norm": 1.079733967781067, + "learning_rate": 9.136405577641145e-05, + "loss": 0.44279727935791013, + "step": 21030 + }, + { + "epoch": 0.09032911740209337, + "grad_norm": 0.30846479535102844, + "learning_rate": 9.135974405629382e-05, + "loss": 0.06859133243560792, + "step": 21040 + }, + { + "epoch": 0.09037204949211337, + "grad_norm": 0.005983108654618263, + "learning_rate": 9.13554323361762e-05, + "loss": 0.29343218803405763, + "step": 21050 + }, + { + "epoch": 0.09041498158213337, + "grad_norm": 6.895624160766602, + "learning_rate": 9.135112061605858e-05, + "loss": 0.2820367097854614, + "step": 21060 + }, + { + "epoch": 0.09045791367215339, + "grad_norm": 1.3584187030792236, + "learning_rate": 9.134680889594095e-05, + "loss": 0.3605871915817261, + "step": 21070 + }, + { + "epoch": 0.09050084576217339, + "grad_norm": 0.22483867406845093, + "learning_rate": 9.134249717582333e-05, + "loss": 0.2500627040863037, + "step": 21080 + }, + { + "epoch": 0.0905437778521934, + "grad_norm": 1.597132682800293, + "learning_rate": 9.133818545570571e-05, + "loss": 0.3377506256103516, + "step": 21090 + }, + { + "epoch": 0.0905867099422134, + "grad_norm": 1.06842041015625, + "learning_rate": 9.133387373558809e-05, + "loss": 0.22344427108764647, + "step": 21100 + }, + { + "epoch": 0.0906296420322334, + "grad_norm": 2.9514353275299072, + "learning_rate": 9.132956201547046e-05, + "loss": 0.30355119705200195, + "step": 21110 + }, + { + "epoch": 0.09067257412225342, + "grad_norm": 0.08379106968641281, + "learning_rate": 9.132525029535284e-05, + "loss": 0.1024692177772522, + "step": 21120 + }, + { + "epoch": 0.09071550621227342, + "grad_norm": 0.6457318663597107, + "learning_rate": 9.13209385752352e-05, + "loss": 0.17067673206329345, + "step": 21130 + }, + { + "epoch": 0.09075843830229344, + "grad_norm": 1.7996233701705933, + "learning_rate": 9.131662685511758e-05, + "loss": 0.2232583522796631, + "step": 21140 + }, + { + "epoch": 0.09080137039231344, + "grad_norm": 1.7304378747940063, + "learning_rate": 9.131231513499996e-05, + "loss": 0.5026605129241943, + "step": 21150 + }, + { + "epoch": 0.09084430248233344, + "grad_norm": 0.26706168055534363, + "learning_rate": 9.130800341488234e-05, + "loss": 0.1385503053665161, + "step": 21160 + }, + { + "epoch": 0.09088723457235345, + "grad_norm": 0.011747085489332676, + "learning_rate": 9.130369169476471e-05, + "loss": 0.18480902910232544, + "step": 21170 + }, + { + "epoch": 0.09093016666237345, + "grad_norm": 0.5335558652877808, + "learning_rate": 9.129937997464709e-05, + "loss": 0.24206953048706054, + "step": 21180 + }, + { + "epoch": 0.09097309875239347, + "grad_norm": 5.258423328399658, + "learning_rate": 9.129506825452947e-05, + "loss": 0.5212468147277832, + "step": 21190 + }, + { + "epoch": 0.09101603084241347, + "grad_norm": 0.03475071117281914, + "learning_rate": 9.129075653441185e-05, + "loss": 0.12896806001663208, + "step": 21200 + }, + { + "epoch": 0.09105896293243347, + "grad_norm": 1.2318061590194702, + "learning_rate": 9.128644481429421e-05, + "loss": 0.41077375411987305, + "step": 21210 + }, + { + "epoch": 0.09110189502245349, + "grad_norm": 0.9526176452636719, + "learning_rate": 9.128213309417659e-05, + "loss": 0.19505181312561035, + "step": 21220 + }, + { + "epoch": 0.09114482711247349, + "grad_norm": 0.38301515579223633, + "learning_rate": 9.127782137405896e-05, + "loss": 0.11115933656692505, + "step": 21230 + }, + { + "epoch": 0.0911877592024935, + "grad_norm": 0.16873271763324738, + "learning_rate": 9.127350965394134e-05, + "loss": 0.10547451972961426, + "step": 21240 + }, + { + "epoch": 0.0912306912925135, + "grad_norm": 0.38345664739608765, + "learning_rate": 9.126919793382372e-05, + "loss": 0.39136838912963867, + "step": 21250 + }, + { + "epoch": 0.0912736233825335, + "grad_norm": 0.0167214535176754, + "learning_rate": 9.12648862137061e-05, + "loss": 0.3666508197784424, + "step": 21260 + }, + { + "epoch": 0.09131655547255352, + "grad_norm": 0.016246158629655838, + "learning_rate": 9.126057449358847e-05, + "loss": 0.22289640903472902, + "step": 21270 + }, + { + "epoch": 0.09135948756257352, + "grad_norm": 3.4992456436157227, + "learning_rate": 9.125626277347085e-05, + "loss": 0.3967678785324097, + "step": 21280 + }, + { + "epoch": 0.09140241965259353, + "grad_norm": 0.15353921055793762, + "learning_rate": 9.125195105335323e-05, + "loss": 0.1501341223716736, + "step": 21290 + }, + { + "epoch": 0.09144535174261353, + "grad_norm": 0.00800306349992752, + "learning_rate": 9.12476393332356e-05, + "loss": 0.08457062244415284, + "step": 21300 + }, + { + "epoch": 0.09148828383263354, + "grad_norm": 0.032460544258356094, + "learning_rate": 9.124332761311798e-05, + "loss": 0.3322447299957275, + "step": 21310 + }, + { + "epoch": 0.09153121592265355, + "grad_norm": 0.012716036289930344, + "learning_rate": 9.123901589300036e-05, + "loss": 0.3365495681762695, + "step": 21320 + }, + { + "epoch": 0.09157414801267355, + "grad_norm": 0.016938934102654457, + "learning_rate": 9.123470417288274e-05, + "loss": 0.07940050959587097, + "step": 21330 + }, + { + "epoch": 0.09161708010269357, + "grad_norm": 0.008436004631221294, + "learning_rate": 9.123039245276512e-05, + "loss": 0.30735015869140625, + "step": 21340 + }, + { + "epoch": 0.09166001219271357, + "grad_norm": 1.228560447692871, + "learning_rate": 9.122608073264749e-05, + "loss": 0.6153797626495361, + "step": 21350 + }, + { + "epoch": 0.09170294428273357, + "grad_norm": 2.067349433898926, + "learning_rate": 9.122176901252987e-05, + "loss": 0.06948390603065491, + "step": 21360 + }, + { + "epoch": 0.09174587637275358, + "grad_norm": 1.3888169527053833, + "learning_rate": 9.121745729241223e-05, + "loss": 0.472440767288208, + "step": 21370 + }, + { + "epoch": 0.09178880846277358, + "grad_norm": 0.07951004058122635, + "learning_rate": 9.121314557229461e-05, + "loss": 0.36643662452697756, + "step": 21380 + }, + { + "epoch": 0.0918317405527936, + "grad_norm": 0.20464085042476654, + "learning_rate": 9.120883385217699e-05, + "loss": 0.2653449535369873, + "step": 21390 + }, + { + "epoch": 0.0918746726428136, + "grad_norm": 0.1735410988330841, + "learning_rate": 9.120452213205937e-05, + "loss": 0.312151837348938, + "step": 21400 + }, + { + "epoch": 0.0919176047328336, + "grad_norm": 0.07625383883714676, + "learning_rate": 9.120021041194174e-05, + "loss": 0.13038358688354493, + "step": 21410 + }, + { + "epoch": 0.09196053682285361, + "grad_norm": 0.16427874565124512, + "learning_rate": 9.119589869182412e-05, + "loss": 0.18846406936645507, + "step": 21420 + }, + { + "epoch": 0.09200346891287361, + "grad_norm": 0.762389600276947, + "learning_rate": 9.11915869717065e-05, + "loss": 0.3079734563827515, + "step": 21430 + }, + { + "epoch": 0.09204640100289363, + "grad_norm": 0.22876478731632233, + "learning_rate": 9.118727525158888e-05, + "loss": 0.3454793691635132, + "step": 21440 + }, + { + "epoch": 0.09208933309291363, + "grad_norm": 1.3125431537628174, + "learning_rate": 9.118296353147125e-05, + "loss": 0.23138630390167236, + "step": 21450 + }, + { + "epoch": 0.09213226518293363, + "grad_norm": 2.790705442428589, + "learning_rate": 9.117865181135362e-05, + "loss": 0.1917582631111145, + "step": 21460 + }, + { + "epoch": 0.09217519727295365, + "grad_norm": 0.008920938707888126, + "learning_rate": 9.1174340091236e-05, + "loss": 0.30131969451904295, + "step": 21470 + }, + { + "epoch": 0.09221812936297365, + "grad_norm": 0.010895448736846447, + "learning_rate": 9.117002837111837e-05, + "loss": 0.21453914642333985, + "step": 21480 + }, + { + "epoch": 0.09226106145299365, + "grad_norm": 0.05802464112639427, + "learning_rate": 9.116571665100075e-05, + "loss": 0.18757236003875732, + "step": 21490 + }, + { + "epoch": 0.09230399354301366, + "grad_norm": 0.8384543061256409, + "learning_rate": 9.116140493088313e-05, + "loss": 0.6185249805450439, + "step": 21500 + }, + { + "epoch": 0.09234692563303366, + "grad_norm": 0.008186266757547855, + "learning_rate": 9.11570932107655e-05, + "loss": 0.31588795185089114, + "step": 21510 + }, + { + "epoch": 0.09238985772305368, + "grad_norm": 0.6603401303291321, + "learning_rate": 9.115278149064788e-05, + "loss": 0.37519667148590086, + "step": 21520 + }, + { + "epoch": 0.09243278981307368, + "grad_norm": 0.041270140558481216, + "learning_rate": 9.114846977053027e-05, + "loss": 0.014683444797992707, + "step": 21530 + }, + { + "epoch": 0.09247572190309368, + "grad_norm": 4.466777801513672, + "learning_rate": 9.114415805041264e-05, + "loss": 0.11065888404846191, + "step": 21540 + }, + { + "epoch": 0.0925186539931137, + "grad_norm": 2.9261770248413086, + "learning_rate": 9.113984633029501e-05, + "loss": 0.28797388076782227, + "step": 21550 + }, + { + "epoch": 0.0925615860831337, + "grad_norm": 0.0005723032518289983, + "learning_rate": 9.113553461017739e-05, + "loss": 0.35480945110321044, + "step": 21560 + }, + { + "epoch": 0.09260451817315371, + "grad_norm": 0.1928175538778305, + "learning_rate": 9.113122289005977e-05, + "loss": 0.24199821949005126, + "step": 21570 + }, + { + "epoch": 0.09264745026317371, + "grad_norm": 1.50874924659729, + "learning_rate": 9.112691116994214e-05, + "loss": 0.16585899591445924, + "step": 21580 + }, + { + "epoch": 0.09269038235319371, + "grad_norm": 0.005837564822286367, + "learning_rate": 9.112259944982452e-05, + "loss": 0.14847633838653565, + "step": 21590 + }, + { + "epoch": 0.09273331444321373, + "grad_norm": 0.0028368725907057524, + "learning_rate": 9.11182877297069e-05, + "loss": 0.19323831796646118, + "step": 21600 + }, + { + "epoch": 0.09277624653323373, + "grad_norm": 0.29099443554878235, + "learning_rate": 9.111397600958928e-05, + "loss": 0.16607074737548827, + "step": 21610 + }, + { + "epoch": 0.09281917862325374, + "grad_norm": 0.8555635213851929, + "learning_rate": 9.110966428947164e-05, + "loss": 0.44680633544921877, + "step": 21620 + }, + { + "epoch": 0.09286211071327374, + "grad_norm": 0.030142752453684807, + "learning_rate": 9.110535256935402e-05, + "loss": 0.3203817129135132, + "step": 21630 + }, + { + "epoch": 0.09290504280329374, + "grad_norm": 3.880009174346924, + "learning_rate": 9.11010408492364e-05, + "loss": 0.11551387310028076, + "step": 21640 + }, + { + "epoch": 0.09294797489331376, + "grad_norm": 0.01011581439524889, + "learning_rate": 9.109672912911877e-05, + "loss": 0.27714385986328127, + "step": 21650 + }, + { + "epoch": 0.09299090698333376, + "grad_norm": 0.052719537168741226, + "learning_rate": 9.109241740900115e-05, + "loss": 0.21967732906341553, + "step": 21660 + }, + { + "epoch": 0.09303383907335377, + "grad_norm": 0.10481604933738708, + "learning_rate": 9.108810568888353e-05, + "loss": 0.35539584159851073, + "step": 21670 + }, + { + "epoch": 0.09307677116337378, + "grad_norm": 1.0373491048812866, + "learning_rate": 9.10837939687659e-05, + "loss": 0.354030704498291, + "step": 21680 + }, + { + "epoch": 0.09311970325339378, + "grad_norm": 0.013149775564670563, + "learning_rate": 9.107948224864828e-05, + "loss": 0.2582766056060791, + "step": 21690 + }, + { + "epoch": 0.09316263534341379, + "grad_norm": 3.9368278980255127, + "learning_rate": 9.107517052853065e-05, + "loss": 0.14385126829147338, + "step": 21700 + }, + { + "epoch": 0.09320556743343379, + "grad_norm": 0.036791346967220306, + "learning_rate": 9.107085880841302e-05, + "loss": 0.17863940000534057, + "step": 21710 + }, + { + "epoch": 0.0932484995234538, + "grad_norm": 0.7378762364387512, + "learning_rate": 9.10665470882954e-05, + "loss": 0.20115478038787843, + "step": 21720 + }, + { + "epoch": 0.09329143161347381, + "grad_norm": 2.740835428237915, + "learning_rate": 9.106223536817778e-05, + "loss": 0.2319530725479126, + "step": 21730 + }, + { + "epoch": 0.09333436370349381, + "grad_norm": 0.07379814237356186, + "learning_rate": 9.105792364806016e-05, + "loss": 0.37325448989868165, + "step": 21740 + }, + { + "epoch": 0.09337729579351382, + "grad_norm": 0.0846996009349823, + "learning_rate": 9.105361192794255e-05, + "loss": 0.3439117431640625, + "step": 21750 + }, + { + "epoch": 0.09342022788353382, + "grad_norm": 1.408515214920044, + "learning_rate": 9.104930020782492e-05, + "loss": 0.2008873462677002, + "step": 21760 + }, + { + "epoch": 0.09346315997355384, + "grad_norm": 0.10412348061800003, + "learning_rate": 9.10449884877073e-05, + "loss": 0.44175071716308595, + "step": 21770 + }, + { + "epoch": 0.09350609206357384, + "grad_norm": 0.14311371743679047, + "learning_rate": 9.104067676758968e-05, + "loss": 0.2105050802230835, + "step": 21780 + }, + { + "epoch": 0.09354902415359384, + "grad_norm": 0.5638694167137146, + "learning_rate": 9.103636504747204e-05, + "loss": 0.1521025061607361, + "step": 21790 + }, + { + "epoch": 0.09359195624361386, + "grad_norm": 2.0134546756744385, + "learning_rate": 9.103205332735442e-05, + "loss": 0.2705402374267578, + "step": 21800 + }, + { + "epoch": 0.09363488833363386, + "grad_norm": 11.786382675170898, + "learning_rate": 9.10277416072368e-05, + "loss": 0.19081075191497804, + "step": 21810 + }, + { + "epoch": 0.09367782042365387, + "grad_norm": 0.10052203387022018, + "learning_rate": 9.102342988711917e-05, + "loss": 0.014866837859153747, + "step": 21820 + }, + { + "epoch": 0.09372075251367387, + "grad_norm": 1.2548933029174805, + "learning_rate": 9.101911816700155e-05, + "loss": 0.2774821281433105, + "step": 21830 + }, + { + "epoch": 0.09376368460369387, + "grad_norm": 0.7686817646026611, + "learning_rate": 9.101480644688393e-05, + "loss": 0.30581719875335694, + "step": 21840 + }, + { + "epoch": 0.09380661669371389, + "grad_norm": 3.6177711486816406, + "learning_rate": 9.10104947267663e-05, + "loss": 0.27634706497192385, + "step": 21850 + }, + { + "epoch": 0.09384954878373389, + "grad_norm": 0.13538040220737457, + "learning_rate": 9.100618300664868e-05, + "loss": 0.11370041370391845, + "step": 21860 + }, + { + "epoch": 0.0938924808737539, + "grad_norm": 0.8200361132621765, + "learning_rate": 9.100187128653105e-05, + "loss": 0.24458799362182618, + "step": 21870 + }, + { + "epoch": 0.0939354129637739, + "grad_norm": 0.12220072746276855, + "learning_rate": 9.099755956641342e-05, + "loss": 0.19076045751571655, + "step": 21880 + }, + { + "epoch": 0.0939783450537939, + "grad_norm": 0.4327422082424164, + "learning_rate": 9.09932478462958e-05, + "loss": 0.2103203535079956, + "step": 21890 + }, + { + "epoch": 0.09402127714381392, + "grad_norm": 0.013189369812607765, + "learning_rate": 9.098893612617818e-05, + "loss": 0.2407254695892334, + "step": 21900 + }, + { + "epoch": 0.09406420923383392, + "grad_norm": 1.8307231664657593, + "learning_rate": 9.098462440606056e-05, + "loss": 0.39395933151245116, + "step": 21910 + }, + { + "epoch": 0.09410714132385392, + "grad_norm": 1.2831162214279175, + "learning_rate": 9.098031268594293e-05, + "loss": 0.1422368049621582, + "step": 21920 + }, + { + "epoch": 0.09415007341387394, + "grad_norm": 4.390259265899658, + "learning_rate": 9.097600096582531e-05, + "loss": 0.21208109855651855, + "step": 21930 + }, + { + "epoch": 0.09419300550389394, + "grad_norm": 2.1142640113830566, + "learning_rate": 9.097168924570769e-05, + "loss": 0.21083295345306396, + "step": 21940 + }, + { + "epoch": 0.09423593759391395, + "grad_norm": 0.3554537892341614, + "learning_rate": 9.096737752559005e-05, + "loss": 0.18508408069610596, + "step": 21950 + }, + { + "epoch": 0.09427886968393395, + "grad_norm": 1.1310135126113892, + "learning_rate": 9.096306580547243e-05, + "loss": 0.4078618049621582, + "step": 21960 + }, + { + "epoch": 0.09432180177395395, + "grad_norm": 0.005195611622184515, + "learning_rate": 9.095875408535482e-05, + "loss": 0.19787473678588868, + "step": 21970 + }, + { + "epoch": 0.09436473386397397, + "grad_norm": 0.0982283353805542, + "learning_rate": 9.09544423652372e-05, + "loss": 0.10687346458435058, + "step": 21980 + }, + { + "epoch": 0.09440766595399397, + "grad_norm": 0.14892923831939697, + "learning_rate": 9.095013064511958e-05, + "loss": 0.36613714694976807, + "step": 21990 + }, + { + "epoch": 0.09445059804401398, + "grad_norm": 0.037686608731746674, + "learning_rate": 9.094581892500195e-05, + "loss": 0.2646932125091553, + "step": 22000 + }, + { + "epoch": 0.09445059804401398, + "eval_loss": 0.49971094727516174, + "eval_runtime": 27.5223, + "eval_samples_per_second": 3.633, + "eval_steps_per_second": 3.633, + "step": 22000 + }, + { + "epoch": 0.09449353013403398, + "grad_norm": 0.008538651280105114, + "learning_rate": 9.094150720488433e-05, + "loss": 0.18507496118545533, + "step": 22010 + }, + { + "epoch": 0.09453646222405399, + "grad_norm": 1.6707645654678345, + "learning_rate": 9.093719548476671e-05, + "loss": 0.39339451789855956, + "step": 22020 + }, + { + "epoch": 0.094579394314074, + "grad_norm": 5.307521820068359, + "learning_rate": 9.093288376464907e-05, + "loss": 0.35367393493652344, + "step": 22030 + }, + { + "epoch": 0.094622326404094, + "grad_norm": 0.9975845217704773, + "learning_rate": 9.092857204453145e-05, + "loss": 0.35877773761749265, + "step": 22040 + }, + { + "epoch": 0.09466525849411402, + "grad_norm": 0.07139477878808975, + "learning_rate": 9.092426032441383e-05, + "loss": 0.2376739501953125, + "step": 22050 + }, + { + "epoch": 0.09470819058413402, + "grad_norm": 0.013264146633446217, + "learning_rate": 9.09199486042962e-05, + "loss": 0.3740490674972534, + "step": 22060 + }, + { + "epoch": 0.09475112267415402, + "grad_norm": 1.3234672546386719, + "learning_rate": 9.091563688417858e-05, + "loss": 0.35382215976715087, + "step": 22070 + }, + { + "epoch": 0.09479405476417403, + "grad_norm": 0.0955483466386795, + "learning_rate": 9.091132516406096e-05, + "loss": 0.20790884494781495, + "step": 22080 + }, + { + "epoch": 0.09483698685419403, + "grad_norm": 0.02523432858288288, + "learning_rate": 9.090701344394333e-05, + "loss": 0.19438637495040895, + "step": 22090 + }, + { + "epoch": 0.09487991894421405, + "grad_norm": 0.7263586521148682, + "learning_rate": 9.090270172382571e-05, + "loss": 0.31020758152008054, + "step": 22100 + }, + { + "epoch": 0.09492285103423405, + "grad_norm": 0.03434018790721893, + "learning_rate": 9.089839000370808e-05, + "loss": 0.2784116744995117, + "step": 22110 + }, + { + "epoch": 0.09496578312425405, + "grad_norm": 0.05946849659085274, + "learning_rate": 9.089407828359045e-05, + "loss": 0.28342130184173586, + "step": 22120 + }, + { + "epoch": 0.09500871521427406, + "grad_norm": 0.05106307566165924, + "learning_rate": 9.088976656347283e-05, + "loss": 0.17503679990768434, + "step": 22130 + }, + { + "epoch": 0.09505164730429407, + "grad_norm": 5.778275489807129, + "learning_rate": 9.088545484335521e-05, + "loss": 0.32182085514068604, + "step": 22140 + }, + { + "epoch": 0.09509457939431408, + "grad_norm": 0.1018679141998291, + "learning_rate": 9.088114312323759e-05, + "loss": 0.22511820793151854, + "step": 22150 + }, + { + "epoch": 0.09513751148433408, + "grad_norm": 0.041737183928489685, + "learning_rate": 9.087683140311996e-05, + "loss": 0.5281650066375733, + "step": 22160 + }, + { + "epoch": 0.09518044357435408, + "grad_norm": 0.528108537197113, + "learning_rate": 9.087251968300234e-05, + "loss": 0.24609570503234862, + "step": 22170 + }, + { + "epoch": 0.0952233756643741, + "grad_norm": 2.166268825531006, + "learning_rate": 9.086820796288472e-05, + "loss": 0.1980876326560974, + "step": 22180 + }, + { + "epoch": 0.0952663077543941, + "grad_norm": 0.004012465942651033, + "learning_rate": 9.08638962427671e-05, + "loss": 0.08182164430618286, + "step": 22190 + }, + { + "epoch": 0.09530923984441411, + "grad_norm": 0.8156777620315552, + "learning_rate": 9.085958452264947e-05, + "loss": 0.34822816848754884, + "step": 22200 + }, + { + "epoch": 0.09535217193443411, + "grad_norm": 0.11005112528800964, + "learning_rate": 9.085527280253185e-05, + "loss": 0.3492276191711426, + "step": 22210 + }, + { + "epoch": 0.09539510402445411, + "grad_norm": 1.879366397857666, + "learning_rate": 9.085096108241423e-05, + "loss": 0.1570887804031372, + "step": 22220 + }, + { + "epoch": 0.09543803611447413, + "grad_norm": 3.163851499557495, + "learning_rate": 9.08466493622966e-05, + "loss": 0.20503544807434082, + "step": 22230 + }, + { + "epoch": 0.09548096820449413, + "grad_norm": 0.0023948336020112038, + "learning_rate": 9.084233764217898e-05, + "loss": 0.28005211353302, + "step": 22240 + }, + { + "epoch": 0.09552390029451414, + "grad_norm": 0.2130800187587738, + "learning_rate": 9.083802592206136e-05, + "loss": 0.4176482677459717, + "step": 22250 + }, + { + "epoch": 0.09556683238453414, + "grad_norm": 0.28723445534706116, + "learning_rate": 9.083371420194374e-05, + "loss": 0.15389590263366698, + "step": 22260 + }, + { + "epoch": 0.09560976447455415, + "grad_norm": 1.2294330596923828, + "learning_rate": 9.082940248182611e-05, + "loss": 0.19988157749176025, + "step": 22270 + }, + { + "epoch": 0.09565269656457416, + "grad_norm": 1.211296796798706, + "learning_rate": 9.082509076170848e-05, + "loss": 0.1836371898651123, + "step": 22280 + }, + { + "epoch": 0.09569562865459416, + "grad_norm": 0.012455378659069538, + "learning_rate": 9.082077904159085e-05, + "loss": 0.18517324924468995, + "step": 22290 + }, + { + "epoch": 0.09573856074461418, + "grad_norm": 3.2311062812805176, + "learning_rate": 9.081646732147323e-05, + "loss": 0.12784696817398072, + "step": 22300 + }, + { + "epoch": 0.09578149283463418, + "grad_norm": 0.06623481214046478, + "learning_rate": 9.081215560135561e-05, + "loss": 0.3080892086029053, + "step": 22310 + }, + { + "epoch": 0.09582442492465418, + "grad_norm": 1.6185115575790405, + "learning_rate": 9.080784388123799e-05, + "loss": 0.13115832805633545, + "step": 22320 + }, + { + "epoch": 0.09586735701467419, + "grad_norm": 2.0833399295806885, + "learning_rate": 9.080353216112036e-05, + "loss": 0.25316872596740725, + "step": 22330 + }, + { + "epoch": 0.0959102891046942, + "grad_norm": 1.0511176586151123, + "learning_rate": 9.079922044100274e-05, + "loss": 0.365296459197998, + "step": 22340 + }, + { + "epoch": 0.0959532211947142, + "grad_norm": 0.12100150436162949, + "learning_rate": 9.079490872088512e-05, + "loss": 0.41388521194458006, + "step": 22350 + }, + { + "epoch": 0.09599615328473421, + "grad_norm": 0.14868609607219696, + "learning_rate": 9.079059700076748e-05, + "loss": 0.0745149314403534, + "step": 22360 + }, + { + "epoch": 0.09603908537475421, + "grad_norm": 4.128106117248535, + "learning_rate": 9.078628528064986e-05, + "loss": 0.17585846185684204, + "step": 22370 + }, + { + "epoch": 0.09608201746477422, + "grad_norm": 0.2254243791103363, + "learning_rate": 9.078197356053224e-05, + "loss": 0.14409635066986085, + "step": 22380 + }, + { + "epoch": 0.09612494955479423, + "grad_norm": 0.004599638283252716, + "learning_rate": 9.077766184041461e-05, + "loss": 0.26925704479217527, + "step": 22390 + }, + { + "epoch": 0.09616788164481423, + "grad_norm": 1.0302200317382812, + "learning_rate": 9.077335012029699e-05, + "loss": 0.3191660165786743, + "step": 22400 + }, + { + "epoch": 0.09621081373483424, + "grad_norm": 15.162273406982422, + "learning_rate": 9.076903840017937e-05, + "loss": 0.28926901817321776, + "step": 22410 + }, + { + "epoch": 0.09625374582485424, + "grad_norm": 0.34869951009750366, + "learning_rate": 9.076472668006175e-05, + "loss": 0.21189517974853517, + "step": 22420 + }, + { + "epoch": 0.09629667791487426, + "grad_norm": 1.3535178899765015, + "learning_rate": 9.076041495994412e-05, + "loss": 0.22193832397460939, + "step": 22430 + }, + { + "epoch": 0.09633961000489426, + "grad_norm": 0.17086510360240936, + "learning_rate": 9.07561032398265e-05, + "loss": 0.40629210472106936, + "step": 22440 + }, + { + "epoch": 0.09638254209491426, + "grad_norm": 0.9439690113067627, + "learning_rate": 9.075179151970888e-05, + "loss": 0.24682340621948243, + "step": 22450 + }, + { + "epoch": 0.09642547418493427, + "grad_norm": 0.005355150904506445, + "learning_rate": 9.074747979959126e-05, + "loss": 0.21591801643371583, + "step": 22460 + }, + { + "epoch": 0.09646840627495427, + "grad_norm": 12.08975601196289, + "learning_rate": 9.074316807947363e-05, + "loss": 0.29039928913116453, + "step": 22470 + }, + { + "epoch": 0.09651133836497429, + "grad_norm": 0.013028501532971859, + "learning_rate": 9.073885635935601e-05, + "loss": 0.29697093963623045, + "step": 22480 + }, + { + "epoch": 0.09655427045499429, + "grad_norm": 2.355160713195801, + "learning_rate": 9.073454463923839e-05, + "loss": 0.3402007818222046, + "step": 22490 + }, + { + "epoch": 0.09659720254501429, + "grad_norm": 0.0010523615637794137, + "learning_rate": 9.073023291912077e-05, + "loss": 0.03701513111591339, + "step": 22500 + }, + { + "epoch": 0.0966401346350343, + "grad_norm": 0.015033922158181667, + "learning_rate": 9.072592119900314e-05, + "loss": 0.23244831562042237, + "step": 22510 + }, + { + "epoch": 0.0966830667250543, + "grad_norm": 0.0025452927220612764, + "learning_rate": 9.072160947888552e-05, + "loss": 0.24909675121307373, + "step": 22520 + }, + { + "epoch": 0.09672599881507432, + "grad_norm": 2.8800175189971924, + "learning_rate": 9.071729775876788e-05, + "loss": 0.46021552085876466, + "step": 22530 + }, + { + "epoch": 0.09676893090509432, + "grad_norm": 0.0061400760896503925, + "learning_rate": 9.071298603865026e-05, + "loss": 0.20453925132751466, + "step": 22540 + }, + { + "epoch": 0.09681186299511432, + "grad_norm": 0.036182962357997894, + "learning_rate": 9.070867431853264e-05, + "loss": 0.1192806601524353, + "step": 22550 + }, + { + "epoch": 0.09685479508513434, + "grad_norm": 0.09762956947088242, + "learning_rate": 9.070436259841502e-05, + "loss": 0.16233222484588622, + "step": 22560 + }, + { + "epoch": 0.09689772717515434, + "grad_norm": 2.074234962463379, + "learning_rate": 9.070005087829739e-05, + "loss": 0.3488273620605469, + "step": 22570 + }, + { + "epoch": 0.09694065926517435, + "grad_norm": 0.06691594421863556, + "learning_rate": 9.069573915817977e-05, + "loss": 0.09470370411872864, + "step": 22580 + }, + { + "epoch": 0.09698359135519435, + "grad_norm": 0.08174016326665878, + "learning_rate": 9.069142743806215e-05, + "loss": 0.0023634165525436403, + "step": 22590 + }, + { + "epoch": 0.09702652344521435, + "grad_norm": 0.0033455390948802233, + "learning_rate": 9.068711571794453e-05, + "loss": 0.17757070064544678, + "step": 22600 + }, + { + "epoch": 0.09706945553523437, + "grad_norm": 0.14201530814170837, + "learning_rate": 9.068280399782689e-05, + "loss": 0.13758721351623535, + "step": 22610 + }, + { + "epoch": 0.09711238762525437, + "grad_norm": 5.6771674156188965, + "learning_rate": 9.067849227770927e-05, + "loss": 0.07379586100578309, + "step": 22620 + }, + { + "epoch": 0.09715531971527439, + "grad_norm": 1.037272572517395, + "learning_rate": 9.067418055759164e-05, + "loss": 0.21951465606689452, + "step": 22630 + }, + { + "epoch": 0.09719825180529439, + "grad_norm": 0.0346045047044754, + "learning_rate": 9.066986883747402e-05, + "loss": 0.3791325330734253, + "step": 22640 + }, + { + "epoch": 0.09724118389531439, + "grad_norm": 1.0330848693847656, + "learning_rate": 9.06655571173564e-05, + "loss": 0.40000143051147463, + "step": 22650 + }, + { + "epoch": 0.0972841159853344, + "grad_norm": 0.19888627529144287, + "learning_rate": 9.066124539723878e-05, + "loss": 0.2448514938354492, + "step": 22660 + }, + { + "epoch": 0.0973270480753544, + "grad_norm": 0.08020024746656418, + "learning_rate": 9.065693367712115e-05, + "loss": 0.09969213604927063, + "step": 22670 + }, + { + "epoch": 0.09736998016537442, + "grad_norm": 2.1089484691619873, + "learning_rate": 9.065262195700353e-05, + "loss": 0.5607002735137939, + "step": 22680 + }, + { + "epoch": 0.09741291225539442, + "grad_norm": 0.018384765833616257, + "learning_rate": 9.064831023688591e-05, + "loss": 0.317985463142395, + "step": 22690 + }, + { + "epoch": 0.09745584434541442, + "grad_norm": 0.06176162511110306, + "learning_rate": 9.064399851676829e-05, + "loss": 0.19674248695373536, + "step": 22700 + }, + { + "epoch": 0.09749877643543443, + "grad_norm": 0.00424107676371932, + "learning_rate": 9.063968679665066e-05, + "loss": 0.24540879726409912, + "step": 22710 + }, + { + "epoch": 0.09754170852545443, + "grad_norm": 0.09543658047914505, + "learning_rate": 9.063537507653304e-05, + "loss": 0.35407238006591796, + "step": 22720 + }, + { + "epoch": 0.09758464061547445, + "grad_norm": 0.019383370876312256, + "learning_rate": 9.063106335641542e-05, + "loss": 0.1949351906776428, + "step": 22730 + }, + { + "epoch": 0.09762757270549445, + "grad_norm": 0.008234373293817043, + "learning_rate": 9.06267516362978e-05, + "loss": 0.4226356029510498, + "step": 22740 + }, + { + "epoch": 0.09767050479551445, + "grad_norm": 1.8291183710098267, + "learning_rate": 9.062243991618017e-05, + "loss": 0.25851223468780515, + "step": 22750 + }, + { + "epoch": 0.09771343688553447, + "grad_norm": 4.839968681335449, + "learning_rate": 9.061812819606255e-05, + "loss": 0.21467921733856202, + "step": 22760 + }, + { + "epoch": 0.09775636897555447, + "grad_norm": 0.09866461157798767, + "learning_rate": 9.061381647594491e-05, + "loss": 0.27601807117462157, + "step": 22770 + }, + { + "epoch": 0.09779930106557447, + "grad_norm": 7.941098690032959, + "learning_rate": 9.060950475582729e-05, + "loss": 0.3188823699951172, + "step": 22780 + }, + { + "epoch": 0.09784223315559448, + "grad_norm": 1.3161903619766235, + "learning_rate": 9.060519303570967e-05, + "loss": 0.261748743057251, + "step": 22790 + }, + { + "epoch": 0.09788516524561448, + "grad_norm": 1.3355660438537598, + "learning_rate": 9.060088131559204e-05, + "loss": 0.3479597568511963, + "step": 22800 + }, + { + "epoch": 0.0979280973356345, + "grad_norm": 2.689037322998047, + "learning_rate": 9.059656959547442e-05, + "loss": 0.3787343502044678, + "step": 22810 + }, + { + "epoch": 0.0979710294256545, + "grad_norm": 1.72267484664917, + "learning_rate": 9.05922578753568e-05, + "loss": 0.21570446491241455, + "step": 22820 + }, + { + "epoch": 0.0980139615156745, + "grad_norm": 9.35726547241211, + "learning_rate": 9.058794615523918e-05, + "loss": 0.21867167949676514, + "step": 22830 + }, + { + "epoch": 0.09805689360569451, + "grad_norm": 3.232853412628174, + "learning_rate": 9.058363443512155e-05, + "loss": 0.12764328718185425, + "step": 22840 + }, + { + "epoch": 0.09809982569571452, + "grad_norm": 0.0028268240857869387, + "learning_rate": 9.057932271500392e-05, + "loss": 0.273606538772583, + "step": 22850 + }, + { + "epoch": 0.09814275778573453, + "grad_norm": 0.5840247273445129, + "learning_rate": 9.05750109948863e-05, + "loss": 0.2192753553390503, + "step": 22860 + }, + { + "epoch": 0.09818568987575453, + "grad_norm": 0.022768119350075722, + "learning_rate": 9.057069927476867e-05, + "loss": 0.32812683582305907, + "step": 22870 + }, + { + "epoch": 0.09822862196577453, + "grad_norm": 0.01956142857670784, + "learning_rate": 9.056638755465105e-05, + "loss": 0.10067062377929688, + "step": 22880 + }, + { + "epoch": 0.09827155405579455, + "grad_norm": 0.9932413697242737, + "learning_rate": 9.056207583453343e-05, + "loss": 0.26803138256073, + "step": 22890 + }, + { + "epoch": 0.09831448614581455, + "grad_norm": 0.022333988919854164, + "learning_rate": 9.05577641144158e-05, + "loss": 0.20267021656036377, + "step": 22900 + }, + { + "epoch": 0.09835741823583456, + "grad_norm": 0.13647231459617615, + "learning_rate": 9.055345239429818e-05, + "loss": 0.33862149715423584, + "step": 22910 + }, + { + "epoch": 0.09840035032585456, + "grad_norm": 0.15876804292201996, + "learning_rate": 9.054914067418056e-05, + "loss": 0.21299707889556885, + "step": 22920 + }, + { + "epoch": 0.09844328241587456, + "grad_norm": 0.18488116562366486, + "learning_rate": 9.054482895406294e-05, + "loss": 0.20834004878997803, + "step": 22930 + }, + { + "epoch": 0.09848621450589458, + "grad_norm": 0.08750098943710327, + "learning_rate": 9.054051723394531e-05, + "loss": 0.20142159461975098, + "step": 22940 + }, + { + "epoch": 0.09852914659591458, + "grad_norm": 0.7470207810401917, + "learning_rate": 9.053620551382769e-05, + "loss": 0.2614895343780518, + "step": 22950 + }, + { + "epoch": 0.0985720786859346, + "grad_norm": 8.173144340515137, + "learning_rate": 9.053189379371007e-05, + "loss": 0.11097879409790039, + "step": 22960 + }, + { + "epoch": 0.0986150107759546, + "grad_norm": 0.004678850993514061, + "learning_rate": 9.052758207359245e-05, + "loss": 0.24954421520233155, + "step": 22970 + }, + { + "epoch": 0.0986579428659746, + "grad_norm": 0.14163123071193695, + "learning_rate": 9.052327035347482e-05, + "loss": 0.1883327603340149, + "step": 22980 + }, + { + "epoch": 0.09870087495599461, + "grad_norm": 1.834106206893921, + "learning_rate": 9.05189586333572e-05, + "loss": 0.19506003856658935, + "step": 22990 + }, + { + "epoch": 0.09874380704601461, + "grad_norm": 0.9665418863296509, + "learning_rate": 9.051464691323958e-05, + "loss": 0.38878982067108153, + "step": 23000 + }, + { + "epoch": 0.09874380704601461, + "eval_loss": 0.48440316319465637, + "eval_runtime": 27.4341, + "eval_samples_per_second": 3.645, + "eval_steps_per_second": 3.645, + "step": 23000 + }, + { + "epoch": 0.09878673913603463, + "grad_norm": 13.875965118408203, + "learning_rate": 9.051033519312196e-05, + "loss": 0.2893874168395996, + "step": 23010 + }, + { + "epoch": 0.09882967122605463, + "grad_norm": 0.04340042173862457, + "learning_rate": 9.050602347300432e-05, + "loss": 0.3228999137878418, + "step": 23020 + }, + { + "epoch": 0.09887260331607463, + "grad_norm": 0.05363241955637932, + "learning_rate": 9.05017117528867e-05, + "loss": 0.36945428848266604, + "step": 23030 + }, + { + "epoch": 0.09891553540609464, + "grad_norm": 0.15401561558246613, + "learning_rate": 9.049740003276907e-05, + "loss": 0.19055159091949464, + "step": 23040 + }, + { + "epoch": 0.09895846749611464, + "grad_norm": 0.008086834102869034, + "learning_rate": 9.049308831265145e-05, + "loss": 0.010691716521978378, + "step": 23050 + }, + { + "epoch": 0.09900139958613466, + "grad_norm": 0.08624149858951569, + "learning_rate": 9.048877659253383e-05, + "loss": 0.15782138109207153, + "step": 23060 + }, + { + "epoch": 0.09904433167615466, + "grad_norm": 0.022653276100754738, + "learning_rate": 9.04844648724162e-05, + "loss": 0.09881403446197509, + "step": 23070 + }, + { + "epoch": 0.09908726376617466, + "grad_norm": 0.002614100929349661, + "learning_rate": 9.048015315229858e-05, + "loss": 0.33692307472229005, + "step": 23080 + }, + { + "epoch": 0.09913019585619467, + "grad_norm": 1.3454197645187378, + "learning_rate": 9.047584143218096e-05, + "loss": 0.37106547355651853, + "step": 23090 + }, + { + "epoch": 0.09917312794621468, + "grad_norm": 0.00979001447558403, + "learning_rate": 9.047152971206332e-05, + "loss": 0.2598975896835327, + "step": 23100 + }, + { + "epoch": 0.09921606003623469, + "grad_norm": 23.73790740966797, + "learning_rate": 9.04672179919457e-05, + "loss": 0.33892526626586916, + "step": 23110 + }, + { + "epoch": 0.09925899212625469, + "grad_norm": 6.083474636077881, + "learning_rate": 9.046290627182808e-05, + "loss": 0.25849740505218505, + "step": 23120 + }, + { + "epoch": 0.09930192421627469, + "grad_norm": 0.8224849104881287, + "learning_rate": 9.045859455171046e-05, + "loss": 0.4263033390045166, + "step": 23130 + }, + { + "epoch": 0.09934485630629471, + "grad_norm": 0.017254041507840157, + "learning_rate": 9.045428283159283e-05, + "loss": 0.17393676042556763, + "step": 23140 + }, + { + "epoch": 0.09938778839631471, + "grad_norm": 0.10545994341373444, + "learning_rate": 9.044997111147521e-05, + "loss": 0.29897661209106446, + "step": 23150 + }, + { + "epoch": 0.09943072048633471, + "grad_norm": 0.36653459072113037, + "learning_rate": 9.04456593913576e-05, + "loss": 0.2769860506057739, + "step": 23160 + }, + { + "epoch": 0.09947365257635472, + "grad_norm": 1.0582211017608643, + "learning_rate": 9.044134767123998e-05, + "loss": 0.32262184619903567, + "step": 23170 + }, + { + "epoch": 0.09951658466637472, + "grad_norm": 10.352930068969727, + "learning_rate": 9.043703595112234e-05, + "loss": 0.2117457628250122, + "step": 23180 + }, + { + "epoch": 0.09955951675639474, + "grad_norm": 0.03504159301519394, + "learning_rate": 9.043272423100472e-05, + "loss": 0.14238468408584595, + "step": 23190 + }, + { + "epoch": 0.09960244884641474, + "grad_norm": 0.02711699716746807, + "learning_rate": 9.04284125108871e-05, + "loss": 0.19559116363525392, + "step": 23200 + }, + { + "epoch": 0.09964538093643474, + "grad_norm": 0.0065023526549339294, + "learning_rate": 9.042410079076948e-05, + "loss": 0.08624934554100036, + "step": 23210 + }, + { + "epoch": 0.09968831302645476, + "grad_norm": 0.043089572340250015, + "learning_rate": 9.041978907065185e-05, + "loss": 0.07906042337417603, + "step": 23220 + }, + { + "epoch": 0.09973124511647476, + "grad_norm": 0.021948212757706642, + "learning_rate": 9.041547735053423e-05, + "loss": 0.2752461671829224, + "step": 23230 + }, + { + "epoch": 0.09977417720649477, + "grad_norm": 0.012341726571321487, + "learning_rate": 9.041116563041661e-05, + "loss": 0.18004517555236815, + "step": 23240 + }, + { + "epoch": 0.09981710929651477, + "grad_norm": 1.122619867324829, + "learning_rate": 9.040685391029898e-05, + "loss": 0.15641025304794312, + "step": 23250 + }, + { + "epoch": 0.09986004138653477, + "grad_norm": 0.021335327997803688, + "learning_rate": 9.040254219018136e-05, + "loss": 0.22120120525360107, + "step": 23260 + }, + { + "epoch": 0.09990297347655479, + "grad_norm": 0.8859668970108032, + "learning_rate": 9.039823047006373e-05, + "loss": 0.3420703649520874, + "step": 23270 + }, + { + "epoch": 0.09994590556657479, + "grad_norm": 0.0022604737896472216, + "learning_rate": 9.03939187499461e-05, + "loss": 0.20502817630767822, + "step": 23280 + }, + { + "epoch": 0.0999888376565948, + "grad_norm": 0.15698321163654327, + "learning_rate": 9.038960702982848e-05, + "loss": 0.32589011192321776, + "step": 23290 + }, + { + "epoch": 0.1000317697466148, + "grad_norm": 1.4402498006820679, + "learning_rate": 9.038529530971086e-05, + "loss": 0.28263533115386963, + "step": 23300 + }, + { + "epoch": 0.1000747018366348, + "grad_norm": 1.280652642250061, + "learning_rate": 9.038098358959324e-05, + "loss": 0.1799672245979309, + "step": 23310 + }, + { + "epoch": 0.10011763392665482, + "grad_norm": 0.2672135829925537, + "learning_rate": 9.037667186947561e-05, + "loss": 0.18167222738265992, + "step": 23320 + }, + { + "epoch": 0.10016056601667482, + "grad_norm": 0.05923588201403618, + "learning_rate": 9.037236014935799e-05, + "loss": 0.17600538730621337, + "step": 23330 + }, + { + "epoch": 0.10020349810669484, + "grad_norm": 0.689048707485199, + "learning_rate": 9.036804842924037e-05, + "loss": 0.3387850522994995, + "step": 23340 + }, + { + "epoch": 0.10024643019671484, + "grad_norm": 4.445986747741699, + "learning_rate": 9.036373670912273e-05, + "loss": 0.2602772951126099, + "step": 23350 + }, + { + "epoch": 0.10028936228673484, + "grad_norm": 4.003875255584717, + "learning_rate": 9.035942498900511e-05, + "loss": 0.48433527946472166, + "step": 23360 + }, + { + "epoch": 0.10033229437675485, + "grad_norm": 2.171830177307129, + "learning_rate": 9.035511326888749e-05, + "loss": 0.30810022354125977, + "step": 23370 + }, + { + "epoch": 0.10037522646677485, + "grad_norm": 5.789403915405273, + "learning_rate": 9.035080154876988e-05, + "loss": 0.2114821195602417, + "step": 23380 + }, + { + "epoch": 0.10041815855679487, + "grad_norm": 0.8029804825782776, + "learning_rate": 9.034648982865225e-05, + "loss": 0.21316425800323485, + "step": 23390 + }, + { + "epoch": 0.10046109064681487, + "grad_norm": 3.6660075187683105, + "learning_rate": 9.034217810853463e-05, + "loss": 0.3459254026412964, + "step": 23400 + }, + { + "epoch": 0.10050402273683487, + "grad_norm": 0.025712208822369576, + "learning_rate": 9.033786638841701e-05, + "loss": 0.18357253074645996, + "step": 23410 + }, + { + "epoch": 0.10054695482685488, + "grad_norm": 1.9867717027664185, + "learning_rate": 9.033355466829939e-05, + "loss": 0.19808287620544435, + "step": 23420 + }, + { + "epoch": 0.10058988691687488, + "grad_norm": 0.25725287199020386, + "learning_rate": 9.032924294818175e-05, + "loss": 0.35228469371795657, + "step": 23430 + }, + { + "epoch": 0.1006328190068949, + "grad_norm": 2.235778570175171, + "learning_rate": 9.032493122806413e-05, + "loss": 0.3108292818069458, + "step": 23440 + }, + { + "epoch": 0.1006757510969149, + "grad_norm": 0.0473552830517292, + "learning_rate": 9.03206195079465e-05, + "loss": 0.14987121820449828, + "step": 23450 + }, + { + "epoch": 0.1007186831869349, + "grad_norm": 0.05179880931973457, + "learning_rate": 9.031630778782888e-05, + "loss": 0.2000588893890381, + "step": 23460 + }, + { + "epoch": 0.10076161527695492, + "grad_norm": 1.4705721139907837, + "learning_rate": 9.031199606771126e-05, + "loss": 0.33507752418518066, + "step": 23470 + }, + { + "epoch": 0.10080454736697492, + "grad_norm": 14.33340072631836, + "learning_rate": 9.030768434759364e-05, + "loss": 0.39852664470672605, + "step": 23480 + }, + { + "epoch": 0.10084747945699493, + "grad_norm": 0.0033453968353569508, + "learning_rate": 9.030337262747601e-05, + "loss": 0.25700409412384034, + "step": 23490 + }, + { + "epoch": 0.10089041154701493, + "grad_norm": 0.026818279176950455, + "learning_rate": 9.029906090735839e-05, + "loss": 0.0700181007385254, + "step": 23500 + }, + { + "epoch": 0.10093334363703493, + "grad_norm": 0.011163324117660522, + "learning_rate": 9.029474918724076e-05, + "loss": 0.26866855621337893, + "step": 23510 + }, + { + "epoch": 0.10097627572705495, + "grad_norm": 2.636354684829712, + "learning_rate": 9.029043746712313e-05, + "loss": 0.30903441905975343, + "step": 23520 + }, + { + "epoch": 0.10101920781707495, + "grad_norm": 4.494817733764648, + "learning_rate": 9.028612574700551e-05, + "loss": 0.32044272422790526, + "step": 23530 + }, + { + "epoch": 0.10106213990709496, + "grad_norm": 1.129416584968567, + "learning_rate": 9.028181402688789e-05, + "loss": 0.3815204620361328, + "step": 23540 + }, + { + "epoch": 0.10110507199711496, + "grad_norm": 1.175889015197754, + "learning_rate": 9.027750230677026e-05, + "loss": 0.15066792964935302, + "step": 23550 + }, + { + "epoch": 0.10114800408713497, + "grad_norm": 0.4594559967517853, + "learning_rate": 9.027319058665264e-05, + "loss": 0.32463822364807127, + "step": 23560 + }, + { + "epoch": 0.10119093617715498, + "grad_norm": 0.20249129831790924, + "learning_rate": 9.026887886653502e-05, + "loss": 0.16981053352355957, + "step": 23570 + }, + { + "epoch": 0.10123386826717498, + "grad_norm": 0.18544776737689972, + "learning_rate": 9.02645671464174e-05, + "loss": 0.2120675802230835, + "step": 23580 + }, + { + "epoch": 0.10127680035719498, + "grad_norm": 1.3284571170806885, + "learning_rate": 9.026025542629976e-05, + "loss": 0.29755163192749023, + "step": 23590 + }, + { + "epoch": 0.101319732447215, + "grad_norm": 0.09027359634637833, + "learning_rate": 9.025594370618215e-05, + "loss": 0.3194664478302002, + "step": 23600 + }, + { + "epoch": 0.101362664537235, + "grad_norm": 0.013314232230186462, + "learning_rate": 9.025163198606453e-05, + "loss": 0.4534477233886719, + "step": 23610 + }, + { + "epoch": 0.10140559662725501, + "grad_norm": 3.5338640213012695, + "learning_rate": 9.02473202659469e-05, + "loss": 0.32490148544311526, + "step": 23620 + }, + { + "epoch": 0.10144852871727501, + "grad_norm": 3.0845723152160645, + "learning_rate": 9.024300854582928e-05, + "loss": 0.2430340528488159, + "step": 23630 + }, + { + "epoch": 0.10149146080729501, + "grad_norm": 0.8819409608840942, + "learning_rate": 9.023869682571166e-05, + "loss": 0.28399295806884767, + "step": 23640 + }, + { + "epoch": 0.10153439289731503, + "grad_norm": 0.03459606692194939, + "learning_rate": 9.023438510559404e-05, + "loss": 0.23548510074615478, + "step": 23650 + }, + { + "epoch": 0.10157732498733503, + "grad_norm": 12.161425590515137, + "learning_rate": 9.023007338547642e-05, + "loss": 0.39958481788635253, + "step": 23660 + }, + { + "epoch": 0.10162025707735504, + "grad_norm": 0.08297639340162277, + "learning_rate": 9.022576166535879e-05, + "loss": 0.15242440700531007, + "step": 23670 + }, + { + "epoch": 0.10166318916737505, + "grad_norm": 0.17127041518688202, + "learning_rate": 9.022144994524116e-05, + "loss": 0.22498526573181152, + "step": 23680 + }, + { + "epoch": 0.10170612125739505, + "grad_norm": 1.2555885314941406, + "learning_rate": 9.021713822512353e-05, + "loss": 0.3380129337310791, + "step": 23690 + }, + { + "epoch": 0.10174905334741506, + "grad_norm": 1.3294332027435303, + "learning_rate": 9.021282650500591e-05, + "loss": 0.22705109119415284, + "step": 23700 + }, + { + "epoch": 0.10179198543743506, + "grad_norm": 0.050139833241701126, + "learning_rate": 9.020851478488829e-05, + "loss": 0.3327648401260376, + "step": 23710 + }, + { + "epoch": 0.10183491752745508, + "grad_norm": 2.695390224456787, + "learning_rate": 9.020420306477067e-05, + "loss": 0.4076399803161621, + "step": 23720 + }, + { + "epoch": 0.10187784961747508, + "grad_norm": 0.45680272579193115, + "learning_rate": 9.019989134465304e-05, + "loss": 0.2645672082901001, + "step": 23730 + }, + { + "epoch": 0.10192078170749508, + "grad_norm": 4.387186527252197, + "learning_rate": 9.019557962453542e-05, + "loss": 0.18714665174484252, + "step": 23740 + }, + { + "epoch": 0.1019637137975151, + "grad_norm": 0.34954363107681274, + "learning_rate": 9.01912679044178e-05, + "loss": 0.19371408224105835, + "step": 23750 + }, + { + "epoch": 0.1020066458875351, + "grad_norm": 0.18718601763248444, + "learning_rate": 9.018695618430016e-05, + "loss": 0.11431068181991577, + "step": 23760 + }, + { + "epoch": 0.10204957797755511, + "grad_norm": 0.027119481936097145, + "learning_rate": 9.018264446418254e-05, + "loss": 0.27887601852416993, + "step": 23770 + }, + { + "epoch": 0.10209251006757511, + "grad_norm": 0.0788949579000473, + "learning_rate": 9.017833274406492e-05, + "loss": 0.17622390985488892, + "step": 23780 + }, + { + "epoch": 0.10213544215759511, + "grad_norm": 0.07781904935836792, + "learning_rate": 9.01740210239473e-05, + "loss": 0.16738327741622924, + "step": 23790 + }, + { + "epoch": 0.10217837424761513, + "grad_norm": 0.7536662817001343, + "learning_rate": 9.016970930382967e-05, + "loss": 0.22194280624389648, + "step": 23800 + }, + { + "epoch": 0.10222130633763513, + "grad_norm": 0.008254468441009521, + "learning_rate": 9.016539758371205e-05, + "loss": 0.35129923820495607, + "step": 23810 + }, + { + "epoch": 0.10226423842765514, + "grad_norm": 0.7630317807197571, + "learning_rate": 9.016108586359443e-05, + "loss": 0.2706347942352295, + "step": 23820 + }, + { + "epoch": 0.10230717051767514, + "grad_norm": 1.1152968406677246, + "learning_rate": 9.01567741434768e-05, + "loss": 0.424467134475708, + "step": 23830 + }, + { + "epoch": 0.10235010260769514, + "grad_norm": 9.643174171447754, + "learning_rate": 9.015246242335918e-05, + "loss": 0.20278689861297608, + "step": 23840 + }, + { + "epoch": 0.10239303469771516, + "grad_norm": 0.22802026569843292, + "learning_rate": 9.014815070324156e-05, + "loss": 0.3205211877822876, + "step": 23850 + }, + { + "epoch": 0.10243596678773516, + "grad_norm": 0.010943016968667507, + "learning_rate": 9.014383898312393e-05, + "loss": 0.08449001908302307, + "step": 23860 + }, + { + "epoch": 0.10247889887775517, + "grad_norm": 2.0597031116485596, + "learning_rate": 9.013952726300631e-05, + "loss": 0.46553263664245603, + "step": 23870 + }, + { + "epoch": 0.10252183096777517, + "grad_norm": 0.05474220588803291, + "learning_rate": 9.013521554288869e-05, + "loss": 0.36751723289489746, + "step": 23880 + }, + { + "epoch": 0.10256476305779517, + "grad_norm": 0.12750476598739624, + "learning_rate": 9.013090382277107e-05, + "loss": 0.15174291133880616, + "step": 23890 + }, + { + "epoch": 0.10260769514781519, + "grad_norm": 2.684352397918701, + "learning_rate": 9.012659210265344e-05, + "loss": 0.15601576566696168, + "step": 23900 + }, + { + "epoch": 0.10265062723783519, + "grad_norm": 0.024600274860858917, + "learning_rate": 9.012228038253582e-05, + "loss": 0.28514063358306885, + "step": 23910 + }, + { + "epoch": 0.1026935593278552, + "grad_norm": 0.1267337203025818, + "learning_rate": 9.011796866241819e-05, + "loss": 0.28657100200653074, + "step": 23920 + }, + { + "epoch": 0.1027364914178752, + "grad_norm": 0.09003254026174545, + "learning_rate": 9.011365694230056e-05, + "loss": 0.37873115539550783, + "step": 23930 + }, + { + "epoch": 0.1027794235078952, + "grad_norm": 0.08197905868291855, + "learning_rate": 9.010934522218294e-05, + "loss": 0.2803717374801636, + "step": 23940 + }, + { + "epoch": 0.10282235559791522, + "grad_norm": 0.034923408180475235, + "learning_rate": 9.010503350206532e-05, + "loss": 0.24216341972351074, + "step": 23950 + }, + { + "epoch": 0.10286528768793522, + "grad_norm": 0.06310239434242249, + "learning_rate": 9.01007217819477e-05, + "loss": 0.3100374460220337, + "step": 23960 + }, + { + "epoch": 0.10290821977795524, + "grad_norm": 3.169227361679077, + "learning_rate": 9.009641006183007e-05, + "loss": 0.352788519859314, + "step": 23970 + }, + { + "epoch": 0.10295115186797524, + "grad_norm": 0.026941534131765366, + "learning_rate": 9.009209834171245e-05, + "loss": 0.1345282554626465, + "step": 23980 + }, + { + "epoch": 0.10299408395799524, + "grad_norm": 1.0152279138565063, + "learning_rate": 9.008778662159483e-05, + "loss": 0.15926393270492553, + "step": 23990 + }, + { + "epoch": 0.10303701604801525, + "grad_norm": 0.04590229690074921, + "learning_rate": 9.00834749014772e-05, + "loss": 0.25821306705474856, + "step": 24000 + }, + { + "epoch": 0.10303701604801525, + "eval_loss": 0.49062225222587585, + "eval_runtime": 27.4363, + "eval_samples_per_second": 3.645, + "eval_steps_per_second": 3.645, + "step": 24000 + }, + { + "epoch": 0.10307994813803525, + "grad_norm": 3.166994094848633, + "learning_rate": 9.007916318135957e-05, + "loss": 0.5217941761016845, + "step": 24010 + }, + { + "epoch": 0.10312288022805526, + "grad_norm": 2.5861167907714844, + "learning_rate": 9.007485146124195e-05, + "loss": 0.474554443359375, + "step": 24020 + }, + { + "epoch": 0.10316581231807527, + "grad_norm": 0.3263162672519684, + "learning_rate": 9.007053974112432e-05, + "loss": 0.09132232069969178, + "step": 24030 + }, + { + "epoch": 0.10320874440809527, + "grad_norm": 0.027722327038645744, + "learning_rate": 9.00662280210067e-05, + "loss": 0.22393157482147216, + "step": 24040 + }, + { + "epoch": 0.10325167649811529, + "grad_norm": 0.041487690061330795, + "learning_rate": 9.006191630088908e-05, + "loss": 0.2686819553375244, + "step": 24050 + }, + { + "epoch": 0.10329460858813529, + "grad_norm": 1.3169100284576416, + "learning_rate": 9.005760458077145e-05, + "loss": 0.34045383930206297, + "step": 24060 + }, + { + "epoch": 0.10333754067815529, + "grad_norm": 35.14598083496094, + "learning_rate": 9.005329286065383e-05, + "loss": 0.26218743324279786, + "step": 24070 + }, + { + "epoch": 0.1033804727681753, + "grad_norm": 5.64556360244751, + "learning_rate": 9.004898114053621e-05, + "loss": 0.31703526973724366, + "step": 24080 + }, + { + "epoch": 0.1034234048581953, + "grad_norm": 0.6731160283088684, + "learning_rate": 9.004466942041859e-05, + "loss": 0.22504048347473143, + "step": 24090 + }, + { + "epoch": 0.10346633694821532, + "grad_norm": 1.6286805868148804, + "learning_rate": 9.004035770030096e-05, + "loss": 0.34985036849975587, + "step": 24100 + }, + { + "epoch": 0.10350926903823532, + "grad_norm": 0.1510307937860489, + "learning_rate": 9.003604598018334e-05, + "loss": 0.25102245807647705, + "step": 24110 + }, + { + "epoch": 0.10355220112825532, + "grad_norm": 0.772885262966156, + "learning_rate": 9.003173426006572e-05, + "loss": 0.35218875408172606, + "step": 24120 + }, + { + "epoch": 0.10359513321827533, + "grad_norm": 0.02091173827648163, + "learning_rate": 9.00274225399481e-05, + "loss": 0.2027892827987671, + "step": 24130 + }, + { + "epoch": 0.10363806530829534, + "grad_norm": 0.8786803483963013, + "learning_rate": 9.002311081983047e-05, + "loss": 0.31464924812316897, + "step": 24140 + }, + { + "epoch": 0.10368099739831535, + "grad_norm": 0.006752110552042723, + "learning_rate": 9.001879909971285e-05, + "loss": 0.2620436906814575, + "step": 24150 + }, + { + "epoch": 0.10372392948833535, + "grad_norm": 1.4851539134979248, + "learning_rate": 9.001448737959523e-05, + "loss": 0.30434255599975585, + "step": 24160 + }, + { + "epoch": 0.10376686157835535, + "grad_norm": 0.0038928573485463858, + "learning_rate": 9.001017565947759e-05, + "loss": 0.21102461814880372, + "step": 24170 + }, + { + "epoch": 0.10380979366837537, + "grad_norm": 0.028493205085396767, + "learning_rate": 9.000586393935997e-05, + "loss": 0.4070149898529053, + "step": 24180 + }, + { + "epoch": 0.10385272575839537, + "grad_norm": 4.407706260681152, + "learning_rate": 9.000155221924235e-05, + "loss": 0.30825190544128417, + "step": 24190 + }, + { + "epoch": 0.10389565784841538, + "grad_norm": 1.131311058998108, + "learning_rate": 8.999724049912472e-05, + "loss": 0.2937792778015137, + "step": 24200 + }, + { + "epoch": 0.10393858993843538, + "grad_norm": 1.499942421913147, + "learning_rate": 8.99929287790071e-05, + "loss": 0.25059003829956056, + "step": 24210 + }, + { + "epoch": 0.10398152202845538, + "grad_norm": 1.6240562200546265, + "learning_rate": 8.998861705888948e-05, + "loss": 0.2966706991195679, + "step": 24220 + }, + { + "epoch": 0.1040244541184754, + "grad_norm": 0.2789813280105591, + "learning_rate": 8.998430533877186e-05, + "loss": 0.2914541721343994, + "step": 24230 + }, + { + "epoch": 0.1040673862084954, + "grad_norm": 0.010012147016823292, + "learning_rate": 8.997999361865423e-05, + "loss": 0.4344785690307617, + "step": 24240 + }, + { + "epoch": 0.10411031829851541, + "grad_norm": 1.3773272037506104, + "learning_rate": 8.99756818985366e-05, + "loss": 0.22288546562194825, + "step": 24250 + }, + { + "epoch": 0.10415325038853541, + "grad_norm": 0.09848473966121674, + "learning_rate": 8.997137017841897e-05, + "loss": 0.21312005519866944, + "step": 24260 + }, + { + "epoch": 0.10419618247855542, + "grad_norm": 0.029853790998458862, + "learning_rate": 8.996705845830135e-05, + "loss": 0.28399271965026857, + "step": 24270 + }, + { + "epoch": 0.10423911456857543, + "grad_norm": 0.01982700265944004, + "learning_rate": 8.996274673818373e-05, + "loss": 0.2613893270492554, + "step": 24280 + }, + { + "epoch": 0.10428204665859543, + "grad_norm": 0.0047102137468755245, + "learning_rate": 8.99584350180661e-05, + "loss": 0.25738341808319093, + "step": 24290 + }, + { + "epoch": 0.10432497874861545, + "grad_norm": 2.0769221782684326, + "learning_rate": 8.995412329794848e-05, + "loss": 0.1588195562362671, + "step": 24300 + }, + { + "epoch": 0.10436791083863545, + "grad_norm": 0.09412598609924316, + "learning_rate": 8.994981157783086e-05, + "loss": 0.22160720825195312, + "step": 24310 + }, + { + "epoch": 0.10441084292865545, + "grad_norm": 0.9660851359367371, + "learning_rate": 8.994549985771324e-05, + "loss": 0.5141227722167969, + "step": 24320 + }, + { + "epoch": 0.10445377501867546, + "grad_norm": 0.6510734558105469, + "learning_rate": 8.994118813759562e-05, + "loss": 0.11361143589019776, + "step": 24330 + }, + { + "epoch": 0.10449670710869546, + "grad_norm": 0.20415718853473663, + "learning_rate": 8.993687641747799e-05, + "loss": 0.3651560306549072, + "step": 24340 + }, + { + "epoch": 0.10453963919871548, + "grad_norm": 0.007316565606743097, + "learning_rate": 8.993256469736037e-05, + "loss": 0.26250133514404295, + "step": 24350 + }, + { + "epoch": 0.10458257128873548, + "grad_norm": 1.585828185081482, + "learning_rate": 8.992825297724275e-05, + "loss": 0.28480618000030516, + "step": 24360 + }, + { + "epoch": 0.10462550337875548, + "grad_norm": 0.01666135899722576, + "learning_rate": 8.992394125712513e-05, + "loss": 0.2830787181854248, + "step": 24370 + }, + { + "epoch": 0.1046684354687755, + "grad_norm": 1.8920531272888184, + "learning_rate": 8.99196295370075e-05, + "loss": 0.29449806213378904, + "step": 24380 + }, + { + "epoch": 0.1047113675587955, + "grad_norm": 1.3783341646194458, + "learning_rate": 8.991531781688988e-05, + "loss": 0.35810155868530275, + "step": 24390 + }, + { + "epoch": 0.10475429964881551, + "grad_norm": 0.01928071118891239, + "learning_rate": 8.991100609677226e-05, + "loss": 0.14494633674621582, + "step": 24400 + }, + { + "epoch": 0.10479723173883551, + "grad_norm": 0.025622989982366562, + "learning_rate": 8.990669437665463e-05, + "loss": 0.39835395812988283, + "step": 24410 + }, + { + "epoch": 0.10484016382885551, + "grad_norm": 1.062815546989441, + "learning_rate": 8.9902382656537e-05, + "loss": 0.5315930366516113, + "step": 24420 + }, + { + "epoch": 0.10488309591887553, + "grad_norm": 3.022416830062866, + "learning_rate": 8.989807093641938e-05, + "loss": 0.30136756896972655, + "step": 24430 + }, + { + "epoch": 0.10492602800889553, + "grad_norm": 1.4548002481460571, + "learning_rate": 8.989375921630175e-05, + "loss": 0.3413592100143433, + "step": 24440 + }, + { + "epoch": 0.10496896009891553, + "grad_norm": 0.19854702055454254, + "learning_rate": 8.988944749618413e-05, + "loss": 0.13472495079040528, + "step": 24450 + }, + { + "epoch": 0.10501189218893554, + "grad_norm": 2.6877894401550293, + "learning_rate": 8.988513577606651e-05, + "loss": 0.40897254943847655, + "step": 24460 + }, + { + "epoch": 0.10505482427895554, + "grad_norm": 1.532139539718628, + "learning_rate": 8.988082405594889e-05, + "loss": 0.1912761688232422, + "step": 24470 + }, + { + "epoch": 0.10509775636897556, + "grad_norm": 0.2924424409866333, + "learning_rate": 8.987651233583126e-05, + "loss": 0.19859321117401124, + "step": 24480 + }, + { + "epoch": 0.10514068845899556, + "grad_norm": 0.016296448186039925, + "learning_rate": 8.987220061571364e-05, + "loss": 0.18080949783325195, + "step": 24490 + }, + { + "epoch": 0.10518362054901556, + "grad_norm": 1.8604021072387695, + "learning_rate": 8.9867888895596e-05, + "loss": 0.08436711430549622, + "step": 24500 + }, + { + "epoch": 0.10522655263903558, + "grad_norm": 0.9807556867599487, + "learning_rate": 8.986357717547838e-05, + "loss": 0.13042645454406737, + "step": 24510 + }, + { + "epoch": 0.10526948472905558, + "grad_norm": 0.08815609663724899, + "learning_rate": 8.985926545536076e-05, + "loss": 0.21929657459259033, + "step": 24520 + }, + { + "epoch": 0.10531241681907559, + "grad_norm": 0.2408457249403, + "learning_rate": 8.985495373524314e-05, + "loss": 0.27845087051391604, + "step": 24530 + }, + { + "epoch": 0.10535534890909559, + "grad_norm": 0.027684088796377182, + "learning_rate": 8.985064201512551e-05, + "loss": 0.40946121215820314, + "step": 24540 + }, + { + "epoch": 0.10539828099911559, + "grad_norm": 0.05740318447351456, + "learning_rate": 8.984633029500789e-05, + "loss": 0.10234721899032592, + "step": 24550 + }, + { + "epoch": 0.10544121308913561, + "grad_norm": 0.026760630309581757, + "learning_rate": 8.984201857489027e-05, + "loss": 0.24178724288940429, + "step": 24560 + }, + { + "epoch": 0.10548414517915561, + "grad_norm": 1.6453255414962769, + "learning_rate": 8.983770685477266e-05, + "loss": 0.22513704299926757, + "step": 24570 + }, + { + "epoch": 0.10552707726917562, + "grad_norm": 3.5417139530181885, + "learning_rate": 8.983339513465502e-05, + "loss": 0.44940686225891113, + "step": 24580 + }, + { + "epoch": 0.10557000935919562, + "grad_norm": 0.02605029195547104, + "learning_rate": 8.98290834145374e-05, + "loss": 0.33814427852630613, + "step": 24590 + }, + { + "epoch": 0.10561294144921562, + "grad_norm": 2.6322715282440186, + "learning_rate": 8.982477169441978e-05, + "loss": 0.3033830404281616, + "step": 24600 + }, + { + "epoch": 0.10565587353923564, + "grad_norm": 2.977161169052124, + "learning_rate": 8.982045997430215e-05, + "loss": 0.20005474090576172, + "step": 24610 + }, + { + "epoch": 0.10569880562925564, + "grad_norm": 0.21439577639102936, + "learning_rate": 8.981614825418453e-05, + "loss": 0.3983053207397461, + "step": 24620 + }, + { + "epoch": 0.10574173771927566, + "grad_norm": 0.20449991524219513, + "learning_rate": 8.981183653406691e-05, + "loss": 0.2888350009918213, + "step": 24630 + }, + { + "epoch": 0.10578466980929566, + "grad_norm": 0.22445398569107056, + "learning_rate": 8.980752481394929e-05, + "loss": 0.2892467737197876, + "step": 24640 + }, + { + "epoch": 0.10582760189931566, + "grad_norm": 2.5356032848358154, + "learning_rate": 8.980321309383166e-05, + "loss": 0.1676466941833496, + "step": 24650 + }, + { + "epoch": 0.10587053398933567, + "grad_norm": 1.0866541862487793, + "learning_rate": 8.979890137371403e-05, + "loss": 0.12875763177871705, + "step": 24660 + }, + { + "epoch": 0.10591346607935567, + "grad_norm": 19.632532119750977, + "learning_rate": 8.97945896535964e-05, + "loss": 0.2675110578536987, + "step": 24670 + }, + { + "epoch": 0.10595639816937569, + "grad_norm": 0.48357057571411133, + "learning_rate": 8.979027793347878e-05, + "loss": 0.25778658390045167, + "step": 24680 + }, + { + "epoch": 0.10599933025939569, + "grad_norm": 0.12672166526317596, + "learning_rate": 8.978596621336116e-05, + "loss": 0.2856446743011475, + "step": 24690 + }, + { + "epoch": 0.10604226234941569, + "grad_norm": 2.005126953125, + "learning_rate": 8.978165449324354e-05, + "loss": 0.3275081396102905, + "step": 24700 + }, + { + "epoch": 0.1060851944394357, + "grad_norm": 0.10233116894960403, + "learning_rate": 8.977734277312591e-05, + "loss": 0.41061697006225584, + "step": 24710 + }, + { + "epoch": 0.1061281265294557, + "grad_norm": 0.025767680257558823, + "learning_rate": 8.977303105300829e-05, + "loss": 0.1702197551727295, + "step": 24720 + }, + { + "epoch": 0.10617105861947572, + "grad_norm": 0.45195960998535156, + "learning_rate": 8.976871933289067e-05, + "loss": 0.35706462860107424, + "step": 24730 + }, + { + "epoch": 0.10621399070949572, + "grad_norm": 0.8962579369544983, + "learning_rate": 8.976440761277305e-05, + "loss": 0.35692172050476073, + "step": 24740 + }, + { + "epoch": 0.10625692279951572, + "grad_norm": 3.8452181816101074, + "learning_rate": 8.976009589265541e-05, + "loss": 0.35666708946228026, + "step": 24750 + }, + { + "epoch": 0.10629985488953574, + "grad_norm": 0.9256759285926819, + "learning_rate": 8.975578417253779e-05, + "loss": 0.21581745147705078, + "step": 24760 + }, + { + "epoch": 0.10634278697955574, + "grad_norm": 0.012155724689364433, + "learning_rate": 8.975147245242016e-05, + "loss": 0.2863577604293823, + "step": 24770 + }, + { + "epoch": 0.10638571906957575, + "grad_norm": 1.3886440992355347, + "learning_rate": 8.974716073230254e-05, + "loss": 0.43889813423156737, + "step": 24780 + }, + { + "epoch": 0.10642865115959575, + "grad_norm": 0.02271793968975544, + "learning_rate": 8.974284901218493e-05, + "loss": 0.04355182945728302, + "step": 24790 + }, + { + "epoch": 0.10647158324961575, + "grad_norm": 0.03789331763982773, + "learning_rate": 8.973853729206731e-05, + "loss": 0.42855305671691896, + "step": 24800 + }, + { + "epoch": 0.10651451533963577, + "grad_norm": 0.04173389449715614, + "learning_rate": 8.973422557194969e-05, + "loss": 0.1199187159538269, + "step": 24810 + }, + { + "epoch": 0.10655744742965577, + "grad_norm": 0.034947946667671204, + "learning_rate": 8.972991385183206e-05, + "loss": 0.5193025588989257, + "step": 24820 + }, + { + "epoch": 0.10660037951967578, + "grad_norm": 0.08006519079208374, + "learning_rate": 8.972560213171443e-05, + "loss": 0.19815462827682495, + "step": 24830 + }, + { + "epoch": 0.10664331160969578, + "grad_norm": 0.07318032532930374, + "learning_rate": 8.97212904115968e-05, + "loss": 0.06539644002914428, + "step": 24840 + }, + { + "epoch": 0.10668624369971579, + "grad_norm": 0.4578899145126343, + "learning_rate": 8.971697869147918e-05, + "loss": 0.33419122695922854, + "step": 24850 + }, + { + "epoch": 0.1067291757897358, + "grad_norm": 0.006664654705673456, + "learning_rate": 8.971266697136156e-05, + "loss": 0.5630306720733642, + "step": 24860 + }, + { + "epoch": 0.1067721078797558, + "grad_norm": 0.0594543032348156, + "learning_rate": 8.970835525124394e-05, + "loss": 0.29572596549987795, + "step": 24870 + }, + { + "epoch": 0.1068150399697758, + "grad_norm": 0.047232624143362045, + "learning_rate": 8.970404353112632e-05, + "loss": 0.4749437808990479, + "step": 24880 + }, + { + "epoch": 0.10685797205979582, + "grad_norm": 0.08967532217502594, + "learning_rate": 8.969973181100869e-05, + "loss": 0.008667629212141037, + "step": 24890 + }, + { + "epoch": 0.10690090414981582, + "grad_norm": 0.01756768673658371, + "learning_rate": 8.969542009089107e-05, + "loss": 0.3080634593963623, + "step": 24900 + }, + { + "epoch": 0.10694383623983583, + "grad_norm": 1.994766354560852, + "learning_rate": 8.969110837077343e-05, + "loss": 0.20746369361877443, + "step": 24910 + }, + { + "epoch": 0.10698676832985583, + "grad_norm": 1.1729373931884766, + "learning_rate": 8.968679665065581e-05, + "loss": 0.3455613136291504, + "step": 24920 + }, + { + "epoch": 0.10702970041987583, + "grad_norm": 5.345548152923584, + "learning_rate": 8.968248493053819e-05, + "loss": 0.44353442192077636, + "step": 24930 + }, + { + "epoch": 0.10707263250989585, + "grad_norm": 0.9387415647506714, + "learning_rate": 8.967817321042057e-05, + "loss": 0.3211450338363647, + "step": 24940 + }, + { + "epoch": 0.10711556459991585, + "grad_norm": 0.5626527667045593, + "learning_rate": 8.967386149030294e-05, + "loss": 0.24862992763519287, + "step": 24950 + }, + { + "epoch": 0.10715849668993586, + "grad_norm": 0.015321125276386738, + "learning_rate": 8.966954977018532e-05, + "loss": 0.24494407176971436, + "step": 24960 + }, + { + "epoch": 0.10720142877995587, + "grad_norm": 0.11206818372011185, + "learning_rate": 8.96652380500677e-05, + "loss": 0.17858786582946778, + "step": 24970 + }, + { + "epoch": 0.10724436086997587, + "grad_norm": 2.545865774154663, + "learning_rate": 8.966092632995008e-05, + "loss": 0.29294619560241697, + "step": 24980 + }, + { + "epoch": 0.10728729295999588, + "grad_norm": 1.6691718101501465, + "learning_rate": 8.965661460983244e-05, + "loss": 0.3634697914123535, + "step": 24990 + }, + { + "epoch": 0.10733022505001588, + "grad_norm": 0.22247134149074554, + "learning_rate": 8.965230288971482e-05, + "loss": 0.31081109046936034, + "step": 25000 + }, + { + "epoch": 0.10733022505001588, + "eval_loss": 0.4849245548248291, + "eval_runtime": 27.44, + "eval_samples_per_second": 3.644, + "eval_steps_per_second": 3.644, + "step": 25000 + }, + { + "epoch": 0.1073731571400359, + "grad_norm": 0.30741971731185913, + "learning_rate": 8.964799116959721e-05, + "loss": 0.37556867599487304, + "step": 25010 + }, + { + "epoch": 0.1074160892300559, + "grad_norm": 0.27221789956092834, + "learning_rate": 8.964367944947958e-05, + "loss": 0.14985108375549316, + "step": 25020 + }, + { + "epoch": 0.1074590213200759, + "grad_norm": 1.1760873794555664, + "learning_rate": 8.963936772936196e-05, + "loss": 0.2914012908935547, + "step": 25030 + }, + { + "epoch": 0.10750195341009591, + "grad_norm": 1.5843279361724854, + "learning_rate": 8.963505600924434e-05, + "loss": 0.6233903884887695, + "step": 25040 + }, + { + "epoch": 0.10754488550011591, + "grad_norm": 0.06669995188713074, + "learning_rate": 8.963074428912672e-05, + "loss": 0.10967894792556762, + "step": 25050 + }, + { + "epoch": 0.10758781759013593, + "grad_norm": 0.09085310995578766, + "learning_rate": 8.96264325690091e-05, + "loss": 0.15039405822753907, + "step": 25060 + }, + { + "epoch": 0.10763074968015593, + "grad_norm": 0.7689294219017029, + "learning_rate": 8.962212084889147e-05, + "loss": 0.38093998432159426, + "step": 25070 + }, + { + "epoch": 0.10767368177017593, + "grad_norm": 0.1175433024764061, + "learning_rate": 8.961780912877384e-05, + "loss": 0.2116264820098877, + "step": 25080 + }, + { + "epoch": 0.10771661386019594, + "grad_norm": 0.04340391606092453, + "learning_rate": 8.961349740865621e-05, + "loss": 0.2621105194091797, + "step": 25090 + }, + { + "epoch": 0.10775954595021595, + "grad_norm": 1.2593803405761719, + "learning_rate": 8.960918568853859e-05, + "loss": 0.26323814392089845, + "step": 25100 + }, + { + "epoch": 0.10780247804023596, + "grad_norm": 1.90890634059906, + "learning_rate": 8.960487396842097e-05, + "loss": 0.22531487941741943, + "step": 25110 + }, + { + "epoch": 0.10784541013025596, + "grad_norm": 0.18675051629543304, + "learning_rate": 8.960056224830334e-05, + "loss": 0.1788573145866394, + "step": 25120 + }, + { + "epoch": 0.10788834222027596, + "grad_norm": 2.0584561824798584, + "learning_rate": 8.959625052818572e-05, + "loss": 0.2614114761352539, + "step": 25130 + }, + { + "epoch": 0.10793127431029598, + "grad_norm": 0.2771095037460327, + "learning_rate": 8.95919388080681e-05, + "loss": 0.31394219398498535, + "step": 25140 + }, + { + "epoch": 0.10797420640031598, + "grad_norm": 2.259929895401001, + "learning_rate": 8.958762708795048e-05, + "loss": 0.23377797603607178, + "step": 25150 + }, + { + "epoch": 0.10801713849033599, + "grad_norm": 5.453269004821777, + "learning_rate": 8.958331536783284e-05, + "loss": 0.20484549999237062, + "step": 25160 + }, + { + "epoch": 0.108060070580356, + "grad_norm": 6.210053443908691, + "learning_rate": 8.957900364771522e-05, + "loss": 0.46298751831054685, + "step": 25170 + }, + { + "epoch": 0.108103002670376, + "grad_norm": 0.21815641224384308, + "learning_rate": 8.95746919275976e-05, + "loss": 0.15839173793792724, + "step": 25180 + }, + { + "epoch": 0.10814593476039601, + "grad_norm": 0.016512513160705566, + "learning_rate": 8.957038020747997e-05, + "loss": 0.2333528757095337, + "step": 25190 + }, + { + "epoch": 0.10818886685041601, + "grad_norm": 1.9446250200271606, + "learning_rate": 8.956606848736235e-05, + "loss": 0.36624941825866697, + "step": 25200 + }, + { + "epoch": 0.10823179894043602, + "grad_norm": 1.332423210144043, + "learning_rate": 8.956175676724473e-05, + "loss": 0.38559038639068605, + "step": 25210 + }, + { + "epoch": 0.10827473103045603, + "grad_norm": 0.09285473078489304, + "learning_rate": 8.95574450471271e-05, + "loss": 0.39534683227539064, + "step": 25220 + }, + { + "epoch": 0.10831766312047603, + "grad_norm": 0.007422926835715771, + "learning_rate": 8.955313332700948e-05, + "loss": 0.23534021377563477, + "step": 25230 + }, + { + "epoch": 0.10836059521049604, + "grad_norm": 0.02128664217889309, + "learning_rate": 8.954882160689186e-05, + "loss": 0.20301442146301268, + "step": 25240 + }, + { + "epoch": 0.10840352730051604, + "grad_norm": 0.013315998017787933, + "learning_rate": 8.954450988677424e-05, + "loss": 0.1390451669692993, + "step": 25250 + }, + { + "epoch": 0.10844645939053606, + "grad_norm": 0.007063603959977627, + "learning_rate": 8.954019816665661e-05, + "loss": 0.05682712197303772, + "step": 25260 + }, + { + "epoch": 0.10848939148055606, + "grad_norm": 0.017539670690894127, + "learning_rate": 8.953588644653899e-05, + "loss": 0.07583492398262023, + "step": 25270 + }, + { + "epoch": 0.10853232357057606, + "grad_norm": 0.06756465137004852, + "learning_rate": 8.953157472642137e-05, + "loss": 0.24220926761627198, + "step": 25280 + }, + { + "epoch": 0.10857525566059607, + "grad_norm": 0.029334593564271927, + "learning_rate": 8.952726300630375e-05, + "loss": 0.36937508583068845, + "step": 25290 + }, + { + "epoch": 0.10861818775061607, + "grad_norm": 0.03658639267086983, + "learning_rate": 8.952295128618612e-05, + "loss": 0.13479502201080323, + "step": 25300 + }, + { + "epoch": 0.10866111984063608, + "grad_norm": 0.031390100717544556, + "learning_rate": 8.95186395660685e-05, + "loss": 0.12219811677932739, + "step": 25310 + }, + { + "epoch": 0.10870405193065609, + "grad_norm": 1.230584740638733, + "learning_rate": 8.951432784595086e-05, + "loss": 0.3189370632171631, + "step": 25320 + }, + { + "epoch": 0.10874698402067609, + "grad_norm": 1.7964065074920654, + "learning_rate": 8.951001612583324e-05, + "loss": 0.1912916660308838, + "step": 25330 + }, + { + "epoch": 0.1087899161106961, + "grad_norm": 0.06472434848546982, + "learning_rate": 8.950570440571562e-05, + "loss": 0.05857505798339844, + "step": 25340 + }, + { + "epoch": 0.1088328482007161, + "grad_norm": 10.742326736450195, + "learning_rate": 8.9501392685598e-05, + "loss": 0.38807761669158936, + "step": 25350 + }, + { + "epoch": 0.10887578029073611, + "grad_norm": 0.01679828017950058, + "learning_rate": 8.949708096548037e-05, + "loss": 0.20313003063201904, + "step": 25360 + }, + { + "epoch": 0.10891871238075612, + "grad_norm": 1.5688421726226807, + "learning_rate": 8.949276924536275e-05, + "loss": 0.3708950519561768, + "step": 25370 + }, + { + "epoch": 0.10896164447077612, + "grad_norm": 0.0956941694021225, + "learning_rate": 8.948845752524513e-05, + "loss": 0.04957548379898071, + "step": 25380 + }, + { + "epoch": 0.10900457656079614, + "grad_norm": 0.015790555626153946, + "learning_rate": 8.94841458051275e-05, + "loss": 0.1790858745574951, + "step": 25390 + }, + { + "epoch": 0.10904750865081614, + "grad_norm": 1.5737788677215576, + "learning_rate": 8.947983408500987e-05, + "loss": 0.15643935203552245, + "step": 25400 + }, + { + "epoch": 0.10909044074083614, + "grad_norm": 3.9349513053894043, + "learning_rate": 8.947552236489225e-05, + "loss": 0.2875617504119873, + "step": 25410 + }, + { + "epoch": 0.10913337283085615, + "grad_norm": 0.012580066919326782, + "learning_rate": 8.947121064477462e-05, + "loss": 0.4018566608428955, + "step": 25420 + }, + { + "epoch": 0.10917630492087615, + "grad_norm": 3.0949137210845947, + "learning_rate": 8.9466898924657e-05, + "loss": 0.38313732147216795, + "step": 25430 + }, + { + "epoch": 0.10921923701089617, + "grad_norm": 0.9923470616340637, + "learning_rate": 8.946258720453938e-05, + "loss": 0.26145000457763673, + "step": 25440 + }, + { + "epoch": 0.10926216910091617, + "grad_norm": 0.7341137528419495, + "learning_rate": 8.945827548442176e-05, + "loss": 0.1047516942024231, + "step": 25450 + }, + { + "epoch": 0.10930510119093617, + "grad_norm": 11.855466842651367, + "learning_rate": 8.945396376430413e-05, + "loss": 0.1956933856010437, + "step": 25460 + }, + { + "epoch": 0.10934803328095619, + "grad_norm": 0.007624823600053787, + "learning_rate": 8.944965204418651e-05, + "loss": 0.2737696886062622, + "step": 25470 + }, + { + "epoch": 0.10939096537097619, + "grad_norm": 0.040049996227025986, + "learning_rate": 8.944534032406889e-05, + "loss": 0.2754047870635986, + "step": 25480 + }, + { + "epoch": 0.1094338974609962, + "grad_norm": 1.1522254943847656, + "learning_rate": 8.944102860395127e-05, + "loss": 0.2701746940612793, + "step": 25490 + }, + { + "epoch": 0.1094768295510162, + "grad_norm": 0.12508395314216614, + "learning_rate": 8.943671688383364e-05, + "loss": 0.1981469988822937, + "step": 25500 + }, + { + "epoch": 0.1095197616410362, + "grad_norm": 3.565103530883789, + "learning_rate": 8.943240516371602e-05, + "loss": 0.34433903694152834, + "step": 25510 + }, + { + "epoch": 0.10956269373105622, + "grad_norm": 0.1249883770942688, + "learning_rate": 8.94280934435984e-05, + "loss": 0.08808074593544006, + "step": 25520 + }, + { + "epoch": 0.10960562582107622, + "grad_norm": 0.03776920959353447, + "learning_rate": 8.942378172348078e-05, + "loss": 0.1539943814277649, + "step": 25530 + }, + { + "epoch": 0.10964855791109623, + "grad_norm": 0.05006266012787819, + "learning_rate": 8.941947000336315e-05, + "loss": 0.1190767765045166, + "step": 25540 + }, + { + "epoch": 0.10969149000111623, + "grad_norm": 0.1297246515750885, + "learning_rate": 8.941515828324553e-05, + "loss": 0.34765305519104006, + "step": 25550 + }, + { + "epoch": 0.10973442209113624, + "grad_norm": 0.0076711843721568584, + "learning_rate": 8.941084656312791e-05, + "loss": 0.1391259789466858, + "step": 25560 + }, + { + "epoch": 0.10977735418115625, + "grad_norm": 1.8917896747589111, + "learning_rate": 8.940653484301027e-05, + "loss": 0.25439977645874023, + "step": 25570 + }, + { + "epoch": 0.10982028627117625, + "grad_norm": 5.253777503967285, + "learning_rate": 8.940222312289265e-05, + "loss": 0.19333921670913695, + "step": 25580 + }, + { + "epoch": 0.10986321836119627, + "grad_norm": 0.07230553030967712, + "learning_rate": 8.939791140277503e-05, + "loss": 0.17542632818222045, + "step": 25590 + }, + { + "epoch": 0.10990615045121627, + "grad_norm": 0.011886064894497395, + "learning_rate": 8.93935996826574e-05, + "loss": 0.23904857635498047, + "step": 25600 + }, + { + "epoch": 0.10994908254123627, + "grad_norm": 0.002963064704090357, + "learning_rate": 8.938928796253978e-05, + "loss": 0.22038490772247316, + "step": 25610 + }, + { + "epoch": 0.10999201463125628, + "grad_norm": 0.01600109040737152, + "learning_rate": 8.938497624242216e-05, + "loss": 0.1403309464454651, + "step": 25620 + }, + { + "epoch": 0.11003494672127628, + "grad_norm": 1.7151718139648438, + "learning_rate": 8.938066452230453e-05, + "loss": 0.31475276947021485, + "step": 25630 + }, + { + "epoch": 0.1100778788112963, + "grad_norm": 0.000949110253714025, + "learning_rate": 8.937635280218691e-05, + "loss": 0.15096465349197388, + "step": 25640 + }, + { + "epoch": 0.1101208109013163, + "grad_norm": 0.16747067868709564, + "learning_rate": 8.937204108206928e-05, + "loss": 0.3146634101867676, + "step": 25650 + }, + { + "epoch": 0.1101637429913363, + "grad_norm": 2.464406728744507, + "learning_rate": 8.936772936195165e-05, + "loss": 0.2665748119354248, + "step": 25660 + }, + { + "epoch": 0.11020667508135631, + "grad_norm": 0.048684414476156235, + "learning_rate": 8.936341764183403e-05, + "loss": 0.2529994010925293, + "step": 25670 + }, + { + "epoch": 0.11024960717137632, + "grad_norm": 0.18311643600463867, + "learning_rate": 8.935910592171641e-05, + "loss": 0.15747172832489015, + "step": 25680 + }, + { + "epoch": 0.11029253926139633, + "grad_norm": 0.11484615504741669, + "learning_rate": 8.935479420159879e-05, + "loss": 0.5332373142242431, + "step": 25690 + }, + { + "epoch": 0.11033547135141633, + "grad_norm": 0.3284884989261627, + "learning_rate": 8.935048248148116e-05, + "loss": 0.1697358727455139, + "step": 25700 + }, + { + "epoch": 0.11037840344143633, + "grad_norm": 0.5205300450325012, + "learning_rate": 8.934617076136354e-05, + "loss": 0.16440855264663695, + "step": 25710 + }, + { + "epoch": 0.11042133553145635, + "grad_norm": 0.04689498245716095, + "learning_rate": 8.934185904124592e-05, + "loss": 0.1195969820022583, + "step": 25720 + }, + { + "epoch": 0.11046426762147635, + "grad_norm": 2.117689609527588, + "learning_rate": 8.93375473211283e-05, + "loss": 0.22179570198059081, + "step": 25730 + }, + { + "epoch": 0.11050719971149635, + "grad_norm": 0.37029099464416504, + "learning_rate": 8.933323560101067e-05, + "loss": 0.2961350202560425, + "step": 25740 + }, + { + "epoch": 0.11055013180151636, + "grad_norm": 4.060856819152832, + "learning_rate": 8.932892388089305e-05, + "loss": 0.4951334476470947, + "step": 25750 + }, + { + "epoch": 0.11059306389153636, + "grad_norm": 0.07439135760068893, + "learning_rate": 8.932461216077543e-05, + "loss": 0.18491072654724122, + "step": 25760 + }, + { + "epoch": 0.11063599598155638, + "grad_norm": 0.6668906211853027, + "learning_rate": 8.93203004406578e-05, + "loss": 0.4163065910339355, + "step": 25770 + }, + { + "epoch": 0.11067892807157638, + "grad_norm": 2.0445303916931152, + "learning_rate": 8.931598872054018e-05, + "loss": 0.3462115049362183, + "step": 25780 + }, + { + "epoch": 0.11072186016159638, + "grad_norm": 3.0600664615631104, + "learning_rate": 8.931167700042256e-05, + "loss": 0.30161380767822266, + "step": 25790 + }, + { + "epoch": 0.1107647922516164, + "grad_norm": 0.5397235751152039, + "learning_rate": 8.930736528030494e-05, + "loss": 0.4184750556945801, + "step": 25800 + }, + { + "epoch": 0.1108077243416364, + "grad_norm": 0.1752122938632965, + "learning_rate": 8.930305356018731e-05, + "loss": 0.29085919857025144, + "step": 25810 + }, + { + "epoch": 0.11085065643165641, + "grad_norm": 0.0460553839802742, + "learning_rate": 8.929874184006968e-05, + "loss": 0.37629220485687254, + "step": 25820 + }, + { + "epoch": 0.11089358852167641, + "grad_norm": 0.3117293417453766, + "learning_rate": 8.929443011995205e-05, + "loss": 0.15187954902648926, + "step": 25830 + }, + { + "epoch": 0.11093652061169641, + "grad_norm": 1.2698419094085693, + "learning_rate": 8.929011839983443e-05, + "loss": 0.28465192317962645, + "step": 25840 + }, + { + "epoch": 0.11097945270171643, + "grad_norm": 0.17641954123973846, + "learning_rate": 8.928580667971681e-05, + "loss": 0.26123223304748533, + "step": 25850 + }, + { + "epoch": 0.11102238479173643, + "grad_norm": 0.019400320947170258, + "learning_rate": 8.928149495959919e-05, + "loss": 0.05121874809265137, + "step": 25860 + }, + { + "epoch": 0.11106531688175644, + "grad_norm": 0.049805380403995514, + "learning_rate": 8.927718323948156e-05, + "loss": 0.14310473203659058, + "step": 25870 + }, + { + "epoch": 0.11110824897177644, + "grad_norm": 9.800433158874512, + "learning_rate": 8.927287151936394e-05, + "loss": 0.23860435485839843, + "step": 25880 + }, + { + "epoch": 0.11115118106179644, + "grad_norm": 0.00014414018369279802, + "learning_rate": 8.926855979924632e-05, + "loss": 0.30962748527526857, + "step": 25890 + }, + { + "epoch": 0.11119411315181646, + "grad_norm": 0.004766400903463364, + "learning_rate": 8.926424807912868e-05, + "loss": 0.5229979038238526, + "step": 25900 + }, + { + "epoch": 0.11123704524183646, + "grad_norm": 0.028215311467647552, + "learning_rate": 8.925993635901106e-05, + "loss": 0.18894855976104735, + "step": 25910 + }, + { + "epoch": 0.11127997733185647, + "grad_norm": 2.7721102237701416, + "learning_rate": 8.925562463889344e-05, + "loss": 0.27259135246276855, + "step": 25920 + }, + { + "epoch": 0.11132290942187648, + "grad_norm": 0.02031039260327816, + "learning_rate": 8.925131291877581e-05, + "loss": 0.04676889479160309, + "step": 25930 + }, + { + "epoch": 0.11136584151189648, + "grad_norm": 0.2718992829322815, + "learning_rate": 8.924700119865819e-05, + "loss": 0.16489242315292357, + "step": 25940 + }, + { + "epoch": 0.11140877360191649, + "grad_norm": 0.9515452980995178, + "learning_rate": 8.924268947854057e-05, + "loss": 0.3039748430252075, + "step": 25950 + }, + { + "epoch": 0.11145170569193649, + "grad_norm": 1.0625194311141968, + "learning_rate": 8.923837775842295e-05, + "loss": 0.3415965557098389, + "step": 25960 + }, + { + "epoch": 0.11149463778195651, + "grad_norm": 0.040675487369298935, + "learning_rate": 8.923406603830532e-05, + "loss": 0.33023149967193605, + "step": 25970 + }, + { + "epoch": 0.11153756987197651, + "grad_norm": 0.011495154350996017, + "learning_rate": 8.92297543181877e-05, + "loss": 0.255326509475708, + "step": 25980 + }, + { + "epoch": 0.11158050196199651, + "grad_norm": 3.5770716667175293, + "learning_rate": 8.922544259807008e-05, + "loss": 0.3099225521087646, + "step": 25990 + }, + { + "epoch": 0.11162343405201652, + "grad_norm": 0.02177545428276062, + "learning_rate": 8.922113087795246e-05, + "loss": 0.30356450080871583, + "step": 26000 + }, + { + "epoch": 0.11162343405201652, + "eval_loss": 0.49951910972595215, + "eval_runtime": 27.6012, + "eval_samples_per_second": 3.623, + "eval_steps_per_second": 3.623, + "step": 26000 + }, + { + "epoch": 0.11166636614203652, + "grad_norm": 0.01517306175082922, + "learning_rate": 8.921681915783483e-05, + "loss": 0.2588840961456299, + "step": 26010 + }, + { + "epoch": 0.11170929823205654, + "grad_norm": 0.010166897438466549, + "learning_rate": 8.921250743771721e-05, + "loss": 0.10459980964660645, + "step": 26020 + }, + { + "epoch": 0.11175223032207654, + "grad_norm": 0.045308589935302734, + "learning_rate": 8.920819571759959e-05, + "loss": 0.18060653209686278, + "step": 26030 + }, + { + "epoch": 0.11179516241209654, + "grad_norm": 0.2524489462375641, + "learning_rate": 8.920388399748197e-05, + "loss": 0.26762235164642334, + "step": 26040 + }, + { + "epoch": 0.11183809450211656, + "grad_norm": 0.039384521543979645, + "learning_rate": 8.919957227736434e-05, + "loss": 0.3579113483428955, + "step": 26050 + }, + { + "epoch": 0.11188102659213656, + "grad_norm": 0.7673929333686829, + "learning_rate": 8.91952605572467e-05, + "loss": 0.2611018896102905, + "step": 26060 + }, + { + "epoch": 0.11192395868215657, + "grad_norm": 0.08051066100597382, + "learning_rate": 8.919094883712908e-05, + "loss": 0.15652050971984863, + "step": 26070 + }, + { + "epoch": 0.11196689077217657, + "grad_norm": 3.0436594486236572, + "learning_rate": 8.918663711701146e-05, + "loss": 0.40769338607788086, + "step": 26080 + }, + { + "epoch": 0.11200982286219657, + "grad_norm": 0.5110700726509094, + "learning_rate": 8.918232539689384e-05, + "loss": 0.11793347597122192, + "step": 26090 + }, + { + "epoch": 0.11205275495221659, + "grad_norm": 1.4811254739761353, + "learning_rate": 8.917801367677622e-05, + "loss": 0.2230898141860962, + "step": 26100 + }, + { + "epoch": 0.11209568704223659, + "grad_norm": 0.7501670122146606, + "learning_rate": 8.917370195665859e-05, + "loss": 0.4135143280029297, + "step": 26110 + }, + { + "epoch": 0.1121386191322566, + "grad_norm": 0.24594485759735107, + "learning_rate": 8.916939023654097e-05, + "loss": 0.35773112773895266, + "step": 26120 + }, + { + "epoch": 0.1121815512222766, + "grad_norm": 1.2016063928604126, + "learning_rate": 8.916507851642335e-05, + "loss": 0.2555335521697998, + "step": 26130 + }, + { + "epoch": 0.1122244833122966, + "grad_norm": 0.007154045160859823, + "learning_rate": 8.916076679630571e-05, + "loss": 0.114761483669281, + "step": 26140 + }, + { + "epoch": 0.11226741540231662, + "grad_norm": 2.147310733795166, + "learning_rate": 8.915645507618809e-05, + "loss": 0.25461688041687014, + "step": 26150 + }, + { + "epoch": 0.11231034749233662, + "grad_norm": 0.14692267775535583, + "learning_rate": 8.915214335607047e-05, + "loss": 0.18954638242721558, + "step": 26160 + }, + { + "epoch": 0.11235327958235662, + "grad_norm": 0.3495458960533142, + "learning_rate": 8.914783163595284e-05, + "loss": 0.06459863781929016, + "step": 26170 + }, + { + "epoch": 0.11239621167237664, + "grad_norm": 3.2628321647644043, + "learning_rate": 8.914351991583522e-05, + "loss": 0.3024146556854248, + "step": 26180 + }, + { + "epoch": 0.11243914376239664, + "grad_norm": 0.09125436097383499, + "learning_rate": 8.91392081957176e-05, + "loss": 0.19367423057556152, + "step": 26190 + }, + { + "epoch": 0.11248207585241665, + "grad_norm": 1.5896530151367188, + "learning_rate": 8.913489647559999e-05, + "loss": 0.09784256219863892, + "step": 26200 + }, + { + "epoch": 0.11252500794243665, + "grad_norm": 0.012747851200401783, + "learning_rate": 8.913058475548237e-05, + "loss": 0.31458463668823244, + "step": 26210 + }, + { + "epoch": 0.11256794003245665, + "grad_norm": 0.0535753071308136, + "learning_rate": 8.912627303536474e-05, + "loss": 0.06705414652824401, + "step": 26220 + }, + { + "epoch": 0.11261087212247667, + "grad_norm": 0.0063928053714334965, + "learning_rate": 8.912196131524711e-05, + "loss": 0.41733684539794924, + "step": 26230 + }, + { + "epoch": 0.11265380421249667, + "grad_norm": 0.65238356590271, + "learning_rate": 8.911764959512949e-05, + "loss": 0.24972774982452392, + "step": 26240 + }, + { + "epoch": 0.11269673630251668, + "grad_norm": 0.005043783225119114, + "learning_rate": 8.911333787501186e-05, + "loss": 0.45450987815856936, + "step": 26250 + }, + { + "epoch": 0.11273966839253668, + "grad_norm": 7.086516380310059, + "learning_rate": 8.910902615489424e-05, + "loss": 0.28200199604034426, + "step": 26260 + }, + { + "epoch": 0.11278260048255669, + "grad_norm": 0.09140264987945557, + "learning_rate": 8.910471443477662e-05, + "loss": 0.12481236457824707, + "step": 26270 + }, + { + "epoch": 0.1128255325725767, + "grad_norm": 2.038637161254883, + "learning_rate": 8.9100402714659e-05, + "loss": 0.24641501903533936, + "step": 26280 + }, + { + "epoch": 0.1128684646625967, + "grad_norm": 11.497899055480957, + "learning_rate": 8.909609099454137e-05, + "loss": 0.3155627727508545, + "step": 26290 + }, + { + "epoch": 0.11291139675261672, + "grad_norm": 0.2962181866168976, + "learning_rate": 8.909177927442375e-05, + "loss": 0.37747180461883545, + "step": 26300 + }, + { + "epoch": 0.11295432884263672, + "grad_norm": 17.22124671936035, + "learning_rate": 8.908746755430611e-05, + "loss": 0.17400816679000855, + "step": 26310 + }, + { + "epoch": 0.11299726093265672, + "grad_norm": 0.027156352996826172, + "learning_rate": 8.908315583418849e-05, + "loss": 0.328963041305542, + "step": 26320 + }, + { + "epoch": 0.11304019302267673, + "grad_norm": 0.03348701819777489, + "learning_rate": 8.907884411407087e-05, + "loss": 0.2583893060684204, + "step": 26330 + }, + { + "epoch": 0.11308312511269673, + "grad_norm": 0.03124728426337242, + "learning_rate": 8.907453239395324e-05, + "loss": 0.22128143310546874, + "step": 26340 + }, + { + "epoch": 0.11312605720271675, + "grad_norm": 0.19246523082256317, + "learning_rate": 8.907022067383562e-05, + "loss": 0.23182182312011718, + "step": 26350 + }, + { + "epoch": 0.11316898929273675, + "grad_norm": 0.5085291862487793, + "learning_rate": 8.9065908953718e-05, + "loss": 0.33657450675964357, + "step": 26360 + }, + { + "epoch": 0.11321192138275675, + "grad_norm": 0.16103234887123108, + "learning_rate": 8.906159723360038e-05, + "loss": 0.2063810110092163, + "step": 26370 + }, + { + "epoch": 0.11325485347277676, + "grad_norm": 0.04276309162378311, + "learning_rate": 8.905728551348275e-05, + "loss": 0.4347810745239258, + "step": 26380 + }, + { + "epoch": 0.11329778556279677, + "grad_norm": 0.027480829507112503, + "learning_rate": 8.905297379336512e-05, + "loss": 0.3524182796478271, + "step": 26390 + }, + { + "epoch": 0.11334071765281678, + "grad_norm": 1.1285064220428467, + "learning_rate": 8.90486620732475e-05, + "loss": 0.43365187644958497, + "step": 26400 + }, + { + "epoch": 0.11338364974283678, + "grad_norm": 0.07447858154773712, + "learning_rate": 8.904435035312987e-05, + "loss": 0.2533705711364746, + "step": 26410 + }, + { + "epoch": 0.11342658183285678, + "grad_norm": 0.6848863363265991, + "learning_rate": 8.904003863301226e-05, + "loss": 0.1212932825088501, + "step": 26420 + }, + { + "epoch": 0.1134695139228768, + "grad_norm": 0.0208682119846344, + "learning_rate": 8.903572691289464e-05, + "loss": 0.3657447576522827, + "step": 26430 + }, + { + "epoch": 0.1135124460128968, + "grad_norm": 0.03788859024643898, + "learning_rate": 8.903141519277702e-05, + "loss": 0.29512145519256594, + "step": 26440 + }, + { + "epoch": 0.11355537810291681, + "grad_norm": 0.36164799332618713, + "learning_rate": 8.90271034726594e-05, + "loss": 0.42470488548278806, + "step": 26450 + }, + { + "epoch": 0.11359831019293681, + "grad_norm": 3.108694314956665, + "learning_rate": 8.902279175254177e-05, + "loss": 0.23271114826202394, + "step": 26460 + }, + { + "epoch": 0.11364124228295681, + "grad_norm": 0.007275083102285862, + "learning_rate": 8.901848003242414e-05, + "loss": 0.1772995948791504, + "step": 26470 + }, + { + "epoch": 0.11368417437297683, + "grad_norm": 0.7524713277816772, + "learning_rate": 8.901416831230651e-05, + "loss": 0.0993366301059723, + "step": 26480 + }, + { + "epoch": 0.11372710646299683, + "grad_norm": 0.0062967403791844845, + "learning_rate": 8.900985659218889e-05, + "loss": 0.2973215103149414, + "step": 26490 + }, + { + "epoch": 0.11377003855301684, + "grad_norm": 0.2719433605670929, + "learning_rate": 8.900554487207127e-05, + "loss": 0.31873762607574463, + "step": 26500 + }, + { + "epoch": 0.11381297064303685, + "grad_norm": 0.02310267835855484, + "learning_rate": 8.900123315195365e-05, + "loss": 0.21906378269195556, + "step": 26510 + }, + { + "epoch": 0.11385590273305685, + "grad_norm": 0.041628237813711166, + "learning_rate": 8.899692143183602e-05, + "loss": 0.1306293249130249, + "step": 26520 + }, + { + "epoch": 0.11389883482307686, + "grad_norm": 0.03699645772576332, + "learning_rate": 8.89926097117184e-05, + "loss": 0.18234388828277587, + "step": 26530 + }, + { + "epoch": 0.11394176691309686, + "grad_norm": 0.005653525702655315, + "learning_rate": 8.898829799160078e-05, + "loss": 0.30573740005493166, + "step": 26540 + }, + { + "epoch": 0.11398469900311688, + "grad_norm": 0.0038088662549853325, + "learning_rate": 8.898398627148316e-05, + "loss": 0.14387012720108033, + "step": 26550 + }, + { + "epoch": 0.11402763109313688, + "grad_norm": 1.9470224380493164, + "learning_rate": 8.897967455136552e-05, + "loss": 0.20126612186431886, + "step": 26560 + }, + { + "epoch": 0.11407056318315688, + "grad_norm": 0.16124966740608215, + "learning_rate": 8.89753628312479e-05, + "loss": 0.1542932868003845, + "step": 26570 + }, + { + "epoch": 0.1141134952731769, + "grad_norm": 0.011234652251005173, + "learning_rate": 8.897105111113027e-05, + "loss": 0.1024243950843811, + "step": 26580 + }, + { + "epoch": 0.1141564273631969, + "grad_norm": 0.050764694809913635, + "learning_rate": 8.896673939101265e-05, + "loss": 0.21154391765594482, + "step": 26590 + }, + { + "epoch": 0.1141993594532169, + "grad_norm": 0.7355092763900757, + "learning_rate": 8.896242767089503e-05, + "loss": 0.48318023681640626, + "step": 26600 + }, + { + "epoch": 0.11424229154323691, + "grad_norm": 0.0014123255386948586, + "learning_rate": 8.89581159507774e-05, + "loss": 0.24305782318115235, + "step": 26610 + }, + { + "epoch": 0.11428522363325691, + "grad_norm": 0.0012762444093823433, + "learning_rate": 8.895380423065978e-05, + "loss": 0.24177062511444092, + "step": 26620 + }, + { + "epoch": 0.11432815572327693, + "grad_norm": 2.227457046508789, + "learning_rate": 8.894949251054216e-05, + "loss": 0.21717729568481445, + "step": 26630 + }, + { + "epoch": 0.11437108781329693, + "grad_norm": 0.16315631568431854, + "learning_rate": 8.894518079042454e-05, + "loss": 0.08009040355682373, + "step": 26640 + }, + { + "epoch": 0.11441401990331693, + "grad_norm": 0.04312557354569435, + "learning_rate": 8.894086907030692e-05, + "loss": 0.3106640338897705, + "step": 26650 + }, + { + "epoch": 0.11445695199333694, + "grad_norm": 1.9198765754699707, + "learning_rate": 8.893655735018929e-05, + "loss": 0.4228626251220703, + "step": 26660 + }, + { + "epoch": 0.11449988408335694, + "grad_norm": 0.18937277793884277, + "learning_rate": 8.893224563007167e-05, + "loss": 0.45505146980285643, + "step": 26670 + }, + { + "epoch": 0.11454281617337696, + "grad_norm": 1.0401482582092285, + "learning_rate": 8.892793390995405e-05, + "loss": 0.3090991020202637, + "step": 26680 + }, + { + "epoch": 0.11458574826339696, + "grad_norm": 0.7521312832832336, + "learning_rate": 8.892362218983642e-05, + "loss": 0.47649755477905276, + "step": 26690 + }, + { + "epoch": 0.11462868035341696, + "grad_norm": 0.9687482118606567, + "learning_rate": 8.89193104697188e-05, + "loss": 0.2554279088973999, + "step": 26700 + }, + { + "epoch": 0.11467161244343697, + "grad_norm": 0.9771291613578796, + "learning_rate": 8.891499874960118e-05, + "loss": 0.23694472312927245, + "step": 26710 + }, + { + "epoch": 0.11471454453345697, + "grad_norm": 4.835459232330322, + "learning_rate": 8.891068702948354e-05, + "loss": 0.17550307512283325, + "step": 26720 + }, + { + "epoch": 0.11475747662347699, + "grad_norm": 0.5119221806526184, + "learning_rate": 8.890637530936592e-05, + "loss": 0.30478582382202146, + "step": 26730 + }, + { + "epoch": 0.11480040871349699, + "grad_norm": 0.17026209831237793, + "learning_rate": 8.89020635892483e-05, + "loss": 0.1443108320236206, + "step": 26740 + }, + { + "epoch": 0.11484334080351699, + "grad_norm": 2.911370277404785, + "learning_rate": 8.889775186913068e-05, + "loss": 0.3410804748535156, + "step": 26750 + }, + { + "epoch": 0.114886272893537, + "grad_norm": 1.0684778690338135, + "learning_rate": 8.889344014901305e-05, + "loss": 0.22259902954101562, + "step": 26760 + }, + { + "epoch": 0.114929204983557, + "grad_norm": 0.11497758328914642, + "learning_rate": 8.888912842889543e-05, + "loss": 0.24176509380340577, + "step": 26770 + }, + { + "epoch": 0.11497213707357702, + "grad_norm": 1.0878132581710815, + "learning_rate": 8.888481670877781e-05, + "loss": 0.08091990947723389, + "step": 26780 + }, + { + "epoch": 0.11501506916359702, + "grad_norm": 0.0038782337214797735, + "learning_rate": 8.888050498866018e-05, + "loss": 0.34708499908447266, + "step": 26790 + }, + { + "epoch": 0.11505800125361702, + "grad_norm": 2.4943995475769043, + "learning_rate": 8.887619326854255e-05, + "loss": 0.2911946773529053, + "step": 26800 + }, + { + "epoch": 0.11510093334363704, + "grad_norm": 0.0024789159651845694, + "learning_rate": 8.887188154842493e-05, + "loss": 0.14854669570922852, + "step": 26810 + }, + { + "epoch": 0.11514386543365704, + "grad_norm": 0.18486323952674866, + "learning_rate": 8.88675698283073e-05, + "loss": 0.3237830638885498, + "step": 26820 + }, + { + "epoch": 0.11518679752367705, + "grad_norm": 0.02643188089132309, + "learning_rate": 8.886325810818968e-05, + "loss": 0.1400722861289978, + "step": 26830 + }, + { + "epoch": 0.11522972961369705, + "grad_norm": 2.9142510890960693, + "learning_rate": 8.885894638807206e-05, + "loss": 0.20289158821105957, + "step": 26840 + }, + { + "epoch": 0.11527266170371706, + "grad_norm": 6.909871578216553, + "learning_rate": 8.885463466795444e-05, + "loss": 0.4391770839691162, + "step": 26850 + }, + { + "epoch": 0.11531559379373707, + "grad_norm": 0.09212353825569153, + "learning_rate": 8.885032294783681e-05, + "loss": 0.3045357704162598, + "step": 26860 + }, + { + "epoch": 0.11535852588375707, + "grad_norm": 0.007540345191955566, + "learning_rate": 8.884601122771919e-05, + "loss": 0.09747713208198547, + "step": 26870 + }, + { + "epoch": 0.11540145797377709, + "grad_norm": 0.10550621896982193, + "learning_rate": 8.884169950760157e-05, + "loss": 0.10741302967071534, + "step": 26880 + }, + { + "epoch": 0.11544439006379709, + "grad_norm": 1.4358404874801636, + "learning_rate": 8.883738778748394e-05, + "loss": 0.395121693611145, + "step": 26890 + }, + { + "epoch": 0.11548732215381709, + "grad_norm": 0.007923940196633339, + "learning_rate": 8.883307606736632e-05, + "loss": 0.3058964252471924, + "step": 26900 + }, + { + "epoch": 0.1155302542438371, + "grad_norm": 1.1491098403930664, + "learning_rate": 8.88287643472487e-05, + "loss": 0.3279739856719971, + "step": 26910 + }, + { + "epoch": 0.1155731863338571, + "grad_norm": 0.1358955353498459, + "learning_rate": 8.882445262713108e-05, + "loss": 0.1154222846031189, + "step": 26920 + }, + { + "epoch": 0.11561611842387712, + "grad_norm": 0.6412147283554077, + "learning_rate": 8.882014090701345e-05, + "loss": 0.14349900484085082, + "step": 26930 + }, + { + "epoch": 0.11565905051389712, + "grad_norm": 0.020047593861818314, + "learning_rate": 8.881582918689583e-05, + "loss": 0.24264280796051024, + "step": 26940 + }, + { + "epoch": 0.11570198260391712, + "grad_norm": 0.9909156560897827, + "learning_rate": 8.881151746677821e-05, + "loss": 0.5208069801330566, + "step": 26950 + }, + { + "epoch": 0.11574491469393713, + "grad_norm": 0.19676139950752258, + "learning_rate": 8.880720574666059e-05, + "loss": 0.37195398807525637, + "step": 26960 + }, + { + "epoch": 0.11578784678395714, + "grad_norm": 0.03384987264871597, + "learning_rate": 8.880289402654295e-05, + "loss": 0.3475580453872681, + "step": 26970 + }, + { + "epoch": 0.11583077887397715, + "grad_norm": 0.018173309043049812, + "learning_rate": 8.879858230642533e-05, + "loss": 0.21183347702026367, + "step": 26980 + }, + { + "epoch": 0.11587371096399715, + "grad_norm": 0.25634244084358215, + "learning_rate": 8.87942705863077e-05, + "loss": 0.2051846981048584, + "step": 26990 + }, + { + "epoch": 0.11591664305401715, + "grad_norm": 1.4937998056411743, + "learning_rate": 8.878995886619008e-05, + "loss": 0.1931118369102478, + "step": 27000 + }, + { + "epoch": 0.11591664305401715, + "eval_loss": 0.4814135432243347, + "eval_runtime": 27.4898, + "eval_samples_per_second": 3.638, + "eval_steps_per_second": 3.638, + "step": 27000 + }, + { + "epoch": 0.11595957514403717, + "grad_norm": 0.032575830817222595, + "learning_rate": 8.878564714607246e-05, + "loss": 0.07166704535484314, + "step": 27010 + }, + { + "epoch": 0.11600250723405717, + "grad_norm": 0.08005379140377045, + "learning_rate": 8.878133542595484e-05, + "loss": 0.339334774017334, + "step": 27020 + }, + { + "epoch": 0.11604543932407717, + "grad_norm": 4.995341777801514, + "learning_rate": 8.877702370583721e-05, + "loss": 0.2974552154541016, + "step": 27030 + }, + { + "epoch": 0.11608837141409718, + "grad_norm": 0.15797531604766846, + "learning_rate": 8.877271198571959e-05, + "loss": 0.30518279075622556, + "step": 27040 + }, + { + "epoch": 0.11613130350411718, + "grad_norm": 0.037030257284641266, + "learning_rate": 8.876840026560195e-05, + "loss": 0.26148602962493894, + "step": 27050 + }, + { + "epoch": 0.1161742355941372, + "grad_norm": 1.2287954092025757, + "learning_rate": 8.876408854548433e-05, + "loss": 0.2081602096557617, + "step": 27060 + }, + { + "epoch": 0.1162171676841572, + "grad_norm": 0.4174017906188965, + "learning_rate": 8.875977682536671e-05, + "loss": 0.26671810150146485, + "step": 27070 + }, + { + "epoch": 0.1162600997741772, + "grad_norm": 0.16252487897872925, + "learning_rate": 8.875546510524909e-05, + "loss": 0.1370398998260498, + "step": 27080 + }, + { + "epoch": 0.11630303186419721, + "grad_norm": 2.9258928298950195, + "learning_rate": 8.875115338513146e-05, + "loss": 0.4124359130859375, + "step": 27090 + }, + { + "epoch": 0.11634596395421722, + "grad_norm": 0.22780385613441467, + "learning_rate": 8.874684166501384e-05, + "loss": 0.3003973960876465, + "step": 27100 + }, + { + "epoch": 0.11638889604423723, + "grad_norm": 1.1312873363494873, + "learning_rate": 8.874252994489622e-05, + "loss": 0.39095630645751955, + "step": 27110 + }, + { + "epoch": 0.11643182813425723, + "grad_norm": 0.01758890599012375, + "learning_rate": 8.87382182247786e-05, + "loss": 0.14932353496551515, + "step": 27120 + }, + { + "epoch": 0.11647476022427723, + "grad_norm": 0.05419120937585831, + "learning_rate": 8.873390650466097e-05, + "loss": 0.1397382140159607, + "step": 27130 + }, + { + "epoch": 0.11651769231429725, + "grad_norm": 2.2360360622406006, + "learning_rate": 8.872959478454335e-05, + "loss": 0.336977481842041, + "step": 27140 + }, + { + "epoch": 0.11656062440431725, + "grad_norm": 0.12993699312210083, + "learning_rate": 8.872528306442573e-05, + "loss": 0.25680654048919677, + "step": 27150 + }, + { + "epoch": 0.11660355649433726, + "grad_norm": 3.370098829269409, + "learning_rate": 8.87209713443081e-05, + "loss": 0.20383124351501464, + "step": 27160 + }, + { + "epoch": 0.11664648858435726, + "grad_norm": 0.07472426444292068, + "learning_rate": 8.871665962419048e-05, + "loss": 0.1689348340034485, + "step": 27170 + }, + { + "epoch": 0.11668942067437726, + "grad_norm": 1.1328237056732178, + "learning_rate": 8.871234790407286e-05, + "loss": 0.3129899501800537, + "step": 27180 + }, + { + "epoch": 0.11673235276439728, + "grad_norm": 1.5202020406723022, + "learning_rate": 8.870803618395524e-05, + "loss": 0.31610372066497805, + "step": 27190 + }, + { + "epoch": 0.11677528485441728, + "grad_norm": 0.8442431092262268, + "learning_rate": 8.870372446383762e-05, + "loss": 0.18818488121032714, + "step": 27200 + }, + { + "epoch": 0.1168182169444373, + "grad_norm": 1.796455979347229, + "learning_rate": 8.869941274371998e-05, + "loss": 0.26075315475463867, + "step": 27210 + }, + { + "epoch": 0.1168611490344573, + "grad_norm": 4.038948059082031, + "learning_rate": 8.869510102360236e-05, + "loss": 0.4824398040771484, + "step": 27220 + }, + { + "epoch": 0.1169040811244773, + "grad_norm": 1.9637843370437622, + "learning_rate": 8.869078930348473e-05, + "loss": 0.20625545978546142, + "step": 27230 + }, + { + "epoch": 0.11694701321449731, + "grad_norm": 0.13020427525043488, + "learning_rate": 8.868647758336711e-05, + "loss": 0.03135204017162323, + "step": 27240 + }, + { + "epoch": 0.11698994530451731, + "grad_norm": 1.4142533540725708, + "learning_rate": 8.868216586324949e-05, + "loss": 0.21837081909179687, + "step": 27250 + }, + { + "epoch": 0.11703287739453733, + "grad_norm": 3.5253872871398926, + "learning_rate": 8.867785414313187e-05, + "loss": 0.20597407817840577, + "step": 27260 + }, + { + "epoch": 0.11707580948455733, + "grad_norm": 0.007855327799916267, + "learning_rate": 8.867354242301424e-05, + "loss": 0.35103812217712405, + "step": 27270 + }, + { + "epoch": 0.11711874157457733, + "grad_norm": 1.1489592790603638, + "learning_rate": 8.866923070289662e-05, + "loss": 0.2486743450164795, + "step": 27280 + }, + { + "epoch": 0.11716167366459734, + "grad_norm": 0.3575959801673889, + "learning_rate": 8.8664918982779e-05, + "loss": 0.24932396411895752, + "step": 27290 + }, + { + "epoch": 0.11720460575461734, + "grad_norm": 2.510430335998535, + "learning_rate": 8.866060726266136e-05, + "loss": 0.2862740755081177, + "step": 27300 + }, + { + "epoch": 0.11724753784463736, + "grad_norm": 0.14760471880435944, + "learning_rate": 8.865629554254374e-05, + "loss": 0.1569320559501648, + "step": 27310 + }, + { + "epoch": 0.11729046993465736, + "grad_norm": 2.2253646850585938, + "learning_rate": 8.865198382242612e-05, + "loss": 0.18345837593078612, + "step": 27320 + }, + { + "epoch": 0.11733340202467736, + "grad_norm": 0.17462894320487976, + "learning_rate": 8.86476721023085e-05, + "loss": 0.059002459049224854, + "step": 27330 + }, + { + "epoch": 0.11737633411469738, + "grad_norm": 1.675255298614502, + "learning_rate": 8.864336038219087e-05, + "loss": 0.30129642486572267, + "step": 27340 + }, + { + "epoch": 0.11741926620471738, + "grad_norm": 0.016447896137833595, + "learning_rate": 8.863904866207325e-05, + "loss": 0.2761634111404419, + "step": 27350 + }, + { + "epoch": 0.11746219829473739, + "grad_norm": 0.5405940413475037, + "learning_rate": 8.863473694195563e-05, + "loss": 0.3909731388092041, + "step": 27360 + }, + { + "epoch": 0.11750513038475739, + "grad_norm": 0.0973791852593422, + "learning_rate": 8.8630425221838e-05, + "loss": 0.2007131338119507, + "step": 27370 + }, + { + "epoch": 0.11754806247477739, + "grad_norm": 0.040704648941755295, + "learning_rate": 8.862611350172038e-05, + "loss": 0.24662020206451415, + "step": 27380 + }, + { + "epoch": 0.11759099456479741, + "grad_norm": 2.4736199378967285, + "learning_rate": 8.862180178160276e-05, + "loss": 0.3135467290878296, + "step": 27390 + }, + { + "epoch": 0.11763392665481741, + "grad_norm": 0.06006622686982155, + "learning_rate": 8.861749006148513e-05, + "loss": 0.21347477436065673, + "step": 27400 + }, + { + "epoch": 0.11767685874483741, + "grad_norm": 2.3658194541931152, + "learning_rate": 8.861317834136751e-05, + "loss": 0.387444281578064, + "step": 27410 + }, + { + "epoch": 0.11771979083485742, + "grad_norm": 0.0878182202577591, + "learning_rate": 8.860886662124989e-05, + "loss": 0.26784508228302, + "step": 27420 + }, + { + "epoch": 0.11776272292487742, + "grad_norm": 1.5703392028808594, + "learning_rate": 8.860455490113227e-05, + "loss": 0.242673921585083, + "step": 27430 + }, + { + "epoch": 0.11780565501489744, + "grad_norm": 0.012026949785649776, + "learning_rate": 8.860024318101464e-05, + "loss": 0.2383371353149414, + "step": 27440 + }, + { + "epoch": 0.11784858710491744, + "grad_norm": 3.061283826828003, + "learning_rate": 8.859593146089702e-05, + "loss": 0.28369903564453125, + "step": 27450 + }, + { + "epoch": 0.11789151919493744, + "grad_norm": 0.5101853609085083, + "learning_rate": 8.859161974077939e-05, + "loss": 0.07000910639762878, + "step": 27460 + }, + { + "epoch": 0.11793445128495746, + "grad_norm": 0.011193258687853813, + "learning_rate": 8.858730802066176e-05, + "loss": 0.11365052461624145, + "step": 27470 + }, + { + "epoch": 0.11797738337497746, + "grad_norm": 0.10916262120008469, + "learning_rate": 8.858299630054414e-05, + "loss": 0.1990136981010437, + "step": 27480 + }, + { + "epoch": 0.11802031546499747, + "grad_norm": 1.9866777658462524, + "learning_rate": 8.857868458042652e-05, + "loss": 0.297538685798645, + "step": 27490 + }, + { + "epoch": 0.11806324755501747, + "grad_norm": 0.7264887690544128, + "learning_rate": 8.85743728603089e-05, + "loss": 0.25992865562438966, + "step": 27500 + }, + { + "epoch": 0.11810617964503747, + "grad_norm": 1.078795313835144, + "learning_rate": 8.857006114019127e-05, + "loss": 0.2584535598754883, + "step": 27510 + }, + { + "epoch": 0.11814911173505749, + "grad_norm": 0.010630804114043713, + "learning_rate": 8.856574942007365e-05, + "loss": 0.3359013795852661, + "step": 27520 + }, + { + "epoch": 0.11819204382507749, + "grad_norm": 0.37588950991630554, + "learning_rate": 8.856143769995603e-05, + "loss": 0.18968162536621094, + "step": 27530 + }, + { + "epoch": 0.1182349759150975, + "grad_norm": 1.0637744665145874, + "learning_rate": 8.855712597983839e-05, + "loss": 0.1399350643157959, + "step": 27540 + }, + { + "epoch": 0.1182779080051175, + "grad_norm": 3.714017629623413, + "learning_rate": 8.855281425972077e-05, + "loss": 0.17076128721237183, + "step": 27550 + }, + { + "epoch": 0.1183208400951375, + "grad_norm": 1.026892066001892, + "learning_rate": 8.854850253960315e-05, + "loss": 0.3097742795944214, + "step": 27560 + }, + { + "epoch": 0.11836377218515752, + "grad_norm": 0.08490348607301712, + "learning_rate": 8.854419081948552e-05, + "loss": 0.45609478950500487, + "step": 27570 + }, + { + "epoch": 0.11840670427517752, + "grad_norm": 0.7944920063018799, + "learning_rate": 8.85398790993679e-05, + "loss": 0.2455613613128662, + "step": 27580 + }, + { + "epoch": 0.11844963636519754, + "grad_norm": 0.11953561753034592, + "learning_rate": 8.853556737925028e-05, + "loss": 0.13984798192977904, + "step": 27590 + }, + { + "epoch": 0.11849256845521754, + "grad_norm": 2.450636863708496, + "learning_rate": 8.853125565913267e-05, + "loss": 0.2631438493728638, + "step": 27600 + }, + { + "epoch": 0.11853550054523754, + "grad_norm": 1.842597246170044, + "learning_rate": 8.852694393901505e-05, + "loss": 0.18394358158111573, + "step": 27610 + }, + { + "epoch": 0.11857843263525755, + "grad_norm": 5.348299980163574, + "learning_rate": 8.852263221889742e-05, + "loss": 0.10762025117874145, + "step": 27620 + }, + { + "epoch": 0.11862136472527755, + "grad_norm": 1.779021978378296, + "learning_rate": 8.851832049877979e-05, + "loss": 0.1610881805419922, + "step": 27630 + }, + { + "epoch": 0.11866429681529757, + "grad_norm": 1.3260366916656494, + "learning_rate": 8.851400877866216e-05, + "loss": 0.3329970359802246, + "step": 27640 + }, + { + "epoch": 0.11870722890531757, + "grad_norm": 0.0022421982139348984, + "learning_rate": 8.850969705854454e-05, + "loss": 0.2720233678817749, + "step": 27650 + }, + { + "epoch": 0.11875016099533757, + "grad_norm": 26.460460662841797, + "learning_rate": 8.850538533842692e-05, + "loss": 0.4188239574432373, + "step": 27660 + }, + { + "epoch": 0.11879309308535758, + "grad_norm": 0.045316264033317566, + "learning_rate": 8.85010736183093e-05, + "loss": 0.324761962890625, + "step": 27670 + }, + { + "epoch": 0.11883602517537759, + "grad_norm": 3.0380566120147705, + "learning_rate": 8.849676189819167e-05, + "loss": 0.22859654426574708, + "step": 27680 + }, + { + "epoch": 0.1188789572653976, + "grad_norm": 0.09295903891324997, + "learning_rate": 8.849245017807405e-05, + "loss": 0.15038024187088012, + "step": 27690 + }, + { + "epoch": 0.1189218893554176, + "grad_norm": 0.0038858470506966114, + "learning_rate": 8.848813845795643e-05, + "loss": 0.22608392238616942, + "step": 27700 + }, + { + "epoch": 0.1189648214454376, + "grad_norm": 0.29266226291656494, + "learning_rate": 8.848382673783879e-05, + "loss": 0.24526865482330323, + "step": 27710 + }, + { + "epoch": 0.11900775353545762, + "grad_norm": 1.2969417572021484, + "learning_rate": 8.847951501772117e-05, + "loss": 0.17613345384597778, + "step": 27720 + }, + { + "epoch": 0.11905068562547762, + "grad_norm": 0.27322718501091003, + "learning_rate": 8.847520329760355e-05, + "loss": 0.29839363098144533, + "step": 27730 + }, + { + "epoch": 0.11909361771549763, + "grad_norm": 0.1163104996085167, + "learning_rate": 8.847089157748592e-05, + "loss": 0.32402098178863525, + "step": 27740 + }, + { + "epoch": 0.11913654980551763, + "grad_norm": 0.5185718536376953, + "learning_rate": 8.84665798573683e-05, + "loss": 0.2267787218093872, + "step": 27750 + }, + { + "epoch": 0.11917948189553763, + "grad_norm": 0.86121666431427, + "learning_rate": 8.846226813725068e-05, + "loss": 0.3904379606246948, + "step": 27760 + }, + { + "epoch": 0.11922241398555765, + "grad_norm": 4.6156721115112305, + "learning_rate": 8.845795641713306e-05, + "loss": 0.3092981815338135, + "step": 27770 + }, + { + "epoch": 0.11926534607557765, + "grad_norm": 0.036813993006944656, + "learning_rate": 8.845364469701543e-05, + "loss": 0.23008527755737304, + "step": 27780 + }, + { + "epoch": 0.11930827816559766, + "grad_norm": 0.06015315651893616, + "learning_rate": 8.84493329768978e-05, + "loss": 0.2318122148513794, + "step": 27790 + }, + { + "epoch": 0.11935121025561767, + "grad_norm": 0.4399387240409851, + "learning_rate": 8.844502125678017e-05, + "loss": 0.3942227840423584, + "step": 27800 + }, + { + "epoch": 0.11939414234563767, + "grad_norm": 5.1140313148498535, + "learning_rate": 8.844070953666255e-05, + "loss": 0.2428600788116455, + "step": 27810 + }, + { + "epoch": 0.11943707443565768, + "grad_norm": 7.504186153411865, + "learning_rate": 8.843639781654494e-05, + "loss": 0.34903745651245116, + "step": 27820 + }, + { + "epoch": 0.11948000652567768, + "grad_norm": 1.5398544073104858, + "learning_rate": 8.843208609642732e-05, + "loss": 0.38591148853302004, + "step": 27830 + }, + { + "epoch": 0.11952293861569768, + "grad_norm": 0.020581362769007683, + "learning_rate": 8.84277743763097e-05, + "loss": 0.3888427972793579, + "step": 27840 + }, + { + "epoch": 0.1195658707057177, + "grad_norm": 2.014479637145996, + "learning_rate": 8.842346265619207e-05, + "loss": 0.35665121078491213, + "step": 27850 + }, + { + "epoch": 0.1196088027957377, + "grad_norm": 0.05373619124293327, + "learning_rate": 8.841915093607445e-05, + "loss": 0.06771031022071838, + "step": 27860 + }, + { + "epoch": 0.11965173488575771, + "grad_norm": 0.1480589210987091, + "learning_rate": 8.841483921595682e-05, + "loss": 0.2560983896255493, + "step": 27870 + }, + { + "epoch": 0.11969466697577771, + "grad_norm": 0.05488418787717819, + "learning_rate": 8.841052749583919e-05, + "loss": 0.18749144077301025, + "step": 27880 + }, + { + "epoch": 0.11973759906579771, + "grad_norm": 0.7366297841072083, + "learning_rate": 8.840621577572157e-05, + "loss": 0.17107198238372803, + "step": 27890 + }, + { + "epoch": 0.11978053115581773, + "grad_norm": 0.04739157855510712, + "learning_rate": 8.840190405560395e-05, + "loss": 0.22733573913574218, + "step": 27900 + }, + { + "epoch": 0.11982346324583773, + "grad_norm": 0.009907875210046768, + "learning_rate": 8.839759233548633e-05, + "loss": 0.07253676056861877, + "step": 27910 + }, + { + "epoch": 0.11986639533585774, + "grad_norm": 0.9760177135467529, + "learning_rate": 8.83932806153687e-05, + "loss": 0.208707594871521, + "step": 27920 + }, + { + "epoch": 0.11990932742587775, + "grad_norm": 4.618872165679932, + "learning_rate": 8.838896889525108e-05, + "loss": 0.2124265670776367, + "step": 27930 + }, + { + "epoch": 0.11995225951589775, + "grad_norm": 0.017419319599866867, + "learning_rate": 8.838465717513346e-05, + "loss": 0.23728528022766113, + "step": 27940 + }, + { + "epoch": 0.11999519160591776, + "grad_norm": 0.03731221705675125, + "learning_rate": 8.838034545501582e-05, + "loss": 0.12621110677719116, + "step": 27950 + }, + { + "epoch": 0.12003812369593776, + "grad_norm": 0.08100777119398117, + "learning_rate": 8.83760337348982e-05, + "loss": 0.0045126181095838545, + "step": 27960 + }, + { + "epoch": 0.12008105578595778, + "grad_norm": 0.018143413588404655, + "learning_rate": 8.837172201478058e-05, + "loss": 0.40405783653259275, + "step": 27970 + }, + { + "epoch": 0.12012398787597778, + "grad_norm": 2.4014012813568115, + "learning_rate": 8.836741029466295e-05, + "loss": 0.15833113193511963, + "step": 27980 + }, + { + "epoch": 0.12016691996599778, + "grad_norm": 0.5604978799819946, + "learning_rate": 8.836309857454533e-05, + "loss": 0.3327100992202759, + "step": 27990 + }, + { + "epoch": 0.1202098520560178, + "grad_norm": 1.6774513721466064, + "learning_rate": 8.835878685442771e-05, + "loss": 0.26010899543762206, + "step": 28000 + }, + { + "epoch": 0.1202098520560178, + "eval_loss": 0.47686854004859924, + "eval_runtime": 27.4364, + "eval_samples_per_second": 3.645, + "eval_steps_per_second": 3.645, + "step": 28000 + }, + { + "epoch": 0.1202527841460378, + "grad_norm": 0.04927496239542961, + "learning_rate": 8.835447513431008e-05, + "loss": 0.194127357006073, + "step": 28010 + }, + { + "epoch": 0.12029571623605781, + "grad_norm": 0.009340300224721432, + "learning_rate": 8.835016341419246e-05, + "loss": 0.3162141084671021, + "step": 28020 + }, + { + "epoch": 0.12033864832607781, + "grad_norm": 0.0497310571372509, + "learning_rate": 8.834585169407484e-05, + "loss": 0.3715135812759399, + "step": 28030 + }, + { + "epoch": 0.12038158041609781, + "grad_norm": 0.6487219929695129, + "learning_rate": 8.834153997395722e-05, + "loss": 0.22716693878173827, + "step": 28040 + }, + { + "epoch": 0.12042451250611783, + "grad_norm": 1.9130522012710571, + "learning_rate": 8.83372282538396e-05, + "loss": 0.3316776275634766, + "step": 28050 + }, + { + "epoch": 0.12046744459613783, + "grad_norm": 1.4078731536865234, + "learning_rate": 8.833291653372197e-05, + "loss": 0.25235857963562014, + "step": 28060 + }, + { + "epoch": 0.12051037668615784, + "grad_norm": 0.01590568758547306, + "learning_rate": 8.832860481360435e-05, + "loss": 0.05722926259040832, + "step": 28070 + }, + { + "epoch": 0.12055330877617784, + "grad_norm": 0.12422087043523788, + "learning_rate": 8.832429309348673e-05, + "loss": 0.4248668670654297, + "step": 28080 + }, + { + "epoch": 0.12059624086619784, + "grad_norm": 0.1295778602361679, + "learning_rate": 8.83199813733691e-05, + "loss": 0.2959657430648804, + "step": 28090 + }, + { + "epoch": 0.12063917295621786, + "grad_norm": 20.151147842407227, + "learning_rate": 8.831566965325148e-05, + "loss": 0.25739731788635256, + "step": 28100 + }, + { + "epoch": 0.12068210504623786, + "grad_norm": 0.04763505980372429, + "learning_rate": 8.831135793313386e-05, + "loss": 0.13446078300476075, + "step": 28110 + }, + { + "epoch": 0.12072503713625787, + "grad_norm": 0.009243443608283997, + "learning_rate": 8.830704621301622e-05, + "loss": 0.2809897899627686, + "step": 28120 + }, + { + "epoch": 0.12076796922627787, + "grad_norm": 0.1913996934890747, + "learning_rate": 8.83027344928986e-05, + "loss": 0.2904137372970581, + "step": 28130 + }, + { + "epoch": 0.12081090131629788, + "grad_norm": 0.2755334675312042, + "learning_rate": 8.829842277278098e-05, + "loss": 0.23877317905426027, + "step": 28140 + }, + { + "epoch": 0.12085383340631789, + "grad_norm": 0.14645986258983612, + "learning_rate": 8.829411105266335e-05, + "loss": 0.20860531330108642, + "step": 28150 + }, + { + "epoch": 0.12089676549633789, + "grad_norm": 0.25226902961730957, + "learning_rate": 8.828979933254573e-05, + "loss": 0.03910198211669922, + "step": 28160 + }, + { + "epoch": 0.1209396975863579, + "grad_norm": 0.4876665771007538, + "learning_rate": 8.828548761242811e-05, + "loss": 0.18497053384780884, + "step": 28170 + }, + { + "epoch": 0.1209826296763779, + "grad_norm": 0.08259847015142441, + "learning_rate": 8.828117589231049e-05, + "loss": 0.07651437520980835, + "step": 28180 + }, + { + "epoch": 0.12102556176639791, + "grad_norm": 1.885874629020691, + "learning_rate": 8.827686417219286e-05, + "loss": 0.2606934070587158, + "step": 28190 + }, + { + "epoch": 0.12106849385641792, + "grad_norm": 3.6061160564422607, + "learning_rate": 8.827255245207523e-05, + "loss": 0.32321016788482665, + "step": 28200 + }, + { + "epoch": 0.12111142594643792, + "grad_norm": 0.004731371533125639, + "learning_rate": 8.82682407319576e-05, + "loss": 0.1959924578666687, + "step": 28210 + }, + { + "epoch": 0.12115435803645794, + "grad_norm": 0.1178203746676445, + "learning_rate": 8.826392901183998e-05, + "loss": 0.19997456073760986, + "step": 28220 + }, + { + "epoch": 0.12119729012647794, + "grad_norm": 1.247888445854187, + "learning_rate": 8.825961729172236e-05, + "loss": 0.30602240562438965, + "step": 28230 + }, + { + "epoch": 0.12124022221649794, + "grad_norm": 0.704877495765686, + "learning_rate": 8.825530557160474e-05, + "loss": 0.13399279117584229, + "step": 28240 + }, + { + "epoch": 0.12128315430651795, + "grad_norm": 1.3607152700424194, + "learning_rate": 8.825099385148711e-05, + "loss": 0.20442640781402588, + "step": 28250 + }, + { + "epoch": 0.12132608639653795, + "grad_norm": 3.9458365440368652, + "learning_rate": 8.824668213136949e-05, + "loss": 0.2593385219573975, + "step": 28260 + }, + { + "epoch": 0.12136901848655796, + "grad_norm": 5.625955581665039, + "learning_rate": 8.824237041125187e-05, + "loss": 0.43620920181274414, + "step": 28270 + }, + { + "epoch": 0.12141195057657797, + "grad_norm": 2.36214280128479, + "learning_rate": 8.823805869113425e-05, + "loss": 0.6317470073699951, + "step": 28280 + }, + { + "epoch": 0.12145488266659797, + "grad_norm": 0.24611838161945343, + "learning_rate": 8.823374697101662e-05, + "loss": 0.29659457206726075, + "step": 28290 + }, + { + "epoch": 0.12149781475661799, + "grad_norm": 0.3825310170650482, + "learning_rate": 8.8229435250899e-05, + "loss": 0.3007538318634033, + "step": 28300 + }, + { + "epoch": 0.12154074684663799, + "grad_norm": 1.0893738269805908, + "learning_rate": 8.822512353078138e-05, + "loss": 0.09316685199737548, + "step": 28310 + }, + { + "epoch": 0.12158367893665799, + "grad_norm": 1.2878329753875732, + "learning_rate": 8.822081181066376e-05, + "loss": 0.2848006248474121, + "step": 28320 + }, + { + "epoch": 0.121626611026678, + "grad_norm": 1.5610382556915283, + "learning_rate": 8.821650009054613e-05, + "loss": 0.18019193410873413, + "step": 28330 + }, + { + "epoch": 0.121669543116698, + "grad_norm": 0.055733054876327515, + "learning_rate": 8.821218837042851e-05, + "loss": 0.21548593044281006, + "step": 28340 + }, + { + "epoch": 0.12171247520671802, + "grad_norm": 0.5747507810592651, + "learning_rate": 8.820787665031089e-05, + "loss": 0.4182239055633545, + "step": 28350 + }, + { + "epoch": 0.12175540729673802, + "grad_norm": 0.39458662271499634, + "learning_rate": 8.820356493019326e-05, + "loss": 0.33075120449066164, + "step": 28360 + }, + { + "epoch": 0.12179833938675802, + "grad_norm": 0.22349131107330322, + "learning_rate": 8.819925321007563e-05, + "loss": 0.16916000843048096, + "step": 28370 + }, + { + "epoch": 0.12184127147677803, + "grad_norm": 0.16843147575855255, + "learning_rate": 8.8194941489958e-05, + "loss": 0.37458953857421873, + "step": 28380 + }, + { + "epoch": 0.12188420356679804, + "grad_norm": 0.012207563035190105, + "learning_rate": 8.819062976984038e-05, + "loss": 0.024071575701236726, + "step": 28390 + }, + { + "epoch": 0.12192713565681805, + "grad_norm": 0.03551925718784332, + "learning_rate": 8.818631804972276e-05, + "loss": 0.29789299964904786, + "step": 28400 + }, + { + "epoch": 0.12197006774683805, + "grad_norm": 0.050328079611063004, + "learning_rate": 8.818200632960514e-05, + "loss": 0.3409437656402588, + "step": 28410 + }, + { + "epoch": 0.12201299983685805, + "grad_norm": 0.009469667449593544, + "learning_rate": 8.817769460948752e-05, + "loss": 0.2927106857299805, + "step": 28420 + }, + { + "epoch": 0.12205593192687807, + "grad_norm": 2.191164970397949, + "learning_rate": 8.817338288936989e-05, + "loss": 0.4062193870544434, + "step": 28430 + }, + { + "epoch": 0.12209886401689807, + "grad_norm": 1.214449405670166, + "learning_rate": 8.816907116925227e-05, + "loss": 0.26831727027893065, + "step": 28440 + }, + { + "epoch": 0.12214179610691808, + "grad_norm": 0.12962786853313446, + "learning_rate": 8.816475944913463e-05, + "loss": 0.26643710136413573, + "step": 28450 + }, + { + "epoch": 0.12218472819693808, + "grad_norm": 2.260606288909912, + "learning_rate": 8.816044772901701e-05, + "loss": 0.2599010944366455, + "step": 28460 + }, + { + "epoch": 0.12222766028695808, + "grad_norm": 14.785316467285156, + "learning_rate": 8.815613600889939e-05, + "loss": 0.3463857650756836, + "step": 28470 + }, + { + "epoch": 0.1222705923769781, + "grad_norm": 0.17666365206241608, + "learning_rate": 8.815182428878177e-05, + "loss": 0.3569460868835449, + "step": 28480 + }, + { + "epoch": 0.1223135244669981, + "grad_norm": 0.2478390485048294, + "learning_rate": 8.814751256866414e-05, + "loss": 0.25950796604156495, + "step": 28490 + }, + { + "epoch": 0.12235645655701811, + "grad_norm": 0.018371712416410446, + "learning_rate": 8.814320084854652e-05, + "loss": 0.6131328582763672, + "step": 28500 + }, + { + "epoch": 0.12239938864703812, + "grad_norm": 1.7847379446029663, + "learning_rate": 8.81388891284289e-05, + "loss": 0.3412437677383423, + "step": 28510 + }, + { + "epoch": 0.12244232073705812, + "grad_norm": 3.38214111328125, + "learning_rate": 8.813457740831128e-05, + "loss": 0.37290809154510496, + "step": 28520 + }, + { + "epoch": 0.12248525282707813, + "grad_norm": 0.06898491084575653, + "learning_rate": 8.813026568819365e-05, + "loss": 0.2055596351623535, + "step": 28530 + }, + { + "epoch": 0.12252818491709813, + "grad_norm": 2.6301519870758057, + "learning_rate": 8.812595396807603e-05, + "loss": 0.31752374172210696, + "step": 28540 + }, + { + "epoch": 0.12257111700711815, + "grad_norm": 2.2748639583587646, + "learning_rate": 8.812164224795841e-05, + "loss": 0.2857156038284302, + "step": 28550 + }, + { + "epoch": 0.12261404909713815, + "grad_norm": 0.08286463469266891, + "learning_rate": 8.811733052784078e-05, + "loss": 0.2685399055480957, + "step": 28560 + }, + { + "epoch": 0.12265698118715815, + "grad_norm": 2.439558744430542, + "learning_rate": 8.811301880772316e-05, + "loss": 0.3968736410140991, + "step": 28570 + }, + { + "epoch": 0.12269991327717816, + "grad_norm": 0.1083855926990509, + "learning_rate": 8.810870708760554e-05, + "loss": 0.08559461236000061, + "step": 28580 + }, + { + "epoch": 0.12274284536719816, + "grad_norm": 2.2437236309051514, + "learning_rate": 8.810439536748792e-05, + "loss": 0.15293599367141725, + "step": 28590 + }, + { + "epoch": 0.12278577745721818, + "grad_norm": 0.05528045818209648, + "learning_rate": 8.81000836473703e-05, + "loss": 0.28284125328063964, + "step": 28600 + }, + { + "epoch": 0.12282870954723818, + "grad_norm": 0.0032772794365882874, + "learning_rate": 8.809577192725266e-05, + "loss": 0.10475077629089355, + "step": 28610 + }, + { + "epoch": 0.12287164163725818, + "grad_norm": 0.0006892705569043756, + "learning_rate": 8.809146020713504e-05, + "loss": 0.17690064907073974, + "step": 28620 + }, + { + "epoch": 0.1229145737272782, + "grad_norm": 0.005972778424620628, + "learning_rate": 8.808714848701741e-05, + "loss": 0.16229041814804077, + "step": 28630 + }, + { + "epoch": 0.1229575058172982, + "grad_norm": 0.0009143303614109755, + "learning_rate": 8.808283676689979e-05, + "loss": 0.2857463598251343, + "step": 28640 + }, + { + "epoch": 0.12300043790731821, + "grad_norm": 0.006627853959798813, + "learning_rate": 8.807852504678217e-05, + "loss": 0.27227063179016114, + "step": 28650 + }, + { + "epoch": 0.12304336999733821, + "grad_norm": 0.07442281395196915, + "learning_rate": 8.807421332666454e-05, + "loss": 0.1588853716850281, + "step": 28660 + }, + { + "epoch": 0.12308630208735821, + "grad_norm": 0.05322013795375824, + "learning_rate": 8.806990160654692e-05, + "loss": 0.10874921083450317, + "step": 28670 + }, + { + "epoch": 0.12312923417737823, + "grad_norm": 1.68862783908844, + "learning_rate": 8.80655898864293e-05, + "loss": 0.33600821495056155, + "step": 28680 + }, + { + "epoch": 0.12317216626739823, + "grad_norm": 0.10802485793828964, + "learning_rate": 8.806127816631166e-05, + "loss": 0.15215885639190674, + "step": 28690 + }, + { + "epoch": 0.12321509835741823, + "grad_norm": 3.715789318084717, + "learning_rate": 8.805696644619404e-05, + "loss": 0.37726900577545164, + "step": 28700 + }, + { + "epoch": 0.12325803044743824, + "grad_norm": 0.18458077311515808, + "learning_rate": 8.805265472607642e-05, + "loss": 0.07564336061477661, + "step": 28710 + }, + { + "epoch": 0.12330096253745824, + "grad_norm": 0.2841893136501312, + "learning_rate": 8.80483430059588e-05, + "loss": 0.16145308017730714, + "step": 28720 + }, + { + "epoch": 0.12334389462747826, + "grad_norm": 0.1921205371618271, + "learning_rate": 8.804403128584117e-05, + "loss": 0.3189536571502686, + "step": 28730 + }, + { + "epoch": 0.12338682671749826, + "grad_norm": 12.66451358795166, + "learning_rate": 8.803971956572355e-05, + "loss": 0.2783447504043579, + "step": 28740 + }, + { + "epoch": 0.12342975880751826, + "grad_norm": 0.4492635130882263, + "learning_rate": 8.803540784560593e-05, + "loss": 0.10007522106170655, + "step": 28750 + }, + { + "epoch": 0.12347269089753828, + "grad_norm": 0.025158774107694626, + "learning_rate": 8.80310961254883e-05, + "loss": 0.3440743923187256, + "step": 28760 + }, + { + "epoch": 0.12351562298755828, + "grad_norm": 0.020711267367005348, + "learning_rate": 8.802678440537068e-05, + "loss": 0.3100674867630005, + "step": 28770 + }, + { + "epoch": 0.12355855507757829, + "grad_norm": 0.0499393492937088, + "learning_rate": 8.802247268525306e-05, + "loss": 0.16550155878067016, + "step": 28780 + }, + { + "epoch": 0.12360148716759829, + "grad_norm": 2.1590919494628906, + "learning_rate": 8.801816096513544e-05, + "loss": 0.27875704765319825, + "step": 28790 + }, + { + "epoch": 0.1236444192576183, + "grad_norm": 0.07582681626081467, + "learning_rate": 8.801384924501781e-05, + "loss": 0.1742209553718567, + "step": 28800 + }, + { + "epoch": 0.12368735134763831, + "grad_norm": 3.8447768688201904, + "learning_rate": 8.800953752490019e-05, + "loss": 0.254422926902771, + "step": 28810 + }, + { + "epoch": 0.12373028343765831, + "grad_norm": 2.4385411739349365, + "learning_rate": 8.800522580478257e-05, + "loss": 0.1299283981323242, + "step": 28820 + }, + { + "epoch": 0.12377321552767832, + "grad_norm": 2.0784387588500977, + "learning_rate": 8.800091408466495e-05, + "loss": 0.3685250520706177, + "step": 28830 + }, + { + "epoch": 0.12381614761769832, + "grad_norm": 0.04812368005514145, + "learning_rate": 8.799660236454732e-05, + "loss": 0.18473578691482545, + "step": 28840 + }, + { + "epoch": 0.12385907970771833, + "grad_norm": 12.649018287658691, + "learning_rate": 8.79922906444297e-05, + "loss": 0.04986898601055145, + "step": 28850 + }, + { + "epoch": 0.12390201179773834, + "grad_norm": 0.06861792504787445, + "learning_rate": 8.798797892431206e-05, + "loss": 0.16036927700042725, + "step": 28860 + }, + { + "epoch": 0.12394494388775834, + "grad_norm": 8.813103675842285, + "learning_rate": 8.798366720419444e-05, + "loss": 0.4332833766937256, + "step": 28870 + }, + { + "epoch": 0.12398787597777836, + "grad_norm": 0.10298661142587662, + "learning_rate": 8.797935548407682e-05, + "loss": 0.2906115293502808, + "step": 28880 + }, + { + "epoch": 0.12403080806779836, + "grad_norm": 0.008993543684482574, + "learning_rate": 8.79750437639592e-05, + "loss": 0.1359240174293518, + "step": 28890 + }, + { + "epoch": 0.12407374015781836, + "grad_norm": 3.4008004665374756, + "learning_rate": 8.797073204384157e-05, + "loss": 0.29442768096923827, + "step": 28900 + }, + { + "epoch": 0.12411667224783837, + "grad_norm": 0.010733728297054768, + "learning_rate": 8.796642032372395e-05, + "loss": 0.418992280960083, + "step": 28910 + }, + { + "epoch": 0.12415960433785837, + "grad_norm": 0.8852447867393494, + "learning_rate": 8.796210860360633e-05, + "loss": 0.2331214427947998, + "step": 28920 + }, + { + "epoch": 0.12420253642787839, + "grad_norm": 2.394709587097168, + "learning_rate": 8.79577968834887e-05, + "loss": 0.25450208187103274, + "step": 28930 + }, + { + "epoch": 0.12424546851789839, + "grad_norm": 0.205605149269104, + "learning_rate": 8.795348516337107e-05, + "loss": 0.16247061491012574, + "step": 28940 + }, + { + "epoch": 0.12428840060791839, + "grad_norm": 1.9020476341247559, + "learning_rate": 8.794917344325345e-05, + "loss": 0.6566081047058105, + "step": 28950 + }, + { + "epoch": 0.1243313326979384, + "grad_norm": 0.033048976212739944, + "learning_rate": 8.794486172313582e-05, + "loss": 0.32285366058349607, + "step": 28960 + }, + { + "epoch": 0.1243742647879584, + "grad_norm": 0.02315451204776764, + "learning_rate": 8.79405500030182e-05, + "loss": 0.12404880523681641, + "step": 28970 + }, + { + "epoch": 0.12441719687797842, + "grad_norm": 0.011411737650632858, + "learning_rate": 8.793623828290058e-05, + "loss": 0.018471239507198332, + "step": 28980 + }, + { + "epoch": 0.12446012896799842, + "grad_norm": 0.11584151536226273, + "learning_rate": 8.793192656278296e-05, + "loss": 0.3177420854568481, + "step": 28990 + }, + { + "epoch": 0.12450306105801842, + "grad_norm": 0.003826763015240431, + "learning_rate": 8.792761484266533e-05, + "loss": 0.1817054867744446, + "step": 29000 + }, + { + "epoch": 0.12450306105801842, + "eval_loss": 0.4792439639568329, + "eval_runtime": 27.5002, + "eval_samples_per_second": 3.636, + "eval_steps_per_second": 3.636, + "step": 29000 + }, + { + "epoch": 0.12454599314803844, + "grad_norm": 0.684794545173645, + "learning_rate": 8.792330312254772e-05, + "loss": 0.11615618467330932, + "step": 29010 + }, + { + "epoch": 0.12458892523805844, + "grad_norm": 3.8045012950897217, + "learning_rate": 8.791899140243009e-05, + "loss": 0.29504122734069826, + "step": 29020 + }, + { + "epoch": 0.12463185732807845, + "grad_norm": 0.0482652485370636, + "learning_rate": 8.791467968231247e-05, + "loss": 0.06096305847167969, + "step": 29030 + }, + { + "epoch": 0.12467478941809845, + "grad_norm": 0.11692120879888535, + "learning_rate": 8.791036796219484e-05, + "loss": 0.2260446786880493, + "step": 29040 + }, + { + "epoch": 0.12471772150811845, + "grad_norm": 0.003027498023584485, + "learning_rate": 8.790605624207722e-05, + "loss": 0.1288272976875305, + "step": 29050 + }, + { + "epoch": 0.12476065359813847, + "grad_norm": 1.686038613319397, + "learning_rate": 8.79017445219596e-05, + "loss": 0.4686258316040039, + "step": 29060 + }, + { + "epoch": 0.12480358568815847, + "grad_norm": 0.27779895067214966, + "learning_rate": 8.789743280184197e-05, + "loss": 0.20123896598815919, + "step": 29070 + }, + { + "epoch": 0.12484651777817848, + "grad_norm": 5.1882805824279785, + "learning_rate": 8.789312108172435e-05, + "loss": 0.41694116592407227, + "step": 29080 + }, + { + "epoch": 0.12488944986819848, + "grad_norm": 0.6722069978713989, + "learning_rate": 8.788880936160673e-05, + "loss": 0.29966685771942136, + "step": 29090 + }, + { + "epoch": 0.12493238195821849, + "grad_norm": 1.6656306982040405, + "learning_rate": 8.788449764148911e-05, + "loss": 0.15465418100357056, + "step": 29100 + }, + { + "epoch": 0.1249753140482385, + "grad_norm": 0.1642853319644928, + "learning_rate": 8.788018592137147e-05, + "loss": 0.3329266309738159, + "step": 29110 + }, + { + "epoch": 0.12501824613825852, + "grad_norm": 2.975208044052124, + "learning_rate": 8.787587420125385e-05, + "loss": 0.1679534912109375, + "step": 29120 + }, + { + "epoch": 0.1250611782282785, + "grad_norm": 0.11888681352138519, + "learning_rate": 8.787156248113623e-05, + "loss": 0.06416876316070556, + "step": 29130 + }, + { + "epoch": 0.12510411031829852, + "grad_norm": 1.4387938976287842, + "learning_rate": 8.78672507610186e-05, + "loss": 0.26547038555145264, + "step": 29140 + }, + { + "epoch": 0.12514704240831853, + "grad_norm": 0.0023374587763100863, + "learning_rate": 8.786293904090098e-05, + "loss": 0.10321999788284301, + "step": 29150 + }, + { + "epoch": 0.12518997449833852, + "grad_norm": 6.412931442260742, + "learning_rate": 8.785862732078336e-05, + "loss": 0.49428510665893555, + "step": 29160 + }, + { + "epoch": 0.12523290658835853, + "grad_norm": 0.05729628726840019, + "learning_rate": 8.785431560066573e-05, + "loss": 0.2725868225097656, + "step": 29170 + }, + { + "epoch": 0.12527583867837855, + "grad_norm": 0.06620445102453232, + "learning_rate": 8.785000388054811e-05, + "loss": 0.27253124713897703, + "step": 29180 + }, + { + "epoch": 0.12531877076839854, + "grad_norm": 0.11173108965158463, + "learning_rate": 8.784569216043048e-05, + "loss": 0.2758491992950439, + "step": 29190 + }, + { + "epoch": 0.12536170285841855, + "grad_norm": 1.4134966135025024, + "learning_rate": 8.784138044031285e-05, + "loss": 0.2473074197769165, + "step": 29200 + }, + { + "epoch": 0.12540463494843856, + "grad_norm": 3.598130941390991, + "learning_rate": 8.783706872019523e-05, + "loss": 0.19799611568450928, + "step": 29210 + }, + { + "epoch": 0.12544756703845858, + "grad_norm": 0.28464409708976746, + "learning_rate": 8.783275700007761e-05, + "loss": 0.23102359771728515, + "step": 29220 + }, + { + "epoch": 0.12549049912847857, + "grad_norm": 0.09032626450061798, + "learning_rate": 8.782844527996e-05, + "loss": 0.23052756786346434, + "step": 29230 + }, + { + "epoch": 0.12553343121849858, + "grad_norm": 0.725068211555481, + "learning_rate": 8.782413355984238e-05, + "loss": 0.3960889339447021, + "step": 29240 + }, + { + "epoch": 0.1255763633085186, + "grad_norm": 0.06919623166322708, + "learning_rate": 8.781982183972475e-05, + "loss": 0.12070854902267455, + "step": 29250 + }, + { + "epoch": 0.12561929539853858, + "grad_norm": 2.4769787788391113, + "learning_rate": 8.781551011960713e-05, + "loss": 0.21523828506469728, + "step": 29260 + }, + { + "epoch": 0.1256622274885586, + "grad_norm": 0.012916326522827148, + "learning_rate": 8.78111983994895e-05, + "loss": 0.08447671532630921, + "step": 29270 + }, + { + "epoch": 0.1257051595785786, + "grad_norm": 0.08832432329654694, + "learning_rate": 8.780688667937187e-05, + "loss": 0.17949587106704712, + "step": 29280 + }, + { + "epoch": 0.1257480916685986, + "grad_norm": 1.1982070207595825, + "learning_rate": 8.780257495925425e-05, + "loss": 0.3396637439727783, + "step": 29290 + }, + { + "epoch": 0.12579102375861861, + "grad_norm": 0.0024188838433474302, + "learning_rate": 8.779826323913663e-05, + "loss": 0.15142393112182617, + "step": 29300 + }, + { + "epoch": 0.12583395584863863, + "grad_norm": 1.0369142293930054, + "learning_rate": 8.7793951519019e-05, + "loss": 0.3342637300491333, + "step": 29310 + }, + { + "epoch": 0.12587688793865862, + "grad_norm": 0.015218590386211872, + "learning_rate": 8.778963979890138e-05, + "loss": 0.04915739297866821, + "step": 29320 + }, + { + "epoch": 0.12591982002867863, + "grad_norm": 5.906143665313721, + "learning_rate": 8.778532807878376e-05, + "loss": 0.18448089361190795, + "step": 29330 + }, + { + "epoch": 0.12596275211869865, + "grad_norm": 5.609766483306885, + "learning_rate": 8.778101635866614e-05, + "loss": 0.16584160327911376, + "step": 29340 + }, + { + "epoch": 0.12600568420871866, + "grad_norm": 1.786668300628662, + "learning_rate": 8.77767046385485e-05, + "loss": 0.2621778964996338, + "step": 29350 + }, + { + "epoch": 0.12604861629873865, + "grad_norm": 0.0621095634996891, + "learning_rate": 8.777239291843088e-05, + "loss": 0.35020430088043214, + "step": 29360 + }, + { + "epoch": 0.12609154838875866, + "grad_norm": 0.5814092755317688, + "learning_rate": 8.776808119831325e-05, + "loss": 0.20505285263061523, + "step": 29370 + }, + { + "epoch": 0.12613448047877868, + "grad_norm": 1.501968502998352, + "learning_rate": 8.776376947819563e-05, + "loss": 0.28874716758728025, + "step": 29380 + }, + { + "epoch": 0.12617741256879866, + "grad_norm": 2.841974973678589, + "learning_rate": 8.775945775807801e-05, + "loss": 0.30657010078430175, + "step": 29390 + }, + { + "epoch": 0.12622034465881868, + "grad_norm": 0.15167303383350372, + "learning_rate": 8.775514603796039e-05, + "loss": 0.20814452171325684, + "step": 29400 + }, + { + "epoch": 0.1262632767488387, + "grad_norm": 0.4209035634994507, + "learning_rate": 8.775083431784276e-05, + "loss": 0.1624962329864502, + "step": 29410 + }, + { + "epoch": 0.12630620883885868, + "grad_norm": 0.1125643402338028, + "learning_rate": 8.774652259772514e-05, + "loss": 0.20734481811523436, + "step": 29420 + }, + { + "epoch": 0.1263491409288787, + "grad_norm": 0.6038672924041748, + "learning_rate": 8.774221087760752e-05, + "loss": 0.13862569332122804, + "step": 29430 + }, + { + "epoch": 0.1263920730188987, + "grad_norm": 1.3439053297042847, + "learning_rate": 8.773789915748988e-05, + "loss": 0.18771252632141114, + "step": 29440 + }, + { + "epoch": 0.12643500510891872, + "grad_norm": 0.06666406989097595, + "learning_rate": 8.773358743737227e-05, + "loss": 0.20646681785583496, + "step": 29450 + }, + { + "epoch": 0.1264779371989387, + "grad_norm": 0.06401768326759338, + "learning_rate": 8.772927571725465e-05, + "loss": 0.2532331466674805, + "step": 29460 + }, + { + "epoch": 0.12652086928895873, + "grad_norm": 0.024090800434350967, + "learning_rate": 8.772496399713703e-05, + "loss": 0.17938752174377443, + "step": 29470 + }, + { + "epoch": 0.12656380137897874, + "grad_norm": 0.011575686745345592, + "learning_rate": 8.77206522770194e-05, + "loss": 0.21051297187805176, + "step": 29480 + }, + { + "epoch": 0.12660673346899873, + "grad_norm": 0.15312832593917847, + "learning_rate": 8.771634055690178e-05, + "loss": 0.20514678955078125, + "step": 29490 + }, + { + "epoch": 0.12664966555901874, + "grad_norm": 27.025413513183594, + "learning_rate": 8.771202883678416e-05, + "loss": 0.21771588325500488, + "step": 29500 + }, + { + "epoch": 0.12669259764903876, + "grad_norm": 0.03582566976547241, + "learning_rate": 8.770771711666654e-05, + "loss": 0.43050341606140136, + "step": 29510 + }, + { + "epoch": 0.12673552973905874, + "grad_norm": 0.01910443976521492, + "learning_rate": 8.77034053965489e-05, + "loss": 0.3661238193511963, + "step": 29520 + }, + { + "epoch": 0.12677846182907876, + "grad_norm": 0.4073588252067566, + "learning_rate": 8.769909367643128e-05, + "loss": 0.24593477249145507, + "step": 29530 + }, + { + "epoch": 0.12682139391909877, + "grad_norm": 0.019970379769802094, + "learning_rate": 8.769478195631366e-05, + "loss": 0.300111198425293, + "step": 29540 + }, + { + "epoch": 0.1268643260091188, + "grad_norm": 0.026408078148961067, + "learning_rate": 8.769047023619603e-05, + "loss": 0.17894766330718995, + "step": 29550 + }, + { + "epoch": 0.12690725809913878, + "grad_norm": 0.013225136324763298, + "learning_rate": 8.768615851607841e-05, + "loss": 0.23364582061767578, + "step": 29560 + }, + { + "epoch": 0.1269501901891588, + "grad_norm": 0.043238185346126556, + "learning_rate": 8.768184679596079e-05, + "loss": 0.35565433502197263, + "step": 29570 + }, + { + "epoch": 0.1269931222791788, + "grad_norm": 0.3078600764274597, + "learning_rate": 8.767753507584317e-05, + "loss": 0.2342392921447754, + "step": 29580 + }, + { + "epoch": 0.1270360543691988, + "grad_norm": 7.824125289916992, + "learning_rate": 8.767322335572554e-05, + "loss": 0.21343417167663575, + "step": 29590 + }, + { + "epoch": 0.1270789864592188, + "grad_norm": 0.45437708497047424, + "learning_rate": 8.76689116356079e-05, + "loss": 0.0968501091003418, + "step": 29600 + }, + { + "epoch": 0.12712191854923882, + "grad_norm": 2.1558401584625244, + "learning_rate": 8.766459991549028e-05, + "loss": 0.25272440910339355, + "step": 29610 + }, + { + "epoch": 0.1271648506392588, + "grad_norm": 0.08883268386125565, + "learning_rate": 8.766028819537266e-05, + "loss": 0.21234755516052245, + "step": 29620 + }, + { + "epoch": 0.12720778272927882, + "grad_norm": 0.0019184901611879468, + "learning_rate": 8.765597647525504e-05, + "loss": 0.10313694477081299, + "step": 29630 + }, + { + "epoch": 0.12725071481929884, + "grad_norm": 0.31680187582969666, + "learning_rate": 8.765166475513742e-05, + "loss": 0.20983920097351075, + "step": 29640 + }, + { + "epoch": 0.12729364690931885, + "grad_norm": 1.115886926651001, + "learning_rate": 8.764735303501979e-05, + "loss": 0.061583316326141356, + "step": 29650 + }, + { + "epoch": 0.12733657899933884, + "grad_norm": 0.011549929156899452, + "learning_rate": 8.764304131490217e-05, + "loss": 0.009761539101600648, + "step": 29660 + }, + { + "epoch": 0.12737951108935885, + "grad_norm": 0.7803761959075928, + "learning_rate": 8.763872959478455e-05, + "loss": 0.26551635265350343, + "step": 29670 + }, + { + "epoch": 0.12742244317937887, + "grad_norm": 0.0001464521192247048, + "learning_rate": 8.763441787466693e-05, + "loss": 0.24804928302764892, + "step": 29680 + }, + { + "epoch": 0.12746537526939886, + "grad_norm": 0.04745858907699585, + "learning_rate": 8.76301061545493e-05, + "loss": 0.07876437306404113, + "step": 29690 + }, + { + "epoch": 0.12750830735941887, + "grad_norm": 3.122537136077881, + "learning_rate": 8.762579443443168e-05, + "loss": 0.5715325832366943, + "step": 29700 + }, + { + "epoch": 0.12755123944943889, + "grad_norm": 0.0008646674104966223, + "learning_rate": 8.762148271431406e-05, + "loss": 0.4441582679748535, + "step": 29710 + }, + { + "epoch": 0.12759417153945887, + "grad_norm": 0.30260586738586426, + "learning_rate": 8.761717099419643e-05, + "loss": 0.495958948135376, + "step": 29720 + }, + { + "epoch": 0.1276371036294789, + "grad_norm": 1.3628997802734375, + "learning_rate": 8.761285927407881e-05, + "loss": 0.1913755178451538, + "step": 29730 + }, + { + "epoch": 0.1276800357194989, + "grad_norm": 0.05150032043457031, + "learning_rate": 8.760854755396119e-05, + "loss": 0.04136924743652344, + "step": 29740 + }, + { + "epoch": 0.1277229678095189, + "grad_norm": 0.016917334869503975, + "learning_rate": 8.760423583384357e-05, + "loss": 0.3162794828414917, + "step": 29750 + }, + { + "epoch": 0.1277658998995389, + "grad_norm": 1.61897611618042, + "learning_rate": 8.759992411372593e-05, + "loss": 0.2959132194519043, + "step": 29760 + }, + { + "epoch": 0.12780883198955892, + "grad_norm": 0.04096636176109314, + "learning_rate": 8.759561239360831e-05, + "loss": 0.2208381175994873, + "step": 29770 + }, + { + "epoch": 0.12785176407957893, + "grad_norm": 1.580322265625, + "learning_rate": 8.759130067349068e-05, + "loss": 0.2737943172454834, + "step": 29780 + }, + { + "epoch": 0.12789469616959892, + "grad_norm": 0.030307283625006676, + "learning_rate": 8.758698895337306e-05, + "loss": 0.16907756328582763, + "step": 29790 + }, + { + "epoch": 0.12793762825961894, + "grad_norm": 0.04987514391541481, + "learning_rate": 8.758267723325544e-05, + "loss": 0.07499375939369202, + "step": 29800 + }, + { + "epoch": 0.12798056034963895, + "grad_norm": 0.363908976316452, + "learning_rate": 8.757836551313782e-05, + "loss": 0.30732824802398684, + "step": 29810 + }, + { + "epoch": 0.12802349243965894, + "grad_norm": 0.3449194133281708, + "learning_rate": 8.75740537930202e-05, + "loss": 0.4999234676361084, + "step": 29820 + }, + { + "epoch": 0.12806642452967895, + "grad_norm": 0.8750787377357483, + "learning_rate": 8.756974207290257e-05, + "loss": 0.30790679454803466, + "step": 29830 + }, + { + "epoch": 0.12810935661969897, + "grad_norm": 1.4069339036941528, + "learning_rate": 8.756543035278495e-05, + "loss": 0.21252622604370117, + "step": 29840 + }, + { + "epoch": 0.12815228870971895, + "grad_norm": 0.7201815247535706, + "learning_rate": 8.756111863266731e-05, + "loss": 0.2888744831085205, + "step": 29850 + }, + { + "epoch": 0.12819522079973897, + "grad_norm": 3.888648509979248, + "learning_rate": 8.755680691254969e-05, + "loss": 0.08338718414306641, + "step": 29860 + }, + { + "epoch": 0.12823815288975898, + "grad_norm": 0.02963382750749588, + "learning_rate": 8.755249519243207e-05, + "loss": 0.032802003622055056, + "step": 29870 + }, + { + "epoch": 0.128281084979779, + "grad_norm": 0.7857760787010193, + "learning_rate": 8.754818347231444e-05, + "loss": 0.16297609806060792, + "step": 29880 + }, + { + "epoch": 0.12832401706979898, + "grad_norm": 2.595731019973755, + "learning_rate": 8.754387175219682e-05, + "loss": 0.19706374406814575, + "step": 29890 + }, + { + "epoch": 0.128366949159819, + "grad_norm": 1.7243250608444214, + "learning_rate": 8.75395600320792e-05, + "loss": 0.3066516637802124, + "step": 29900 + }, + { + "epoch": 0.12840988124983901, + "grad_norm": 0.018426483497023582, + "learning_rate": 8.753524831196158e-05, + "loss": 0.46737966537475584, + "step": 29910 + }, + { + "epoch": 0.128452813339859, + "grad_norm": 2.102694272994995, + "learning_rate": 8.753093659184395e-05, + "loss": 0.23304004669189454, + "step": 29920 + }, + { + "epoch": 0.12849574542987902, + "grad_norm": 0.03168783336877823, + "learning_rate": 8.752662487172633e-05, + "loss": 0.32050702571868894, + "step": 29930 + }, + { + "epoch": 0.12853867751989903, + "grad_norm": 3.586268663406372, + "learning_rate": 8.752231315160871e-05, + "loss": 0.4045413494110107, + "step": 29940 + }, + { + "epoch": 0.12858160960991902, + "grad_norm": 0.01576436124742031, + "learning_rate": 8.751800143149109e-05, + "loss": 0.19724249839782715, + "step": 29950 + }, + { + "epoch": 0.12862454169993903, + "grad_norm": 0.03131139278411865, + "learning_rate": 8.751368971137346e-05, + "loss": 0.1462864637374878, + "step": 29960 + }, + { + "epoch": 0.12866747378995905, + "grad_norm": 1.590073585510254, + "learning_rate": 8.750937799125584e-05, + "loss": 0.643623161315918, + "step": 29970 + }, + { + "epoch": 0.12871040587997906, + "grad_norm": 0.17035600543022156, + "learning_rate": 8.750506627113822e-05, + "loss": 0.22655746936798096, + "step": 29980 + }, + { + "epoch": 0.12875333796999905, + "grad_norm": 0.10757172107696533, + "learning_rate": 8.75007545510206e-05, + "loss": 0.4978126049041748, + "step": 29990 + }, + { + "epoch": 0.12879627006001906, + "grad_norm": 0.06894957274198532, + "learning_rate": 8.749644283090297e-05, + "loss": 0.10065504312515258, + "step": 30000 + }, + { + "epoch": 0.12879627006001906, + "eval_loss": 0.4761184751987457, + "eval_runtime": 27.4021, + "eval_samples_per_second": 3.649, + "eval_steps_per_second": 3.649, + "step": 30000 + }, + { + "epoch": 0.12883920215003908, + "grad_norm": 1.8203767538070679, + "learning_rate": 8.749213111078534e-05, + "loss": 0.20030674934387208, + "step": 30010 + }, + { + "epoch": 0.12888213424005907, + "grad_norm": 2.7183761596679688, + "learning_rate": 8.748781939066771e-05, + "loss": 0.263798451423645, + "step": 30020 + }, + { + "epoch": 0.12892506633007908, + "grad_norm": 0.060718487948179245, + "learning_rate": 8.748350767055009e-05, + "loss": 0.2561044692993164, + "step": 30030 + }, + { + "epoch": 0.1289679984200991, + "grad_norm": 4.440418720245361, + "learning_rate": 8.747919595043247e-05, + "loss": 0.14885461330413818, + "step": 30040 + }, + { + "epoch": 0.12901093051011908, + "grad_norm": 0.02631818689405918, + "learning_rate": 8.747488423031485e-05, + "loss": 0.05507946014404297, + "step": 30050 + }, + { + "epoch": 0.1290538626001391, + "grad_norm": 1.9612177610397339, + "learning_rate": 8.747057251019722e-05, + "loss": 0.3454012632369995, + "step": 30060 + }, + { + "epoch": 0.1290967946901591, + "grad_norm": 9.800521850585938, + "learning_rate": 8.74662607900796e-05, + "loss": 0.38835835456848145, + "step": 30070 + }, + { + "epoch": 0.12913972678017913, + "grad_norm": 0.984861433506012, + "learning_rate": 8.746194906996198e-05, + "loss": 0.19478857517242432, + "step": 30080 + }, + { + "epoch": 0.1291826588701991, + "grad_norm": 4.223658084869385, + "learning_rate": 8.745763734984434e-05, + "loss": 0.11445388793945313, + "step": 30090 + }, + { + "epoch": 0.12922559096021913, + "grad_norm": 0.6742361783981323, + "learning_rate": 8.745332562972672e-05, + "loss": 0.29052252769470216, + "step": 30100 + }, + { + "epoch": 0.12926852305023914, + "grad_norm": 1.4758702516555786, + "learning_rate": 8.74490139096091e-05, + "loss": 0.053003185987472536, + "step": 30110 + }, + { + "epoch": 0.12931145514025913, + "grad_norm": 0.11309056729078293, + "learning_rate": 8.744470218949147e-05, + "loss": 0.12848434448242188, + "step": 30120 + }, + { + "epoch": 0.12935438723027914, + "grad_norm": 0.041068606078624725, + "learning_rate": 8.744039046937385e-05, + "loss": 0.31078152656555175, + "step": 30130 + }, + { + "epoch": 0.12939731932029916, + "grad_norm": 1.0843498706817627, + "learning_rate": 8.743607874925623e-05, + "loss": 0.2896578788757324, + "step": 30140 + }, + { + "epoch": 0.12944025141031915, + "grad_norm": 0.13427147269248962, + "learning_rate": 8.74317670291386e-05, + "loss": 0.07495735883712769, + "step": 30150 + }, + { + "epoch": 0.12948318350033916, + "grad_norm": 0.03397704288363457, + "learning_rate": 8.742745530902098e-05, + "loss": 0.291317081451416, + "step": 30160 + }, + { + "epoch": 0.12952611559035918, + "grad_norm": 0.003932945430278778, + "learning_rate": 8.742314358890336e-05, + "loss": 0.2433781862258911, + "step": 30170 + }, + { + "epoch": 0.12956904768037916, + "grad_norm": 0.04297548905014992, + "learning_rate": 8.741883186878574e-05, + "loss": 0.3208784580230713, + "step": 30180 + }, + { + "epoch": 0.12961197977039918, + "grad_norm": 0.07267145812511444, + "learning_rate": 8.741452014866812e-05, + "loss": 0.4771559715270996, + "step": 30190 + }, + { + "epoch": 0.1296549118604192, + "grad_norm": 0.1928095817565918, + "learning_rate": 8.741020842855049e-05, + "loss": 0.054413968324661256, + "step": 30200 + }, + { + "epoch": 0.1296978439504392, + "grad_norm": 0.00119906070176512, + "learning_rate": 8.740589670843287e-05, + "loss": 0.1544780135154724, + "step": 30210 + }, + { + "epoch": 0.1297407760404592, + "grad_norm": 5.354598522186279, + "learning_rate": 8.740158498831525e-05, + "loss": 0.29805917739868165, + "step": 30220 + }, + { + "epoch": 0.1297837081304792, + "grad_norm": 18.67365074157715, + "learning_rate": 8.739727326819762e-05, + "loss": 0.13847802877426146, + "step": 30230 + }, + { + "epoch": 0.12982664022049922, + "grad_norm": 1.6385403871536255, + "learning_rate": 8.739296154808e-05, + "loss": 0.21875405311584473, + "step": 30240 + }, + { + "epoch": 0.1298695723105192, + "grad_norm": 7.299107074737549, + "learning_rate": 8.738864982796238e-05, + "loss": 0.24007067680358887, + "step": 30250 + }, + { + "epoch": 0.12991250440053922, + "grad_norm": 0.05835841968655586, + "learning_rate": 8.738433810784474e-05, + "loss": 0.1011542797088623, + "step": 30260 + }, + { + "epoch": 0.12995543649055924, + "grad_norm": 0.01663939282298088, + "learning_rate": 8.738002638772712e-05, + "loss": 0.23251605033874512, + "step": 30270 + }, + { + "epoch": 0.12999836858057923, + "grad_norm": 0.441267728805542, + "learning_rate": 8.73757146676095e-05, + "loss": 0.3203972101211548, + "step": 30280 + }, + { + "epoch": 0.13004130067059924, + "grad_norm": 0.03272169828414917, + "learning_rate": 8.737140294749188e-05, + "loss": 0.05306289196014404, + "step": 30290 + }, + { + "epoch": 0.13008423276061926, + "grad_norm": 0.06436789780855179, + "learning_rate": 8.736709122737425e-05, + "loss": 0.2278662919998169, + "step": 30300 + }, + { + "epoch": 0.13012716485063927, + "grad_norm": 0.3606188893318176, + "learning_rate": 8.736277950725663e-05, + "loss": 0.4411325931549072, + "step": 30310 + }, + { + "epoch": 0.13017009694065926, + "grad_norm": 0.052819643169641495, + "learning_rate": 8.735846778713901e-05, + "loss": 0.2932943820953369, + "step": 30320 + }, + { + "epoch": 0.13021302903067927, + "grad_norm": 0.0145418681204319, + "learning_rate": 8.735415606702138e-05, + "loss": 0.1358722925186157, + "step": 30330 + }, + { + "epoch": 0.1302559611206993, + "grad_norm": 3.2161331176757812, + "learning_rate": 8.734984434690375e-05, + "loss": 0.13851587772369384, + "step": 30340 + }, + { + "epoch": 0.13029889321071927, + "grad_norm": 0.01784207485616207, + "learning_rate": 8.734553262678613e-05, + "loss": 0.2483436346054077, + "step": 30350 + }, + { + "epoch": 0.1303418253007393, + "grad_norm": 0.6265615820884705, + "learning_rate": 8.73412209066685e-05, + "loss": 0.29143610000610354, + "step": 30360 + }, + { + "epoch": 0.1303847573907593, + "grad_norm": 0.11181551963090897, + "learning_rate": 8.733690918655088e-05, + "loss": 0.1619391918182373, + "step": 30370 + }, + { + "epoch": 0.1304276894807793, + "grad_norm": 0.06604292243719101, + "learning_rate": 8.733259746643326e-05, + "loss": 0.07990115880966187, + "step": 30380 + }, + { + "epoch": 0.1304706215707993, + "grad_norm": 0.1553955078125, + "learning_rate": 8.732828574631564e-05, + "loss": 0.13872065544128417, + "step": 30390 + }, + { + "epoch": 0.13051355366081932, + "grad_norm": 2.5285756587982178, + "learning_rate": 8.732397402619801e-05, + "loss": 0.26338150501251223, + "step": 30400 + }, + { + "epoch": 0.13055648575083934, + "grad_norm": 0.31697508692741394, + "learning_rate": 8.731966230608039e-05, + "loss": 0.5478155612945557, + "step": 30410 + }, + { + "epoch": 0.13059941784085932, + "grad_norm": 0.1644754558801651, + "learning_rate": 8.731535058596277e-05, + "loss": 0.11594902276992798, + "step": 30420 + }, + { + "epoch": 0.13064234993087934, + "grad_norm": 0.03448955714702606, + "learning_rate": 8.731103886584514e-05, + "loss": 0.11595598459243775, + "step": 30430 + }, + { + "epoch": 0.13068528202089935, + "grad_norm": 3.270960807800293, + "learning_rate": 8.730672714572752e-05, + "loss": 0.2628152847290039, + "step": 30440 + }, + { + "epoch": 0.13072821411091934, + "grad_norm": 0.08907277882099152, + "learning_rate": 8.73024154256099e-05, + "loss": 0.2462904453277588, + "step": 30450 + }, + { + "epoch": 0.13077114620093935, + "grad_norm": 0.35055065155029297, + "learning_rate": 8.729810370549228e-05, + "loss": 0.3140165567398071, + "step": 30460 + }, + { + "epoch": 0.13081407829095937, + "grad_norm": 3.522132635116577, + "learning_rate": 8.729379198537465e-05, + "loss": 0.22482681274414062, + "step": 30470 + }, + { + "epoch": 0.13085701038097936, + "grad_norm": 2.6127853393554688, + "learning_rate": 8.728948026525703e-05, + "loss": 0.20301475524902343, + "step": 30480 + }, + { + "epoch": 0.13089994247099937, + "grad_norm": 0.01888544298708439, + "learning_rate": 8.728516854513941e-05, + "loss": 0.1393720030784607, + "step": 30490 + }, + { + "epoch": 0.13094287456101938, + "grad_norm": 2.050440549850464, + "learning_rate": 8.728085682502177e-05, + "loss": 0.20471715927124023, + "step": 30500 + }, + { + "epoch": 0.1309858066510394, + "grad_norm": 2.6003317832946777, + "learning_rate": 8.727654510490415e-05, + "loss": 0.26553878784179685, + "step": 30510 + }, + { + "epoch": 0.1310287387410594, + "grad_norm": 0.009445443749427795, + "learning_rate": 8.727223338478653e-05, + "loss": 0.25958957672119143, + "step": 30520 + }, + { + "epoch": 0.1310716708310794, + "grad_norm": 0.04686051234602928, + "learning_rate": 8.72679216646689e-05, + "loss": 0.3284756660461426, + "step": 30530 + }, + { + "epoch": 0.13111460292109942, + "grad_norm": 1.3024394512176514, + "learning_rate": 8.726360994455128e-05, + "loss": 0.2646768569946289, + "step": 30540 + }, + { + "epoch": 0.1311575350111194, + "grad_norm": 1.392385482788086, + "learning_rate": 8.725929822443366e-05, + "loss": 0.2610862016677856, + "step": 30550 + }, + { + "epoch": 0.13120046710113942, + "grad_norm": 0.39657700061798096, + "learning_rate": 8.725498650431604e-05, + "loss": 0.12167308330535889, + "step": 30560 + }, + { + "epoch": 0.13124339919115943, + "grad_norm": 0.020843392238020897, + "learning_rate": 8.725067478419841e-05, + "loss": 0.055149370431900026, + "step": 30570 + }, + { + "epoch": 0.13128633128117942, + "grad_norm": 3.253476619720459, + "learning_rate": 8.724636306408079e-05, + "loss": 0.16969624757766724, + "step": 30580 + }, + { + "epoch": 0.13132926337119943, + "grad_norm": 1.004389762878418, + "learning_rate": 8.724205134396315e-05, + "loss": 0.095842045545578, + "step": 30590 + }, + { + "epoch": 0.13137219546121945, + "grad_norm": 0.9034222364425659, + "learning_rate": 8.723773962384553e-05, + "loss": 0.3282526254653931, + "step": 30600 + }, + { + "epoch": 0.13141512755123944, + "grad_norm": 0.10621494054794312, + "learning_rate": 8.723342790372791e-05, + "loss": 0.12817639112472534, + "step": 30610 + }, + { + "epoch": 0.13145805964125945, + "grad_norm": 5.3974761962890625, + "learning_rate": 8.722911618361029e-05, + "loss": 0.3108220100402832, + "step": 30620 + }, + { + "epoch": 0.13150099173127947, + "grad_norm": 0.13631728291511536, + "learning_rate": 8.722480446349266e-05, + "loss": 0.31007301807403564, + "step": 30630 + }, + { + "epoch": 0.13154392382129948, + "grad_norm": 0.018317611888051033, + "learning_rate": 8.722049274337506e-05, + "loss": 0.1647101402282715, + "step": 30640 + }, + { + "epoch": 0.13158685591131947, + "grad_norm": 0.646333634853363, + "learning_rate": 8.721618102325743e-05, + "loss": 0.41881618499755857, + "step": 30650 + }, + { + "epoch": 0.13162978800133948, + "grad_norm": 0.03925095498561859, + "learning_rate": 8.721186930313981e-05, + "loss": 0.22201454639434814, + "step": 30660 + }, + { + "epoch": 0.1316727200913595, + "grad_norm": 1.520325779914856, + "learning_rate": 8.720755758302217e-05, + "loss": 0.23899271488189697, + "step": 30670 + }, + { + "epoch": 0.13171565218137948, + "grad_norm": 0.3242422640323639, + "learning_rate": 8.720324586290455e-05, + "loss": 0.42775511741638184, + "step": 30680 + }, + { + "epoch": 0.1317585842713995, + "grad_norm": 1.1560478210449219, + "learning_rate": 8.719893414278693e-05, + "loss": 0.4399539947509766, + "step": 30690 + }, + { + "epoch": 0.1318015163614195, + "grad_norm": 0.62732994556427, + "learning_rate": 8.71946224226693e-05, + "loss": 0.15830096006393432, + "step": 30700 + }, + { + "epoch": 0.1318444484514395, + "grad_norm": 1.1266449689865112, + "learning_rate": 8.719031070255168e-05, + "loss": 0.4181190013885498, + "step": 30710 + }, + { + "epoch": 0.13188738054145951, + "grad_norm": 1.3097193241119385, + "learning_rate": 8.718599898243406e-05, + "loss": 0.15668928623199463, + "step": 30720 + }, + { + "epoch": 0.13193031263147953, + "grad_norm": 0.01987522467970848, + "learning_rate": 8.718168726231644e-05, + "loss": 0.20157217979431152, + "step": 30730 + }, + { + "epoch": 0.13197324472149954, + "grad_norm": 1.8036832809448242, + "learning_rate": 8.717737554219882e-05, + "loss": 0.2997883319854736, + "step": 30740 + }, + { + "epoch": 0.13201617681151953, + "grad_norm": 2.8631534576416016, + "learning_rate": 8.717306382208118e-05, + "loss": 0.45325074195861814, + "step": 30750 + }, + { + "epoch": 0.13205910890153955, + "grad_norm": 4.630545616149902, + "learning_rate": 8.716875210196356e-05, + "loss": 0.15610417127609252, + "step": 30760 + }, + { + "epoch": 0.13210204099155956, + "grad_norm": 1.621471643447876, + "learning_rate": 8.716444038184593e-05, + "loss": 0.26927714347839354, + "step": 30770 + }, + { + "epoch": 0.13214497308157955, + "grad_norm": 0.38395434617996216, + "learning_rate": 8.716012866172831e-05, + "loss": 0.2567978143692017, + "step": 30780 + }, + { + "epoch": 0.13218790517159956, + "grad_norm": 0.8037998676300049, + "learning_rate": 8.715581694161069e-05, + "loss": 0.24332842826843262, + "step": 30790 + }, + { + "epoch": 0.13223083726161958, + "grad_norm": 0.4493197202682495, + "learning_rate": 8.715150522149307e-05, + "loss": 0.3514964818954468, + "step": 30800 + }, + { + "epoch": 0.13227376935163956, + "grad_norm": 0.16325241327285767, + "learning_rate": 8.714719350137544e-05, + "loss": 0.40372166633605955, + "step": 30810 + }, + { + "epoch": 0.13231670144165958, + "grad_norm": 0.019779078662395477, + "learning_rate": 8.714288178125782e-05, + "loss": 0.31420106887817384, + "step": 30820 + }, + { + "epoch": 0.1323596335316796, + "grad_norm": 0.44937989115715027, + "learning_rate": 8.713857006114018e-05, + "loss": 0.22593259811401367, + "step": 30830 + }, + { + "epoch": 0.1324025656216996, + "grad_norm": 6.273345947265625, + "learning_rate": 8.713425834102256e-05, + "loss": 0.30472588539123535, + "step": 30840 + }, + { + "epoch": 0.1324454977117196, + "grad_norm": 1.8497966527938843, + "learning_rate": 8.712994662090494e-05, + "loss": 0.3565536022186279, + "step": 30850 + }, + { + "epoch": 0.1324884298017396, + "grad_norm": 0.015776457265019417, + "learning_rate": 8.712563490078733e-05, + "loss": 0.22413876056671142, + "step": 30860 + }, + { + "epoch": 0.13253136189175962, + "grad_norm": 0.12957410514354706, + "learning_rate": 8.712132318066971e-05, + "loss": 0.360329270362854, + "step": 30870 + }, + { + "epoch": 0.1325742939817796, + "grad_norm": 0.01238189171999693, + "learning_rate": 8.711701146055208e-05, + "loss": 0.2678189754486084, + "step": 30880 + }, + { + "epoch": 0.13261722607179963, + "grad_norm": 1.247525930404663, + "learning_rate": 8.711269974043446e-05, + "loss": 0.23769049644470214, + "step": 30890 + }, + { + "epoch": 0.13266015816181964, + "grad_norm": 1.3736680746078491, + "learning_rate": 8.710838802031684e-05, + "loss": 0.2896465539932251, + "step": 30900 + }, + { + "epoch": 0.13270309025183963, + "grad_norm": 0.0654890313744545, + "learning_rate": 8.710407630019922e-05, + "loss": 0.11143224239349366, + "step": 30910 + }, + { + "epoch": 0.13274602234185964, + "grad_norm": 0.011445770040154457, + "learning_rate": 8.709976458008158e-05, + "loss": 0.09457647800445557, + "step": 30920 + }, + { + "epoch": 0.13278895443187966, + "grad_norm": 1.6107486486434937, + "learning_rate": 8.709545285996396e-05, + "loss": 0.4207982063293457, + "step": 30930 + }, + { + "epoch": 0.13283188652189967, + "grad_norm": 1.2543550729751587, + "learning_rate": 8.709114113984633e-05, + "loss": 0.16696133613586425, + "step": 30940 + }, + { + "epoch": 0.13287481861191966, + "grad_norm": 0.011349753476679325, + "learning_rate": 8.708682941972871e-05, + "loss": 0.1631263017654419, + "step": 30950 + }, + { + "epoch": 0.13291775070193967, + "grad_norm": 0.016437632963061333, + "learning_rate": 8.708251769961109e-05, + "loss": 0.14615211486816407, + "step": 30960 + }, + { + "epoch": 0.1329606827919597, + "grad_norm": 0.021030467003583908, + "learning_rate": 8.707820597949347e-05, + "loss": 0.16817245483398438, + "step": 30970 + }, + { + "epoch": 0.13300361488197968, + "grad_norm": 0.1420525759458542, + "learning_rate": 8.707389425937584e-05, + "loss": 0.3803170919418335, + "step": 30980 + }, + { + "epoch": 0.1330465469719997, + "grad_norm": 7.2748894691467285, + "learning_rate": 8.706958253925822e-05, + "loss": 0.27606379985809326, + "step": 30990 + }, + { + "epoch": 0.1330894790620197, + "grad_norm": 3.6429316997528076, + "learning_rate": 8.706527081914059e-05, + "loss": 0.22091338634490967, + "step": 31000 + }, + { + "epoch": 0.1330894790620197, + "eval_loss": 0.5054401159286499, + "eval_runtime": 27.4631, + "eval_samples_per_second": 3.641, + "eval_steps_per_second": 3.641, + "step": 31000 + }, + { + "epoch": 0.1331324111520397, + "grad_norm": 2.900641918182373, + "learning_rate": 8.706095909902296e-05, + "loss": 0.33292906284332274, + "step": 31010 + }, + { + "epoch": 0.1331753432420597, + "grad_norm": 6.009944915771484, + "learning_rate": 8.705664737890534e-05, + "loss": 0.4029203414916992, + "step": 31020 + }, + { + "epoch": 0.13321827533207972, + "grad_norm": 2.333559274673462, + "learning_rate": 8.705233565878772e-05, + "loss": 0.2746156930923462, + "step": 31030 + }, + { + "epoch": 0.1332612074220997, + "grad_norm": 1.5716809034347534, + "learning_rate": 8.70480239386701e-05, + "loss": 0.3722160816192627, + "step": 31040 + }, + { + "epoch": 0.13330413951211972, + "grad_norm": 3.9740467071533203, + "learning_rate": 8.704371221855247e-05, + "loss": 0.3705629348754883, + "step": 31050 + }, + { + "epoch": 0.13334707160213974, + "grad_norm": 0.7232141494750977, + "learning_rate": 8.703940049843485e-05, + "loss": 0.35708882808685305, + "step": 31060 + }, + { + "epoch": 0.13339000369215975, + "grad_norm": 0.0947902724146843, + "learning_rate": 8.703508877831723e-05, + "loss": 0.36917526721954347, + "step": 31070 + }, + { + "epoch": 0.13343293578217974, + "grad_norm": 0.2062128782272339, + "learning_rate": 8.70307770581996e-05, + "loss": 0.219962477684021, + "step": 31080 + }, + { + "epoch": 0.13347586787219975, + "grad_norm": 2.3485822677612305, + "learning_rate": 8.702646533808198e-05, + "loss": 0.5045081615447998, + "step": 31090 + }, + { + "epoch": 0.13351879996221977, + "grad_norm": 2.278395891189575, + "learning_rate": 8.702215361796436e-05, + "loss": 0.333439302444458, + "step": 31100 + }, + { + "epoch": 0.13356173205223976, + "grad_norm": 0.18220630288124084, + "learning_rate": 8.701784189784674e-05, + "loss": 0.12184329032897949, + "step": 31110 + }, + { + "epoch": 0.13360466414225977, + "grad_norm": 19.915332794189453, + "learning_rate": 8.701353017772911e-05, + "loss": 0.2725220680236816, + "step": 31120 + }, + { + "epoch": 0.1336475962322798, + "grad_norm": 4.639437198638916, + "learning_rate": 8.700921845761149e-05, + "loss": 0.44598069190979006, + "step": 31130 + }, + { + "epoch": 0.13369052832229977, + "grad_norm": 0.005592535249888897, + "learning_rate": 8.700490673749387e-05, + "loss": 0.31151866912841797, + "step": 31140 + }, + { + "epoch": 0.1337334604123198, + "grad_norm": 0.07216699421405792, + "learning_rate": 8.700059501737625e-05, + "loss": 0.3183210134506226, + "step": 31150 + }, + { + "epoch": 0.1337763925023398, + "grad_norm": 0.9691863656044006, + "learning_rate": 8.699628329725861e-05, + "loss": 0.6177532196044921, + "step": 31160 + }, + { + "epoch": 0.13381932459235982, + "grad_norm": 0.2667335271835327, + "learning_rate": 8.699197157714099e-05, + "loss": 0.34618918895721434, + "step": 31170 + }, + { + "epoch": 0.1338622566823798, + "grad_norm": 2.0624542236328125, + "learning_rate": 8.698765985702336e-05, + "loss": 0.21124274730682374, + "step": 31180 + }, + { + "epoch": 0.13390518877239982, + "grad_norm": 2.3349645137786865, + "learning_rate": 8.698334813690574e-05, + "loss": 0.3152353286743164, + "step": 31190 + }, + { + "epoch": 0.13394812086241983, + "grad_norm": 0.0925971269607544, + "learning_rate": 8.697903641678812e-05, + "loss": 0.2945190668106079, + "step": 31200 + }, + { + "epoch": 0.13399105295243982, + "grad_norm": 0.03744291141629219, + "learning_rate": 8.69747246966705e-05, + "loss": 0.2564548015594482, + "step": 31210 + }, + { + "epoch": 0.13403398504245984, + "grad_norm": 0.07750023901462555, + "learning_rate": 8.697041297655287e-05, + "loss": 0.24486761093139647, + "step": 31220 + }, + { + "epoch": 0.13407691713247985, + "grad_norm": 0.05882472172379494, + "learning_rate": 8.696610125643525e-05, + "loss": 0.2871314525604248, + "step": 31230 + }, + { + "epoch": 0.13411984922249984, + "grad_norm": 0.05573529750108719, + "learning_rate": 8.696178953631763e-05, + "loss": 0.2048487186431885, + "step": 31240 + }, + { + "epoch": 0.13416278131251985, + "grad_norm": 6.236080646514893, + "learning_rate": 8.695747781619999e-05, + "loss": 0.19539493322372437, + "step": 31250 + }, + { + "epoch": 0.13420571340253987, + "grad_norm": 0.01104716956615448, + "learning_rate": 8.695316609608237e-05, + "loss": 0.2785011053085327, + "step": 31260 + }, + { + "epoch": 0.13424864549255988, + "grad_norm": 2.238067626953125, + "learning_rate": 8.694885437596475e-05, + "loss": 0.5471057891845703, + "step": 31270 + }, + { + "epoch": 0.13429157758257987, + "grad_norm": 2.1154680252075195, + "learning_rate": 8.694454265584712e-05, + "loss": 0.33014640808105467, + "step": 31280 + }, + { + "epoch": 0.13433450967259988, + "grad_norm": 7.169861316680908, + "learning_rate": 8.69402309357295e-05, + "loss": 0.23499386310577391, + "step": 31290 + }, + { + "epoch": 0.1343774417626199, + "grad_norm": 0.7031405568122864, + "learning_rate": 8.693591921561188e-05, + "loss": 0.1952307939529419, + "step": 31300 + }, + { + "epoch": 0.13442037385263989, + "grad_norm": 0.07406271994113922, + "learning_rate": 8.693160749549426e-05, + "loss": 0.3307734489440918, + "step": 31310 + }, + { + "epoch": 0.1344633059426599, + "grad_norm": 0.04302635416388512, + "learning_rate": 8.692729577537663e-05, + "loss": 0.2610581398010254, + "step": 31320 + }, + { + "epoch": 0.13450623803267991, + "grad_norm": 0.966715395450592, + "learning_rate": 8.692298405525901e-05, + "loss": 0.2458191156387329, + "step": 31330 + }, + { + "epoch": 0.1345491701226999, + "grad_norm": 0.2640382945537567, + "learning_rate": 8.691867233514139e-05, + "loss": 0.13566402196884156, + "step": 31340 + }, + { + "epoch": 0.13459210221271992, + "grad_norm": 0.02546941116452217, + "learning_rate": 8.691436061502377e-05, + "loss": 0.1587485432624817, + "step": 31350 + }, + { + "epoch": 0.13463503430273993, + "grad_norm": 0.028399016708135605, + "learning_rate": 8.691004889490614e-05, + "loss": 0.16007002592086791, + "step": 31360 + }, + { + "epoch": 0.13467796639275995, + "grad_norm": 0.26582005620002747, + "learning_rate": 8.690573717478852e-05, + "loss": 0.09273542761802674, + "step": 31370 + }, + { + "epoch": 0.13472089848277993, + "grad_norm": 0.08304465562105179, + "learning_rate": 8.69014254546709e-05, + "loss": 0.12200015783309937, + "step": 31380 + }, + { + "epoch": 0.13476383057279995, + "grad_norm": 0.12915168702602386, + "learning_rate": 8.689711373455327e-05, + "loss": 0.08989648818969727, + "step": 31390 + }, + { + "epoch": 0.13480676266281996, + "grad_norm": 0.0017127407481893897, + "learning_rate": 8.689280201443565e-05, + "loss": 0.16513725519180297, + "step": 31400 + }, + { + "epoch": 0.13484969475283995, + "grad_norm": 0.09435081481933594, + "learning_rate": 8.688849029431802e-05, + "loss": 0.3568329095840454, + "step": 31410 + }, + { + "epoch": 0.13489262684285996, + "grad_norm": 3.7495338916778564, + "learning_rate": 8.688417857420039e-05, + "loss": 0.2168562412261963, + "step": 31420 + }, + { + "epoch": 0.13493555893287998, + "grad_norm": 0.16012614965438843, + "learning_rate": 8.687986685408277e-05, + "loss": 0.3556508541107178, + "step": 31430 + }, + { + "epoch": 0.13497849102289997, + "grad_norm": 0.6222158670425415, + "learning_rate": 8.687555513396515e-05, + "loss": 0.3658663988113403, + "step": 31440 + }, + { + "epoch": 0.13502142311291998, + "grad_norm": 0.12118512392044067, + "learning_rate": 8.687124341384753e-05, + "loss": 0.25071876049041747, + "step": 31450 + }, + { + "epoch": 0.13506435520294, + "grad_norm": 8.053659439086914, + "learning_rate": 8.68669316937299e-05, + "loss": 0.35620319843292236, + "step": 31460 + }, + { + "epoch": 0.13510728729295998, + "grad_norm": 0.005498465616255999, + "learning_rate": 8.686261997361228e-05, + "loss": 0.13012741804122924, + "step": 31470 + }, + { + "epoch": 0.13515021938298, + "grad_norm": 1.4227027893066406, + "learning_rate": 8.685830825349466e-05, + "loss": 0.40222697257995604, + "step": 31480 + }, + { + "epoch": 0.135193151473, + "grad_norm": 18.48629379272461, + "learning_rate": 8.685399653337702e-05, + "loss": 0.294164252281189, + "step": 31490 + }, + { + "epoch": 0.13523608356302003, + "grad_norm": 0.008073339238762856, + "learning_rate": 8.68496848132594e-05, + "loss": 0.4413759231567383, + "step": 31500 + }, + { + "epoch": 0.13527901565304, + "grad_norm": 0.48230382800102234, + "learning_rate": 8.684537309314178e-05, + "loss": 0.28578667640686034, + "step": 31510 + }, + { + "epoch": 0.13532194774306003, + "grad_norm": 3.201517105102539, + "learning_rate": 8.684106137302415e-05, + "loss": 0.1512368679046631, + "step": 31520 + }, + { + "epoch": 0.13536487983308004, + "grad_norm": 15.804793357849121, + "learning_rate": 8.683674965290653e-05, + "loss": 0.19707468748092652, + "step": 31530 + }, + { + "epoch": 0.13540781192310003, + "grad_norm": 0.42527276277542114, + "learning_rate": 8.683243793278891e-05, + "loss": 0.142280113697052, + "step": 31540 + }, + { + "epoch": 0.13545074401312004, + "grad_norm": 0.46108368039131165, + "learning_rate": 8.682812621267128e-05, + "loss": 0.35723319053649905, + "step": 31550 + }, + { + "epoch": 0.13549367610314006, + "grad_norm": 1.848292589187622, + "learning_rate": 8.682381449255366e-05, + "loss": 0.28166794776916504, + "step": 31560 + }, + { + "epoch": 0.13553660819316005, + "grad_norm": 1.9825315475463867, + "learning_rate": 8.681950277243604e-05, + "loss": 0.24795372486114503, + "step": 31570 + }, + { + "epoch": 0.13557954028318006, + "grad_norm": 0.008994282223284245, + "learning_rate": 8.681519105231842e-05, + "loss": 0.10199071168899536, + "step": 31580 + }, + { + "epoch": 0.13562247237320008, + "grad_norm": 0.554589569568634, + "learning_rate": 8.68108793322008e-05, + "loss": 0.18277888298034667, + "step": 31590 + }, + { + "epoch": 0.1356654044632201, + "grad_norm": 1.266161322593689, + "learning_rate": 8.680656761208317e-05, + "loss": 0.213988733291626, + "step": 31600 + }, + { + "epoch": 0.13570833655324008, + "grad_norm": 0.0008511008927598596, + "learning_rate": 8.680225589196555e-05, + "loss": 0.2665504693984985, + "step": 31610 + }, + { + "epoch": 0.1357512686432601, + "grad_norm": 2.124117851257324, + "learning_rate": 8.679794417184793e-05, + "loss": 0.35089409351348877, + "step": 31620 + }, + { + "epoch": 0.1357942007332801, + "grad_norm": 0.08872847259044647, + "learning_rate": 8.67936324517303e-05, + "loss": 0.11770111322402954, + "step": 31630 + }, + { + "epoch": 0.1358371328233001, + "grad_norm": 0.013181965798139572, + "learning_rate": 8.678932073161268e-05, + "loss": 0.1267725944519043, + "step": 31640 + }, + { + "epoch": 0.1358800649133201, + "grad_norm": 0.3185889720916748, + "learning_rate": 8.678500901149506e-05, + "loss": 0.16408768892288209, + "step": 31650 + }, + { + "epoch": 0.13592299700334012, + "grad_norm": 0.0036954637616872787, + "learning_rate": 8.678069729137742e-05, + "loss": 0.2898323774337769, + "step": 31660 + }, + { + "epoch": 0.1359659290933601, + "grad_norm": 0.06999905407428741, + "learning_rate": 8.67763855712598e-05, + "loss": 0.2554055690765381, + "step": 31670 + }, + { + "epoch": 0.13600886118338013, + "grad_norm": 9.497495651245117, + "learning_rate": 8.677207385114218e-05, + "loss": 0.3093088626861572, + "step": 31680 + }, + { + "epoch": 0.13605179327340014, + "grad_norm": 0.4421633183956146, + "learning_rate": 8.676776213102455e-05, + "loss": 0.19062780141830443, + "step": 31690 + }, + { + "epoch": 0.13609472536342015, + "grad_norm": 2.385646104812622, + "learning_rate": 8.676345041090693e-05, + "loss": 0.34704439640045165, + "step": 31700 + }, + { + "epoch": 0.13613765745344014, + "grad_norm": 1.0288870334625244, + "learning_rate": 8.675913869078931e-05, + "loss": 0.2951198101043701, + "step": 31710 + }, + { + "epoch": 0.13618058954346016, + "grad_norm": 0.036432646214962006, + "learning_rate": 8.675482697067169e-05, + "loss": 0.3489818811416626, + "step": 31720 + }, + { + "epoch": 0.13622352163348017, + "grad_norm": 2.0441031455993652, + "learning_rate": 8.675051525055406e-05, + "loss": 0.194374942779541, + "step": 31730 + }, + { + "epoch": 0.13626645372350016, + "grad_norm": 0.02840869128704071, + "learning_rate": 8.674620353043643e-05, + "loss": 0.35605788230895996, + "step": 31740 + }, + { + "epoch": 0.13630938581352017, + "grad_norm": 0.003902270458638668, + "learning_rate": 8.67418918103188e-05, + "loss": 0.12478889226913452, + "step": 31750 + }, + { + "epoch": 0.1363523179035402, + "grad_norm": 1.4010862112045288, + "learning_rate": 8.673758009020118e-05, + "loss": 0.25406508445739745, + "step": 31760 + }, + { + "epoch": 0.13639524999356017, + "grad_norm": 0.030482197180390358, + "learning_rate": 8.673326837008356e-05, + "loss": 0.1824798583984375, + "step": 31770 + }, + { + "epoch": 0.1364381820835802, + "grad_norm": 0.1671326905488968, + "learning_rate": 8.672895664996594e-05, + "loss": 0.19233707189559937, + "step": 31780 + }, + { + "epoch": 0.1364811141736002, + "grad_norm": 0.020653098821640015, + "learning_rate": 8.672464492984831e-05, + "loss": 0.17775663137435913, + "step": 31790 + }, + { + "epoch": 0.13652404626362022, + "grad_norm": 22.064794540405273, + "learning_rate": 8.672033320973069e-05, + "loss": 0.26563799381256104, + "step": 31800 + }, + { + "epoch": 0.1365669783536402, + "grad_norm": 0.03927993029356003, + "learning_rate": 8.671602148961307e-05, + "loss": 0.17628468275070192, + "step": 31810 + }, + { + "epoch": 0.13660991044366022, + "grad_norm": 1.8453829288482666, + "learning_rate": 8.671170976949545e-05, + "loss": 0.1508329391479492, + "step": 31820 + }, + { + "epoch": 0.13665284253368024, + "grad_norm": 0.1535903513431549, + "learning_rate": 8.670739804937782e-05, + "loss": 0.3724507331848145, + "step": 31830 + }, + { + "epoch": 0.13669577462370022, + "grad_norm": 1.699593186378479, + "learning_rate": 8.67030863292602e-05, + "loss": 0.18699527978897096, + "step": 31840 + }, + { + "epoch": 0.13673870671372024, + "grad_norm": 0.051007404923439026, + "learning_rate": 8.669877460914258e-05, + "loss": 0.27728071212768557, + "step": 31850 + }, + { + "epoch": 0.13678163880374025, + "grad_norm": 5.522481441497803, + "learning_rate": 8.669446288902496e-05, + "loss": 0.3032398700714111, + "step": 31860 + }, + { + "epoch": 0.13682457089376024, + "grad_norm": 0.02729124389588833, + "learning_rate": 8.669015116890733e-05, + "loss": 0.1534939408302307, + "step": 31870 + }, + { + "epoch": 0.13686750298378025, + "grad_norm": 2.797128438949585, + "learning_rate": 8.668583944878971e-05, + "loss": 0.3220533847808838, + "step": 31880 + }, + { + "epoch": 0.13691043507380027, + "grad_norm": 1.324243426322937, + "learning_rate": 8.668152772867209e-05, + "loss": 0.17927749156951905, + "step": 31890 + }, + { + "epoch": 0.13695336716382026, + "grad_norm": 0.02539249137043953, + "learning_rate": 8.667721600855445e-05, + "loss": 0.28399274349212644, + "step": 31900 + }, + { + "epoch": 0.13699629925384027, + "grad_norm": 0.3492501378059387, + "learning_rate": 8.667290428843683e-05, + "loss": 0.27191436290740967, + "step": 31910 + }, + { + "epoch": 0.13703923134386028, + "grad_norm": 0.005278902594000101, + "learning_rate": 8.66685925683192e-05, + "loss": 0.16442601680755614, + "step": 31920 + }, + { + "epoch": 0.1370821634338803, + "grad_norm": 0.029702888801693916, + "learning_rate": 8.666428084820158e-05, + "loss": 0.19043039083480834, + "step": 31930 + }, + { + "epoch": 0.1371250955239003, + "grad_norm": 3.249835968017578, + "learning_rate": 8.665996912808396e-05, + "loss": 0.22955794334411622, + "step": 31940 + }, + { + "epoch": 0.1371680276139203, + "grad_norm": 0.15154388546943665, + "learning_rate": 8.665565740796634e-05, + "loss": 0.19120630025863647, + "step": 31950 + }, + { + "epoch": 0.13721095970394032, + "grad_norm": 0.06035231798887253, + "learning_rate": 8.665134568784872e-05, + "loss": 0.3377244234085083, + "step": 31960 + }, + { + "epoch": 0.1372538917939603, + "grad_norm": 0.1840345859527588, + "learning_rate": 8.664703396773109e-05, + "loss": 0.09188364148139953, + "step": 31970 + }, + { + "epoch": 0.13729682388398032, + "grad_norm": 1.6915229558944702, + "learning_rate": 8.664272224761347e-05, + "loss": 0.42070856094360354, + "step": 31980 + }, + { + "epoch": 0.13733975597400033, + "grad_norm": 0.14676789939403534, + "learning_rate": 8.663841052749583e-05, + "loss": 0.06923063993453979, + "step": 31990 + }, + { + "epoch": 0.13738268806402032, + "grad_norm": 0.46177831292152405, + "learning_rate": 8.663409880737821e-05, + "loss": 0.3868566513061523, + "step": 32000 + }, + { + "epoch": 0.13738268806402032, + "eval_loss": 0.48874905705451965, + "eval_runtime": 27.4796, + "eval_samples_per_second": 3.639, + "eval_steps_per_second": 3.639, + "step": 32000 + }, + { + "epoch": 0.13742562015404033, + "grad_norm": 0.011651808395981789, + "learning_rate": 8.662978708726059e-05, + "loss": 0.3492263317108154, + "step": 32010 + }, + { + "epoch": 0.13746855224406035, + "grad_norm": 0.3724205493927002, + "learning_rate": 8.662547536714297e-05, + "loss": 0.23693232536315917, + "step": 32020 + }, + { + "epoch": 0.13751148433408036, + "grad_norm": 1.7988815307617188, + "learning_rate": 8.662116364702534e-05, + "loss": 0.17499951124191285, + "step": 32030 + }, + { + "epoch": 0.13755441642410035, + "grad_norm": 0.036245301365852356, + "learning_rate": 8.661685192690772e-05, + "loss": 0.2796959400177002, + "step": 32040 + }, + { + "epoch": 0.13759734851412037, + "grad_norm": 0.031076082959771156, + "learning_rate": 8.661254020679011e-05, + "loss": 0.3143959045410156, + "step": 32050 + }, + { + "epoch": 0.13764028060414038, + "grad_norm": 0.21341530978679657, + "learning_rate": 8.660822848667249e-05, + "loss": 0.2533440351486206, + "step": 32060 + }, + { + "epoch": 0.13768321269416037, + "grad_norm": 0.3128828704357147, + "learning_rate": 8.660391676655485e-05, + "loss": 0.37330625057220457, + "step": 32070 + }, + { + "epoch": 0.13772614478418038, + "grad_norm": 0.05614300072193146, + "learning_rate": 8.659960504643723e-05, + "loss": 0.1680017113685608, + "step": 32080 + }, + { + "epoch": 0.1377690768742004, + "grad_norm": 0.010020498186349869, + "learning_rate": 8.659529332631961e-05, + "loss": 0.11806988716125488, + "step": 32090 + }, + { + "epoch": 0.13781200896422038, + "grad_norm": 0.06427690386772156, + "learning_rate": 8.659098160620198e-05, + "loss": 0.2253105878829956, + "step": 32100 + }, + { + "epoch": 0.1378549410542404, + "grad_norm": 0.04789144545793533, + "learning_rate": 8.658666988608436e-05, + "loss": 0.10743888616561889, + "step": 32110 + }, + { + "epoch": 0.1378978731442604, + "grad_norm": 0.019688574597239494, + "learning_rate": 8.658235816596674e-05, + "loss": 0.1399633526802063, + "step": 32120 + }, + { + "epoch": 0.13794080523428043, + "grad_norm": 0.16319715976715088, + "learning_rate": 8.657804644584912e-05, + "loss": 0.0964900016784668, + "step": 32130 + }, + { + "epoch": 0.13798373732430042, + "grad_norm": 2.0481417179107666, + "learning_rate": 8.65737347257315e-05, + "loss": 0.35899038314819337, + "step": 32140 + }, + { + "epoch": 0.13802666941432043, + "grad_norm": 0.015374544076621532, + "learning_rate": 8.656942300561386e-05, + "loss": 0.2831136226654053, + "step": 32150 + }, + { + "epoch": 0.13806960150434044, + "grad_norm": 0.05547713488340378, + "learning_rate": 8.656511128549624e-05, + "loss": 0.16382434368133544, + "step": 32160 + }, + { + "epoch": 0.13811253359436043, + "grad_norm": 1.0987155437469482, + "learning_rate": 8.656079956537861e-05, + "loss": 0.3500218391418457, + "step": 32170 + }, + { + "epoch": 0.13815546568438045, + "grad_norm": 4.993858814239502, + "learning_rate": 8.655648784526099e-05, + "loss": 0.35623295307159425, + "step": 32180 + }, + { + "epoch": 0.13819839777440046, + "grad_norm": 2.2642040252685547, + "learning_rate": 8.655217612514337e-05, + "loss": 0.38486814498901367, + "step": 32190 + }, + { + "epoch": 0.13824132986442045, + "grad_norm": 0.10652873665094376, + "learning_rate": 8.654786440502574e-05, + "loss": 0.09749494791030884, + "step": 32200 + }, + { + "epoch": 0.13828426195444046, + "grad_norm": 0.17305311560630798, + "learning_rate": 8.654355268490812e-05, + "loss": 0.22684135437011718, + "step": 32210 + }, + { + "epoch": 0.13832719404446048, + "grad_norm": 0.5436341762542725, + "learning_rate": 8.65392409647905e-05, + "loss": 0.19008939266204833, + "step": 32220 + }, + { + "epoch": 0.1383701261344805, + "grad_norm": 0.03433492034673691, + "learning_rate": 8.653492924467286e-05, + "loss": 0.3322418212890625, + "step": 32230 + }, + { + "epoch": 0.13841305822450048, + "grad_norm": 0.032812800258398056, + "learning_rate": 8.653061752455524e-05, + "loss": 0.37228755950927733, + "step": 32240 + }, + { + "epoch": 0.1384559903145205, + "grad_norm": 0.00702094379812479, + "learning_rate": 8.652630580443762e-05, + "loss": 0.29015960693359377, + "step": 32250 + }, + { + "epoch": 0.1384989224045405, + "grad_norm": 0.9049208760261536, + "learning_rate": 8.652199408432e-05, + "loss": 0.06290289163589477, + "step": 32260 + }, + { + "epoch": 0.1385418544945605, + "grad_norm": 0.31068122386932373, + "learning_rate": 8.651768236420239e-05, + "loss": 0.1787291646003723, + "step": 32270 + }, + { + "epoch": 0.1385847865845805, + "grad_norm": 0.12767857313156128, + "learning_rate": 8.651337064408476e-05, + "loss": 0.2031085252761841, + "step": 32280 + }, + { + "epoch": 0.13862771867460053, + "grad_norm": 1.2406351566314697, + "learning_rate": 8.650905892396714e-05, + "loss": 0.40726299285888673, + "step": 32290 + }, + { + "epoch": 0.1386706507646205, + "grad_norm": 0.18899884819984436, + "learning_rate": 8.650474720384952e-05, + "loss": 0.38270137310028074, + "step": 32300 + }, + { + "epoch": 0.13871358285464053, + "grad_norm": 0.06344349682331085, + "learning_rate": 8.650043548373188e-05, + "loss": 0.23246891498565675, + "step": 32310 + }, + { + "epoch": 0.13875651494466054, + "grad_norm": 1.4559452533721924, + "learning_rate": 8.649612376361426e-05, + "loss": 0.27634477615356445, + "step": 32320 + }, + { + "epoch": 0.13879944703468053, + "grad_norm": 1.1971638202667236, + "learning_rate": 8.649181204349664e-05, + "loss": 0.24708399772644044, + "step": 32330 + }, + { + "epoch": 0.13884237912470054, + "grad_norm": 0.05968537554144859, + "learning_rate": 8.648750032337901e-05, + "loss": 0.2924589872360229, + "step": 32340 + }, + { + "epoch": 0.13888531121472056, + "grad_norm": 0.07050938904285431, + "learning_rate": 8.648318860326139e-05, + "loss": 0.22881875038146973, + "step": 32350 + }, + { + "epoch": 0.13892824330474057, + "grad_norm": 0.013065168634057045, + "learning_rate": 8.647887688314377e-05, + "loss": 0.1992401123046875, + "step": 32360 + }, + { + "epoch": 0.13897117539476056, + "grad_norm": 1.5027996301651, + "learning_rate": 8.647456516302615e-05, + "loss": 0.16590211391448975, + "step": 32370 + }, + { + "epoch": 0.13901410748478057, + "grad_norm": 0.3740582764148712, + "learning_rate": 8.647025344290852e-05, + "loss": 0.34262235164642335, + "step": 32380 + }, + { + "epoch": 0.1390570395748006, + "grad_norm": 0.7905094623565674, + "learning_rate": 8.64659417227909e-05, + "loss": 0.4150557994842529, + "step": 32390 + }, + { + "epoch": 0.13909997166482058, + "grad_norm": 0.06105173006653786, + "learning_rate": 8.646163000267326e-05, + "loss": 0.19480823278427123, + "step": 32400 + }, + { + "epoch": 0.1391429037548406, + "grad_norm": 0.008457164280116558, + "learning_rate": 8.645731828255564e-05, + "loss": 0.26109282970428466, + "step": 32410 + }, + { + "epoch": 0.1391858358448606, + "grad_norm": 1.475132942199707, + "learning_rate": 8.645300656243802e-05, + "loss": 0.5467658042907715, + "step": 32420 + }, + { + "epoch": 0.1392287679348806, + "grad_norm": 0.09985917806625366, + "learning_rate": 8.64486948423204e-05, + "loss": 0.30468990802764895, + "step": 32430 + }, + { + "epoch": 0.1392717000249006, + "grad_norm": 0.09741347283124924, + "learning_rate": 8.644438312220277e-05, + "loss": 0.15210098028182983, + "step": 32440 + }, + { + "epoch": 0.13931463211492062, + "grad_norm": 2.485933780670166, + "learning_rate": 8.644007140208515e-05, + "loss": 0.22832133769989013, + "step": 32450 + }, + { + "epoch": 0.13935756420494064, + "grad_norm": 0.9260600209236145, + "learning_rate": 8.643575968196753e-05, + "loss": 0.15713369846343994, + "step": 32460 + }, + { + "epoch": 0.13940049629496062, + "grad_norm": 0.0251544788479805, + "learning_rate": 8.64314479618499e-05, + "loss": 0.15889840126037597, + "step": 32470 + }, + { + "epoch": 0.13944342838498064, + "grad_norm": 0.8846043944358826, + "learning_rate": 8.642713624173227e-05, + "loss": 0.30323312282562254, + "step": 32480 + }, + { + "epoch": 0.13948636047500065, + "grad_norm": 1.401147484779358, + "learning_rate": 8.642282452161466e-05, + "loss": 0.4211751461029053, + "step": 32490 + }, + { + "epoch": 0.13952929256502064, + "grad_norm": 4.154013156890869, + "learning_rate": 8.641851280149704e-05, + "loss": 0.28515145778656004, + "step": 32500 + }, + { + "epoch": 0.13957222465504066, + "grad_norm": 0.0072095938958227634, + "learning_rate": 8.641420108137941e-05, + "loss": 0.07046842575073242, + "step": 32510 + }, + { + "epoch": 0.13961515674506067, + "grad_norm": 0.10458221286535263, + "learning_rate": 8.640988936126179e-05, + "loss": 0.3335193872451782, + "step": 32520 + }, + { + "epoch": 0.13965808883508066, + "grad_norm": 0.07076560705900192, + "learning_rate": 8.640557764114417e-05, + "loss": 0.28255205154418944, + "step": 32530 + }, + { + "epoch": 0.13970102092510067, + "grad_norm": 0.05078651383519173, + "learning_rate": 8.640126592102655e-05, + "loss": 0.09788010120391846, + "step": 32540 + }, + { + "epoch": 0.1397439530151207, + "grad_norm": 3.8852310180664062, + "learning_rate": 8.639695420090892e-05, + "loss": 0.5485908508300781, + "step": 32550 + }, + { + "epoch": 0.1397868851051407, + "grad_norm": 0.8929144144058228, + "learning_rate": 8.639264248079129e-05, + "loss": 0.15849707126617432, + "step": 32560 + }, + { + "epoch": 0.1398298171951607, + "grad_norm": 2.173306465148926, + "learning_rate": 8.638833076067367e-05, + "loss": 0.21721320152282714, + "step": 32570 + }, + { + "epoch": 0.1398727492851807, + "grad_norm": 0.19807979464530945, + "learning_rate": 8.638401904055604e-05, + "loss": 0.10829294919967651, + "step": 32580 + }, + { + "epoch": 0.13991568137520072, + "grad_norm": 0.1245853528380394, + "learning_rate": 8.637970732043842e-05, + "loss": 0.2582393169403076, + "step": 32590 + }, + { + "epoch": 0.1399586134652207, + "grad_norm": 1.730551838874817, + "learning_rate": 8.63753956003208e-05, + "loss": 0.34633893966674806, + "step": 32600 + }, + { + "epoch": 0.14000154555524072, + "grad_norm": 2.3339028358459473, + "learning_rate": 8.637108388020317e-05, + "loss": 0.41213231086730956, + "step": 32610 + }, + { + "epoch": 0.14004447764526073, + "grad_norm": 4.648174285888672, + "learning_rate": 8.636677216008555e-05, + "loss": 0.1871417284011841, + "step": 32620 + }, + { + "epoch": 0.14008740973528072, + "grad_norm": 0.6128503084182739, + "learning_rate": 8.636246043996793e-05, + "loss": 0.0874154508113861, + "step": 32630 + }, + { + "epoch": 0.14013034182530074, + "grad_norm": 0.02867172844707966, + "learning_rate": 8.63581487198503e-05, + "loss": 0.2719635009765625, + "step": 32640 + }, + { + "epoch": 0.14017327391532075, + "grad_norm": 0.9320842027664185, + "learning_rate": 8.635383699973267e-05, + "loss": 0.040377697348594664, + "step": 32650 + }, + { + "epoch": 0.14021620600534077, + "grad_norm": 0.039206285029649734, + "learning_rate": 8.634952527961505e-05, + "loss": 0.09994704723358154, + "step": 32660 + }, + { + "epoch": 0.14025913809536075, + "grad_norm": 2.78364896774292, + "learning_rate": 8.634521355949743e-05, + "loss": 0.432407283782959, + "step": 32670 + }, + { + "epoch": 0.14030207018538077, + "grad_norm": 0.29419511556625366, + "learning_rate": 8.63409018393798e-05, + "loss": 0.30876913070678713, + "step": 32680 + }, + { + "epoch": 0.14034500227540078, + "grad_norm": 5.2350850105285645, + "learning_rate": 8.633659011926218e-05, + "loss": 0.18348314762115478, + "step": 32690 + }, + { + "epoch": 0.14038793436542077, + "grad_norm": 0.013051588088274002, + "learning_rate": 8.633227839914456e-05, + "loss": 0.0500313937664032, + "step": 32700 + }, + { + "epoch": 0.14043086645544078, + "grad_norm": 1.8791477680206299, + "learning_rate": 8.632796667902693e-05, + "loss": 0.16241765022277832, + "step": 32710 + }, + { + "epoch": 0.1404737985454608, + "grad_norm": 0.018941722810268402, + "learning_rate": 8.632365495890931e-05, + "loss": 0.2940650701522827, + "step": 32720 + }, + { + "epoch": 0.14051673063548079, + "grad_norm": 0.2370922714471817, + "learning_rate": 8.631934323879169e-05, + "loss": 0.15933622121810914, + "step": 32730 + }, + { + "epoch": 0.1405596627255008, + "grad_norm": 0.06801789999008179, + "learning_rate": 8.631503151867407e-05, + "loss": 0.3181285858154297, + "step": 32740 + }, + { + "epoch": 0.14060259481552081, + "grad_norm": 4.759559631347656, + "learning_rate": 8.631071979855644e-05, + "loss": 0.4246529102325439, + "step": 32750 + }, + { + "epoch": 0.1406455269055408, + "grad_norm": 0.021498646587133408, + "learning_rate": 8.630640807843882e-05, + "loss": 0.09310616850852967, + "step": 32760 + }, + { + "epoch": 0.14068845899556082, + "grad_norm": 0.009470085613429546, + "learning_rate": 8.63020963583212e-05, + "loss": 0.15859633684158325, + "step": 32770 + }, + { + "epoch": 0.14073139108558083, + "grad_norm": 0.5190895795822144, + "learning_rate": 8.629778463820358e-05, + "loss": 0.32474589347839355, + "step": 32780 + }, + { + "epoch": 0.14077432317560085, + "grad_norm": 0.008953122422099113, + "learning_rate": 8.629347291808595e-05, + "loss": 0.2130033016204834, + "step": 32790 + }, + { + "epoch": 0.14081725526562083, + "grad_norm": 0.12810958921909332, + "learning_rate": 8.628916119796833e-05, + "loss": 0.14106868505477904, + "step": 32800 + }, + { + "epoch": 0.14086018735564085, + "grad_norm": 42.244422912597656, + "learning_rate": 8.62848494778507e-05, + "loss": 0.2511881351470947, + "step": 32810 + }, + { + "epoch": 0.14090311944566086, + "grad_norm": 0.08070328831672668, + "learning_rate": 8.628053775773307e-05, + "loss": 0.003995791077613831, + "step": 32820 + }, + { + "epoch": 0.14094605153568085, + "grad_norm": 0.0576271191239357, + "learning_rate": 8.627622603761545e-05, + "loss": 0.1484993577003479, + "step": 32830 + }, + { + "epoch": 0.14098898362570086, + "grad_norm": 2.7077083587646484, + "learning_rate": 8.627191431749783e-05, + "loss": 0.31101303100585936, + "step": 32840 + }, + { + "epoch": 0.14103191571572088, + "grad_norm": 0.6947262287139893, + "learning_rate": 8.62676025973802e-05, + "loss": 0.3318142890930176, + "step": 32850 + }, + { + "epoch": 0.14107484780574087, + "grad_norm": 2.176297187805176, + "learning_rate": 8.626329087726258e-05, + "loss": 0.22213470935821533, + "step": 32860 + }, + { + "epoch": 0.14111777989576088, + "grad_norm": 2.0945627689361572, + "learning_rate": 8.625897915714496e-05, + "loss": 0.08620821237564087, + "step": 32870 + }, + { + "epoch": 0.1411607119857809, + "grad_norm": 0.004814998712390661, + "learning_rate": 8.625466743702734e-05, + "loss": 0.13406879901885987, + "step": 32880 + }, + { + "epoch": 0.1412036440758009, + "grad_norm": 0.11976204067468643, + "learning_rate": 8.62503557169097e-05, + "loss": 0.15420854091644287, + "step": 32890 + }, + { + "epoch": 0.1412465761658209, + "grad_norm": 0.0031996588222682476, + "learning_rate": 8.624604399679208e-05, + "loss": 0.3452379941940308, + "step": 32900 + }, + { + "epoch": 0.1412895082558409, + "grad_norm": 0.20073994994163513, + "learning_rate": 8.624173227667445e-05, + "loss": 0.12277450561523437, + "step": 32910 + }, + { + "epoch": 0.14133244034586093, + "grad_norm": 2.7127232551574707, + "learning_rate": 8.623742055655683e-05, + "loss": 0.21957502365112305, + "step": 32920 + }, + { + "epoch": 0.1413753724358809, + "grad_norm": 0.026280561462044716, + "learning_rate": 8.623310883643921e-05, + "loss": 0.29603774547576905, + "step": 32930 + }, + { + "epoch": 0.14141830452590093, + "grad_norm": 0.009255579672753811, + "learning_rate": 8.622879711632159e-05, + "loss": 0.18040038347244264, + "step": 32940 + }, + { + "epoch": 0.14146123661592094, + "grad_norm": 0.021447597071528435, + "learning_rate": 8.622448539620396e-05, + "loss": 0.20096633434295655, + "step": 32950 + }, + { + "epoch": 0.14150416870594093, + "grad_norm": 0.15856556594371796, + "learning_rate": 8.622017367608634e-05, + "loss": 0.21111953258514404, + "step": 32960 + }, + { + "epoch": 0.14154710079596095, + "grad_norm": 0.9573673605918884, + "learning_rate": 8.621586195596872e-05, + "loss": 0.31001458168029783, + "step": 32970 + }, + { + "epoch": 0.14159003288598096, + "grad_norm": 0.006918746512383223, + "learning_rate": 8.62115502358511e-05, + "loss": 0.12518359422683717, + "step": 32980 + }, + { + "epoch": 0.14163296497600097, + "grad_norm": 0.004179741255939007, + "learning_rate": 8.620723851573347e-05, + "loss": 0.19539453983306884, + "step": 32990 + }, + { + "epoch": 0.14167589706602096, + "grad_norm": 0.005447585601359606, + "learning_rate": 8.620292679561585e-05, + "loss": 0.2963005542755127, + "step": 33000 + }, + { + "epoch": 0.14167589706602096, + "eval_loss": 0.47957244515419006, + "eval_runtime": 27.4369, + "eval_samples_per_second": 3.645, + "eval_steps_per_second": 3.645, + "step": 33000 + }, + { + "epoch": 0.14171882915604098, + "grad_norm": 1.1165666580200195, + "learning_rate": 8.619861507549823e-05, + "loss": 0.2317833423614502, + "step": 33010 + }, + { + "epoch": 0.141761761246061, + "grad_norm": 0.40649619698524475, + "learning_rate": 8.61943033553806e-05, + "loss": 0.19553804397583008, + "step": 33020 + }, + { + "epoch": 0.14180469333608098, + "grad_norm": 1.847734808921814, + "learning_rate": 8.618999163526298e-05, + "loss": 0.3180288314819336, + "step": 33030 + }, + { + "epoch": 0.141847625426101, + "grad_norm": 0.02948109805583954, + "learning_rate": 8.618567991514536e-05, + "loss": 0.24348065853118897, + "step": 33040 + }, + { + "epoch": 0.141890557516121, + "grad_norm": 0.050934720784425735, + "learning_rate": 8.618136819502772e-05, + "loss": 0.11437109708786011, + "step": 33050 + }, + { + "epoch": 0.141933489606141, + "grad_norm": 1.0916680097579956, + "learning_rate": 8.61770564749101e-05, + "loss": 0.3583144903182983, + "step": 33060 + }, + { + "epoch": 0.141976421696161, + "grad_norm": 4.1211323738098145, + "learning_rate": 8.617274475479248e-05, + "loss": 0.2798927307128906, + "step": 33070 + }, + { + "epoch": 0.14201935378618102, + "grad_norm": 1.4547139406204224, + "learning_rate": 8.616843303467486e-05, + "loss": 0.3336412668228149, + "step": 33080 + }, + { + "epoch": 0.14206228587620104, + "grad_norm": 0.12225791811943054, + "learning_rate": 8.616412131455723e-05, + "loss": 0.20298798084259034, + "step": 33090 + }, + { + "epoch": 0.14210521796622103, + "grad_norm": 0.34304502606391907, + "learning_rate": 8.615980959443961e-05, + "loss": 0.24174034595489502, + "step": 33100 + }, + { + "epoch": 0.14214815005624104, + "grad_norm": 1.5324758291244507, + "learning_rate": 8.615549787432199e-05, + "loss": 0.08523032665252686, + "step": 33110 + }, + { + "epoch": 0.14219108214626106, + "grad_norm": 1.3764361143112183, + "learning_rate": 8.615118615420437e-05, + "loss": 0.22169878482818603, + "step": 33120 + }, + { + "epoch": 0.14223401423628104, + "grad_norm": 1.85866379737854, + "learning_rate": 8.614687443408674e-05, + "loss": 0.4486417770385742, + "step": 33130 + }, + { + "epoch": 0.14227694632630106, + "grad_norm": 0.02785349264740944, + "learning_rate": 8.61425627139691e-05, + "loss": 0.28274946212768554, + "step": 33140 + }, + { + "epoch": 0.14231987841632107, + "grad_norm": 0.09572744369506836, + "learning_rate": 8.613825099385148e-05, + "loss": 0.3118321180343628, + "step": 33150 + }, + { + "epoch": 0.14236281050634106, + "grad_norm": 0.05197792127728462, + "learning_rate": 8.613393927373386e-05, + "loss": 0.15959925651550294, + "step": 33160 + }, + { + "epoch": 0.14240574259636107, + "grad_norm": 11.495828628540039, + "learning_rate": 8.612962755361624e-05, + "loss": 0.4407163143157959, + "step": 33170 + }, + { + "epoch": 0.1424486746863811, + "grad_norm": 0.09204380214214325, + "learning_rate": 8.612531583349862e-05, + "loss": 0.2962942600250244, + "step": 33180 + }, + { + "epoch": 0.14249160677640108, + "grad_norm": 0.0430777408182621, + "learning_rate": 8.612100411338099e-05, + "loss": 0.33332481384277346, + "step": 33190 + }, + { + "epoch": 0.1425345388664211, + "grad_norm": 0.018142318353056908, + "learning_rate": 8.611669239326337e-05, + "loss": 0.3872080326080322, + "step": 33200 + }, + { + "epoch": 0.1425774709564411, + "grad_norm": 0.29330870509147644, + "learning_rate": 8.611238067314575e-05, + "loss": 0.23970427513122558, + "step": 33210 + }, + { + "epoch": 0.14262040304646112, + "grad_norm": 1.2924166917800903, + "learning_rate": 8.610806895302813e-05, + "loss": 0.24695563316345215, + "step": 33220 + }, + { + "epoch": 0.1426633351364811, + "grad_norm": 1.2489582300186157, + "learning_rate": 8.61037572329105e-05, + "loss": 0.16087877750396729, + "step": 33230 + }, + { + "epoch": 0.14270626722650112, + "grad_norm": 1.7169822454452515, + "learning_rate": 8.609944551279288e-05, + "loss": 0.24377684593200682, + "step": 33240 + }, + { + "epoch": 0.14274919931652114, + "grad_norm": 1.4972940683364868, + "learning_rate": 8.609513379267526e-05, + "loss": 0.2680682182312012, + "step": 33250 + }, + { + "epoch": 0.14279213140654112, + "grad_norm": 0.9008892178535461, + "learning_rate": 8.609082207255763e-05, + "loss": 0.11140121221542358, + "step": 33260 + }, + { + "epoch": 0.14283506349656114, + "grad_norm": 0.009071459993720055, + "learning_rate": 8.608651035244001e-05, + "loss": 0.12064872980117798, + "step": 33270 + }, + { + "epoch": 0.14287799558658115, + "grad_norm": 0.0024141615722328424, + "learning_rate": 8.608219863232239e-05, + "loss": 0.19180315732955933, + "step": 33280 + }, + { + "epoch": 0.14292092767660114, + "grad_norm": 0.255353718996048, + "learning_rate": 8.607788691220477e-05, + "loss": 0.1809109091758728, + "step": 33290 + }, + { + "epoch": 0.14296385976662115, + "grad_norm": 3.5779504776000977, + "learning_rate": 8.607357519208713e-05, + "loss": 0.18246450424194335, + "step": 33300 + }, + { + "epoch": 0.14300679185664117, + "grad_norm": 1.472702145576477, + "learning_rate": 8.606926347196951e-05, + "loss": 0.2409532070159912, + "step": 33310 + }, + { + "epoch": 0.14304972394666118, + "grad_norm": 2.000717878341675, + "learning_rate": 8.606495175185188e-05, + "loss": 0.20780344009399415, + "step": 33320 + }, + { + "epoch": 0.14309265603668117, + "grad_norm": 3.4083595275878906, + "learning_rate": 8.606064003173426e-05, + "loss": 0.26924920082092285, + "step": 33330 + }, + { + "epoch": 0.14313558812670119, + "grad_norm": 0.6678909063339233, + "learning_rate": 8.605632831161664e-05, + "loss": 0.015250737965106963, + "step": 33340 + }, + { + "epoch": 0.1431785202167212, + "grad_norm": 1.3295899629592896, + "learning_rate": 8.605201659149902e-05, + "loss": 0.2518535852432251, + "step": 33350 + }, + { + "epoch": 0.1432214523067412, + "grad_norm": 0.013177858665585518, + "learning_rate": 8.60477048713814e-05, + "loss": 0.041883692145347595, + "step": 33360 + }, + { + "epoch": 0.1432643843967612, + "grad_norm": 0.0020102905109524727, + "learning_rate": 8.604339315126377e-05, + "loss": 0.19797074794769287, + "step": 33370 + }, + { + "epoch": 0.14330731648678122, + "grad_norm": 0.26644909381866455, + "learning_rate": 8.603908143114614e-05, + "loss": 0.27244253158569337, + "step": 33380 + }, + { + "epoch": 0.1433502485768012, + "grad_norm": 0.0014582815347239375, + "learning_rate": 8.603476971102851e-05, + "loss": 0.174778950214386, + "step": 33390 + }, + { + "epoch": 0.14339318066682122, + "grad_norm": 1.2381072044372559, + "learning_rate": 8.603045799091089e-05, + "loss": 0.42563595771789553, + "step": 33400 + }, + { + "epoch": 0.14343611275684123, + "grad_norm": 0.0005544184823520482, + "learning_rate": 8.602614627079327e-05, + "loss": 0.2671182632446289, + "step": 33410 + }, + { + "epoch": 0.14347904484686125, + "grad_norm": 3.252762794494629, + "learning_rate": 8.602183455067564e-05, + "loss": 0.47478280067443845, + "step": 33420 + }, + { + "epoch": 0.14352197693688123, + "grad_norm": 0.0010757598793134093, + "learning_rate": 8.601752283055802e-05, + "loss": 0.35443868637084963, + "step": 33430 + }, + { + "epoch": 0.14356490902690125, + "grad_norm": 0.7422657012939453, + "learning_rate": 8.60132111104404e-05, + "loss": 0.295544958114624, + "step": 33440 + }, + { + "epoch": 0.14360784111692126, + "grad_norm": 0.08659780770540237, + "learning_rate": 8.600889939032278e-05, + "loss": 0.26724236011505126, + "step": 33450 + }, + { + "epoch": 0.14365077320694125, + "grad_norm": 0.07881903648376465, + "learning_rate": 8.600458767020517e-05, + "loss": 0.09010640978813171, + "step": 33460 + }, + { + "epoch": 0.14369370529696127, + "grad_norm": 0.7163306474685669, + "learning_rate": 8.600027595008753e-05, + "loss": 0.2510689258575439, + "step": 33470 + }, + { + "epoch": 0.14373663738698128, + "grad_norm": 0.031614236533641815, + "learning_rate": 8.599596422996991e-05, + "loss": 0.1949402093887329, + "step": 33480 + }, + { + "epoch": 0.14377956947700127, + "grad_norm": 3.6333954334259033, + "learning_rate": 8.599165250985229e-05, + "loss": 0.3583311796188354, + "step": 33490 + }, + { + "epoch": 0.14382250156702128, + "grad_norm": 0.11175089329481125, + "learning_rate": 8.598734078973466e-05, + "loss": 0.12634716033935547, + "step": 33500 + }, + { + "epoch": 0.1438654336570413, + "grad_norm": 2.5286669731140137, + "learning_rate": 8.598302906961704e-05, + "loss": 0.43870835304260253, + "step": 33510 + }, + { + "epoch": 0.1439083657470613, + "grad_norm": 0.5033771395683289, + "learning_rate": 8.597871734949942e-05, + "loss": 0.2515478849411011, + "step": 33520 + }, + { + "epoch": 0.1439512978370813, + "grad_norm": 0.0760180726647377, + "learning_rate": 8.59744056293818e-05, + "loss": 0.22663774490356445, + "step": 33530 + }, + { + "epoch": 0.1439942299271013, + "grad_norm": 4.9512810707092285, + "learning_rate": 8.597009390926417e-05, + "loss": 0.27660746574401857, + "step": 33540 + }, + { + "epoch": 0.14403716201712133, + "grad_norm": 6.3641839027404785, + "learning_rate": 8.596578218914654e-05, + "loss": 0.22129251956939697, + "step": 33550 + }, + { + "epoch": 0.14408009410714132, + "grad_norm": 0.016861692070961, + "learning_rate": 8.596147046902891e-05, + "loss": 0.2914477586746216, + "step": 33560 + }, + { + "epoch": 0.14412302619716133, + "grad_norm": 0.005491959396749735, + "learning_rate": 8.595715874891129e-05, + "loss": 0.20667738914489747, + "step": 33570 + }, + { + "epoch": 0.14416595828718134, + "grad_norm": 0.02153095416724682, + "learning_rate": 8.595284702879367e-05, + "loss": 0.3282850503921509, + "step": 33580 + }, + { + "epoch": 0.14420889037720133, + "grad_norm": 0.1113702580332756, + "learning_rate": 8.594853530867605e-05, + "loss": 0.22480969429016112, + "step": 33590 + }, + { + "epoch": 0.14425182246722135, + "grad_norm": 0.01539696753025055, + "learning_rate": 8.594422358855842e-05, + "loss": 0.060040348768234254, + "step": 33600 + }, + { + "epoch": 0.14429475455724136, + "grad_norm": 0.11859507858753204, + "learning_rate": 8.59399118684408e-05, + "loss": 0.2394228458404541, + "step": 33610 + }, + { + "epoch": 0.14433768664726135, + "grad_norm": 0.110122911632061, + "learning_rate": 8.593560014832318e-05, + "loss": 0.18123446702957152, + "step": 33620 + }, + { + "epoch": 0.14438061873728136, + "grad_norm": 6.010119438171387, + "learning_rate": 8.593128842820554e-05, + "loss": 0.5520340919494628, + "step": 33630 + }, + { + "epoch": 0.14442355082730138, + "grad_norm": 2.263671398162842, + "learning_rate": 8.592697670808792e-05, + "loss": 0.19788291454315185, + "step": 33640 + }, + { + "epoch": 0.1444664829173214, + "grad_norm": 1.149238109588623, + "learning_rate": 8.59226649879703e-05, + "loss": 0.11483936309814453, + "step": 33650 + }, + { + "epoch": 0.14450941500734138, + "grad_norm": 0.1901119351387024, + "learning_rate": 8.591835326785267e-05, + "loss": 0.14860138893127442, + "step": 33660 + }, + { + "epoch": 0.1445523470973614, + "grad_norm": 1.1363589763641357, + "learning_rate": 8.591404154773505e-05, + "loss": 0.21624512672424318, + "step": 33670 + }, + { + "epoch": 0.1445952791873814, + "grad_norm": 0.7980696558952332, + "learning_rate": 8.590972982761744e-05, + "loss": 0.046781697869300844, + "step": 33680 + }, + { + "epoch": 0.1446382112774014, + "grad_norm": 2.2780401706695557, + "learning_rate": 8.590541810749982e-05, + "loss": 0.10103254318237305, + "step": 33690 + }, + { + "epoch": 0.1446811433674214, + "grad_norm": 0.20395316183567047, + "learning_rate": 8.59011063873822e-05, + "loss": 0.401483154296875, + "step": 33700 + }, + { + "epoch": 0.14472407545744143, + "grad_norm": 0.1297791302204132, + "learning_rate": 8.589679466726456e-05, + "loss": 0.15389554500579833, + "step": 33710 + }, + { + "epoch": 0.1447670075474614, + "grad_norm": 0.9008198976516724, + "learning_rate": 8.589248294714694e-05, + "loss": 0.27394144535064696, + "step": 33720 + }, + { + "epoch": 0.14480993963748143, + "grad_norm": 7.308492183685303, + "learning_rate": 8.588817122702932e-05, + "loss": 0.39116473197937013, + "step": 33730 + }, + { + "epoch": 0.14485287172750144, + "grad_norm": 0.006610206328332424, + "learning_rate": 8.588385950691169e-05, + "loss": 0.47578039169311526, + "step": 33740 + }, + { + "epoch": 0.14489580381752146, + "grad_norm": 0.020005585625767708, + "learning_rate": 8.587954778679407e-05, + "loss": 0.10659658908843994, + "step": 33750 + }, + { + "epoch": 0.14493873590754144, + "grad_norm": 0.3955053687095642, + "learning_rate": 8.587523606667645e-05, + "loss": 0.052762389183044434, + "step": 33760 + }, + { + "epoch": 0.14498166799756146, + "grad_norm": 0.07789677381515503, + "learning_rate": 8.587092434655882e-05, + "loss": 0.17416226863861084, + "step": 33770 + }, + { + "epoch": 0.14502460008758147, + "grad_norm": 1.851404070854187, + "learning_rate": 8.58666126264412e-05, + "loss": 0.1859425663948059, + "step": 33780 + }, + { + "epoch": 0.14506753217760146, + "grad_norm": 1.213042974472046, + "learning_rate": 8.586230090632358e-05, + "loss": 0.4020374774932861, + "step": 33790 + }, + { + "epoch": 0.14511046426762148, + "grad_norm": 0.8941920399665833, + "learning_rate": 8.585798918620594e-05, + "loss": 0.27128329277038576, + "step": 33800 + }, + { + "epoch": 0.1451533963576415, + "grad_norm": 3.843531370162964, + "learning_rate": 8.585367746608832e-05, + "loss": 0.0402255117893219, + "step": 33810 + }, + { + "epoch": 0.14519632844766148, + "grad_norm": 0.19719567894935608, + "learning_rate": 8.58493657459707e-05, + "loss": 0.3777024269104004, + "step": 33820 + }, + { + "epoch": 0.1452392605376815, + "grad_norm": 2.3761401176452637, + "learning_rate": 8.584505402585308e-05, + "loss": 0.2160428524017334, + "step": 33830 + }, + { + "epoch": 0.1452821926277015, + "grad_norm": 0.03809600695967674, + "learning_rate": 8.584074230573545e-05, + "loss": 0.2593549251556396, + "step": 33840 + }, + { + "epoch": 0.14532512471772152, + "grad_norm": 0.027907975018024445, + "learning_rate": 8.583643058561783e-05, + "loss": 0.343958306312561, + "step": 33850 + }, + { + "epoch": 0.1453680568077415, + "grad_norm": 0.02713572420179844, + "learning_rate": 8.583211886550021e-05, + "loss": 0.40753722190856934, + "step": 33860 + }, + { + "epoch": 0.14541098889776152, + "grad_norm": 3.2029054164886475, + "learning_rate": 8.582780714538258e-05, + "loss": 0.42441816329956056, + "step": 33870 + }, + { + "epoch": 0.14545392098778154, + "grad_norm": 0.043370820581912994, + "learning_rate": 8.582349542526495e-05, + "loss": 0.0937444269657135, + "step": 33880 + }, + { + "epoch": 0.14549685307780152, + "grad_norm": 0.2185685634613037, + "learning_rate": 8.581918370514733e-05, + "loss": 0.0842089295387268, + "step": 33890 + }, + { + "epoch": 0.14553978516782154, + "grad_norm": 1.985492467880249, + "learning_rate": 8.581487198502972e-05, + "loss": 0.16136040687561035, + "step": 33900 + }, + { + "epoch": 0.14558271725784155, + "grad_norm": 0.01867522858083248, + "learning_rate": 8.58105602649121e-05, + "loss": 0.2270805597305298, + "step": 33910 + }, + { + "epoch": 0.14562564934786154, + "grad_norm": 0.01364646665751934, + "learning_rate": 8.580624854479447e-05, + "loss": 0.1863769292831421, + "step": 33920 + }, + { + "epoch": 0.14566858143788156, + "grad_norm": 0.00913103949278593, + "learning_rate": 8.580193682467685e-05, + "loss": 0.44718332290649415, + "step": 33930 + }, + { + "epoch": 0.14571151352790157, + "grad_norm": 0.17968043684959412, + "learning_rate": 8.579762510455923e-05, + "loss": 0.20691509246826173, + "step": 33940 + }, + { + "epoch": 0.14575444561792159, + "grad_norm": 3.070237874984741, + "learning_rate": 8.57933133844416e-05, + "loss": 0.13816176652908324, + "step": 33950 + }, + { + "epoch": 0.14579737770794157, + "grad_norm": 1.2599154710769653, + "learning_rate": 8.578900166432397e-05, + "loss": 0.4113800525665283, + "step": 33960 + }, + { + "epoch": 0.1458403097979616, + "grad_norm": 0.007566008251160383, + "learning_rate": 8.578468994420634e-05, + "loss": 0.28733956813812256, + "step": 33970 + }, + { + "epoch": 0.1458832418879816, + "grad_norm": 0.15226389467716217, + "learning_rate": 8.578037822408872e-05, + "loss": 0.22472400665283204, + "step": 33980 + }, + { + "epoch": 0.1459261739780016, + "grad_norm": 1.13027822971344, + "learning_rate": 8.57760665039711e-05, + "loss": 0.14060845375061035, + "step": 33990 + }, + { + "epoch": 0.1459691060680216, + "grad_norm": 0.20782363414764404, + "learning_rate": 8.577175478385348e-05, + "loss": 0.2679033041000366, + "step": 34000 + }, + { + "epoch": 0.1459691060680216, + "eval_loss": 0.4690183997154236, + "eval_runtime": 27.4503, + "eval_samples_per_second": 3.643, + "eval_steps_per_second": 3.643, + "step": 34000 + }, + { + "epoch": 0.14601203815804162, + "grad_norm": 0.19579826295375824, + "learning_rate": 8.576744306373585e-05, + "loss": 0.009440401196479797, + "step": 34010 + }, + { + "epoch": 0.1460549702480616, + "grad_norm": 0.06560419499874115, + "learning_rate": 8.576313134361823e-05, + "loss": 0.15190430879592895, + "step": 34020 + }, + { + "epoch": 0.14609790233808162, + "grad_norm": 0.04176841303706169, + "learning_rate": 8.575881962350061e-05, + "loss": 0.3664478063583374, + "step": 34030 + }, + { + "epoch": 0.14614083442810163, + "grad_norm": 0.13518881797790527, + "learning_rate": 8.575450790338297e-05, + "loss": 0.3177423715591431, + "step": 34040 + }, + { + "epoch": 0.14618376651812162, + "grad_norm": 0.00969105027616024, + "learning_rate": 8.575019618326535e-05, + "loss": 0.16859444379806518, + "step": 34050 + }, + { + "epoch": 0.14622669860814164, + "grad_norm": 0.030884014442563057, + "learning_rate": 8.574588446314773e-05, + "loss": 0.2813561916351318, + "step": 34060 + }, + { + "epoch": 0.14626963069816165, + "grad_norm": 0.008490712381899357, + "learning_rate": 8.57415727430301e-05, + "loss": 0.3318132162094116, + "step": 34070 + }, + { + "epoch": 0.14631256278818167, + "grad_norm": 1.6599934101104736, + "learning_rate": 8.573726102291248e-05, + "loss": 0.3509896039962769, + "step": 34080 + }, + { + "epoch": 0.14635549487820165, + "grad_norm": 2.326103687286377, + "learning_rate": 8.573294930279486e-05, + "loss": 0.3958756923675537, + "step": 34090 + }, + { + "epoch": 0.14639842696822167, + "grad_norm": 0.005022017750889063, + "learning_rate": 8.572863758267724e-05, + "loss": 0.2757140874862671, + "step": 34100 + }, + { + "epoch": 0.14644135905824168, + "grad_norm": 0.013056546449661255, + "learning_rate": 8.572432586255961e-05, + "loss": 0.49457626342773436, + "step": 34110 + }, + { + "epoch": 0.14648429114826167, + "grad_norm": 0.14255377650260925, + "learning_rate": 8.572001414244199e-05, + "loss": 0.12429465055465698, + "step": 34120 + }, + { + "epoch": 0.14652722323828168, + "grad_norm": 3.096914768218994, + "learning_rate": 8.571570242232437e-05, + "loss": 0.3980607032775879, + "step": 34130 + }, + { + "epoch": 0.1465701553283017, + "grad_norm": 0.2588160037994385, + "learning_rate": 8.571139070220675e-05, + "loss": 0.21047422885894776, + "step": 34140 + }, + { + "epoch": 0.14661308741832169, + "grad_norm": 1.245520830154419, + "learning_rate": 8.570707898208912e-05, + "loss": 0.19313105344772338, + "step": 34150 + }, + { + "epoch": 0.1466560195083417, + "grad_norm": 0.012002730742096901, + "learning_rate": 8.57027672619715e-05, + "loss": 0.19543395042419434, + "step": 34160 + }, + { + "epoch": 0.14669895159836172, + "grad_norm": 0.4202781021595001, + "learning_rate": 8.569845554185388e-05, + "loss": 0.38264172077178954, + "step": 34170 + }, + { + "epoch": 0.14674188368838173, + "grad_norm": 1.3057153224945068, + "learning_rate": 8.569414382173626e-05, + "loss": 0.3494687795639038, + "step": 34180 + }, + { + "epoch": 0.14678481577840172, + "grad_norm": 0.11200109869241714, + "learning_rate": 8.568983210161863e-05, + "loss": 0.17912702560424804, + "step": 34190 + }, + { + "epoch": 0.14682774786842173, + "grad_norm": 0.04614344611763954, + "learning_rate": 8.568552038150101e-05, + "loss": 0.2561028957366943, + "step": 34200 + }, + { + "epoch": 0.14687067995844175, + "grad_norm": 0.9324910044670105, + "learning_rate": 8.568120866138337e-05, + "loss": 0.22830185890197754, + "step": 34210 + }, + { + "epoch": 0.14691361204846173, + "grad_norm": 0.08826933801174164, + "learning_rate": 8.567689694126575e-05, + "loss": 0.2512583494186401, + "step": 34220 + }, + { + "epoch": 0.14695654413848175, + "grad_norm": 0.8753888010978699, + "learning_rate": 8.567258522114813e-05, + "loss": 0.1659464120864868, + "step": 34230 + }, + { + "epoch": 0.14699947622850176, + "grad_norm": 0.03925846144556999, + "learning_rate": 8.56682735010305e-05, + "loss": 0.15697722434997557, + "step": 34240 + }, + { + "epoch": 0.14704240831852175, + "grad_norm": 0.00965725164860487, + "learning_rate": 8.566396178091288e-05, + "loss": 0.3077390670776367, + "step": 34250 + }, + { + "epoch": 0.14708534040854176, + "grad_norm": 0.08784633129835129, + "learning_rate": 8.565965006079526e-05, + "loss": 0.2271416425704956, + "step": 34260 + }, + { + "epoch": 0.14712827249856178, + "grad_norm": 1.227187991142273, + "learning_rate": 8.565533834067764e-05, + "loss": 0.31869919300079347, + "step": 34270 + }, + { + "epoch": 0.1471712045885818, + "grad_norm": 0.004089116118848324, + "learning_rate": 8.565102662056001e-05, + "loss": 0.21388025283813478, + "step": 34280 + }, + { + "epoch": 0.14721413667860178, + "grad_norm": 0.0882837250828743, + "learning_rate": 8.564671490044238e-05, + "loss": 0.3842825651168823, + "step": 34290 + }, + { + "epoch": 0.1472570687686218, + "grad_norm": 0.05654463544487953, + "learning_rate": 8.564240318032476e-05, + "loss": 0.15656944513320922, + "step": 34300 + }, + { + "epoch": 0.1473000008586418, + "grad_norm": 0.03386420011520386, + "learning_rate": 8.563809146020713e-05, + "loss": 0.2900910615921021, + "step": 34310 + }, + { + "epoch": 0.1473429329486618, + "grad_norm": 22.915380477905273, + "learning_rate": 8.563377974008951e-05, + "loss": 0.20082504749298097, + "step": 34320 + }, + { + "epoch": 0.1473858650386818, + "grad_norm": 4.276361465454102, + "learning_rate": 8.562946801997189e-05, + "loss": 0.2055532693862915, + "step": 34330 + }, + { + "epoch": 0.14742879712870183, + "grad_norm": 0.037190258502960205, + "learning_rate": 8.562515629985427e-05, + "loss": 0.2975011825561523, + "step": 34340 + }, + { + "epoch": 0.14747172921872181, + "grad_norm": 0.050163384526968, + "learning_rate": 8.562084457973664e-05, + "loss": 0.39753849506378175, + "step": 34350 + }, + { + "epoch": 0.14751466130874183, + "grad_norm": 1.5702651739120483, + "learning_rate": 8.561653285961902e-05, + "loss": 0.25239105224609376, + "step": 34360 + }, + { + "epoch": 0.14755759339876184, + "grad_norm": 1.009832501411438, + "learning_rate": 8.56122211395014e-05, + "loss": 0.15066080093383788, + "step": 34370 + }, + { + "epoch": 0.14760052548878186, + "grad_norm": 3.242133617401123, + "learning_rate": 8.560790941938377e-05, + "loss": 0.4163659572601318, + "step": 34380 + }, + { + "epoch": 0.14764345757880185, + "grad_norm": 2.42297101020813, + "learning_rate": 8.560359769926615e-05, + "loss": 0.2022876501083374, + "step": 34390 + }, + { + "epoch": 0.14768638966882186, + "grad_norm": 1.9914714097976685, + "learning_rate": 8.559928597914853e-05, + "loss": 0.13610408306121827, + "step": 34400 + }, + { + "epoch": 0.14772932175884188, + "grad_norm": 1.3775780200958252, + "learning_rate": 8.559497425903091e-05, + "loss": 0.1667981743812561, + "step": 34410 + }, + { + "epoch": 0.14777225384886186, + "grad_norm": 0.5672871470451355, + "learning_rate": 8.559066253891328e-05, + "loss": 0.09758582711219788, + "step": 34420 + }, + { + "epoch": 0.14781518593888188, + "grad_norm": 1.395330548286438, + "learning_rate": 8.558635081879566e-05, + "loss": 0.49545893669128416, + "step": 34430 + }, + { + "epoch": 0.1478581180289019, + "grad_norm": 0.6163213849067688, + "learning_rate": 8.558203909867804e-05, + "loss": 0.41956653594970705, + "step": 34440 + }, + { + "epoch": 0.14790105011892188, + "grad_norm": 0.0650164932012558, + "learning_rate": 8.55777273785604e-05, + "loss": 0.41419262886047364, + "step": 34450 + }, + { + "epoch": 0.1479439822089419, + "grad_norm": 0.08980961889028549, + "learning_rate": 8.557341565844278e-05, + "loss": 0.11508655548095703, + "step": 34460 + }, + { + "epoch": 0.1479869142989619, + "grad_norm": 0.10371974110603333, + "learning_rate": 8.556910393832516e-05, + "loss": 0.2601905107498169, + "step": 34470 + }, + { + "epoch": 0.1480298463889819, + "grad_norm": 1.2678031921386719, + "learning_rate": 8.556479221820753e-05, + "loss": 0.2797661066055298, + "step": 34480 + }, + { + "epoch": 0.1480727784790019, + "grad_norm": 0.013037784025073051, + "learning_rate": 8.556048049808991e-05, + "loss": 0.1015774130821228, + "step": 34490 + }, + { + "epoch": 0.14811571056902192, + "grad_norm": 4.056503772735596, + "learning_rate": 8.555616877797229e-05, + "loss": 0.41338410377502444, + "step": 34500 + }, + { + "epoch": 0.14815864265904194, + "grad_norm": 1.123368740081787, + "learning_rate": 8.555185705785467e-05, + "loss": 0.2618500471115112, + "step": 34510 + }, + { + "epoch": 0.14820157474906193, + "grad_norm": 1.8491207361221313, + "learning_rate": 8.554754533773704e-05, + "loss": 0.14376912117004395, + "step": 34520 + }, + { + "epoch": 0.14824450683908194, + "grad_norm": 0.02913055010139942, + "learning_rate": 8.554323361761942e-05, + "loss": 0.30904397964477537, + "step": 34530 + }, + { + "epoch": 0.14828743892910196, + "grad_norm": 12.26372241973877, + "learning_rate": 8.553892189750179e-05, + "loss": 0.1958317518234253, + "step": 34540 + }, + { + "epoch": 0.14833037101912194, + "grad_norm": 0.00536407670006156, + "learning_rate": 8.553461017738416e-05, + "loss": 0.2335893154144287, + "step": 34550 + }, + { + "epoch": 0.14837330310914196, + "grad_norm": 0.021669838577508926, + "learning_rate": 8.553029845726654e-05, + "loss": 0.07356876134872437, + "step": 34560 + }, + { + "epoch": 0.14841623519916197, + "grad_norm": 0.9531659483909607, + "learning_rate": 8.552598673714892e-05, + "loss": 0.37895355224609373, + "step": 34570 + }, + { + "epoch": 0.14845916728918196, + "grad_norm": 1.2019336223602295, + "learning_rate": 8.55216750170313e-05, + "loss": 0.1651861548423767, + "step": 34580 + }, + { + "epoch": 0.14850209937920197, + "grad_norm": 2.597381114959717, + "learning_rate": 8.551736329691367e-05, + "loss": 0.31086115837097167, + "step": 34590 + }, + { + "epoch": 0.148545031469222, + "grad_norm": 11.423806190490723, + "learning_rate": 8.551305157679605e-05, + "loss": 0.22325286865234376, + "step": 34600 + }, + { + "epoch": 0.148587963559242, + "grad_norm": 0.13982784748077393, + "learning_rate": 8.550873985667843e-05, + "loss": 0.32322494983673095, + "step": 34610 + }, + { + "epoch": 0.148630895649262, + "grad_norm": 38.70827102661133, + "learning_rate": 8.55044281365608e-05, + "loss": 0.3321423292160034, + "step": 34620 + }, + { + "epoch": 0.148673827739282, + "grad_norm": 0.12706917524337769, + "learning_rate": 8.550011641644318e-05, + "loss": 0.2533221960067749, + "step": 34630 + }, + { + "epoch": 0.14871675982930202, + "grad_norm": 0.9390490055084229, + "learning_rate": 8.549580469632556e-05, + "loss": 0.45719470977783205, + "step": 34640 + }, + { + "epoch": 0.148759691919322, + "grad_norm": 4.5604248046875, + "learning_rate": 8.549149297620794e-05, + "loss": 0.20422539710998536, + "step": 34650 + }, + { + "epoch": 0.14880262400934202, + "grad_norm": 1.5207419395446777, + "learning_rate": 8.548718125609031e-05, + "loss": 0.2647045135498047, + "step": 34660 + }, + { + "epoch": 0.14884555609936204, + "grad_norm": 0.008388262242078781, + "learning_rate": 8.548286953597269e-05, + "loss": 0.2994123935699463, + "step": 34670 + }, + { + "epoch": 0.14888848818938202, + "grad_norm": 0.038906846195459366, + "learning_rate": 8.547855781585507e-05, + "loss": 0.07747592329978943, + "step": 34680 + }, + { + "epoch": 0.14893142027940204, + "grad_norm": 0.0024672250729054213, + "learning_rate": 8.547424609573745e-05, + "loss": 0.4452563762664795, + "step": 34690 + }, + { + "epoch": 0.14897435236942205, + "grad_norm": 1.3578065633773804, + "learning_rate": 8.546993437561981e-05, + "loss": 0.4643740653991699, + "step": 34700 + }, + { + "epoch": 0.14901728445944207, + "grad_norm": 0.0018323465483263135, + "learning_rate": 8.546562265550219e-05, + "loss": 0.33861527442932127, + "step": 34710 + }, + { + "epoch": 0.14906021654946205, + "grad_norm": 0.0014579965500161052, + "learning_rate": 8.546131093538456e-05, + "loss": 0.15541526079177856, + "step": 34720 + }, + { + "epoch": 0.14910314863948207, + "grad_norm": 0.5606818199157715, + "learning_rate": 8.545699921526694e-05, + "loss": 0.1587108016014099, + "step": 34730 + }, + { + "epoch": 0.14914608072950208, + "grad_norm": 1.585154414176941, + "learning_rate": 8.545268749514932e-05, + "loss": 0.4065643310546875, + "step": 34740 + }, + { + "epoch": 0.14918901281952207, + "grad_norm": 0.0026883201207965612, + "learning_rate": 8.54483757750317e-05, + "loss": 0.14222111701965331, + "step": 34750 + }, + { + "epoch": 0.14923194490954209, + "grad_norm": 0.15939301252365112, + "learning_rate": 8.544406405491407e-05, + "loss": 0.09957548379898071, + "step": 34760 + }, + { + "epoch": 0.1492748769995621, + "grad_norm": 0.010179446078836918, + "learning_rate": 8.543975233479645e-05, + "loss": 0.15829137563705445, + "step": 34770 + }, + { + "epoch": 0.1493178090895821, + "grad_norm": 0.0022038391325622797, + "learning_rate": 8.543544061467881e-05, + "loss": 0.13406049013137816, + "step": 34780 + }, + { + "epoch": 0.1493607411796021, + "grad_norm": 4.663673400878906, + "learning_rate": 8.543112889456119e-05, + "loss": 0.43924455642700194, + "step": 34790 + }, + { + "epoch": 0.14940367326962212, + "grad_norm": 1.3737190961837769, + "learning_rate": 8.542681717444357e-05, + "loss": 0.24775092601776122, + "step": 34800 + }, + { + "epoch": 0.14944660535964213, + "grad_norm": 0.09232733398675919, + "learning_rate": 8.542250545432595e-05, + "loss": 0.17557703256607055, + "step": 34810 + }, + { + "epoch": 0.14948953744966212, + "grad_norm": 1.6460373401641846, + "learning_rate": 8.541819373420832e-05, + "loss": 0.2807705640792847, + "step": 34820 + }, + { + "epoch": 0.14953246953968213, + "grad_norm": 2.7181038856506348, + "learning_rate": 8.54138820140907e-05, + "loss": 0.24563355445861818, + "step": 34830 + }, + { + "epoch": 0.14957540162970215, + "grad_norm": 0.001726570655591786, + "learning_rate": 8.540957029397308e-05, + "loss": 0.13799943923950195, + "step": 34840 + }, + { + "epoch": 0.14961833371972214, + "grad_norm": 3.9309792518615723, + "learning_rate": 8.540525857385546e-05, + "loss": 0.31938905715942384, + "step": 34850 + }, + { + "epoch": 0.14966126580974215, + "grad_norm": 0.9790728688240051, + "learning_rate": 8.540094685373783e-05, + "loss": 0.4770151138305664, + "step": 34860 + }, + { + "epoch": 0.14970419789976216, + "grad_norm": 1.415012240409851, + "learning_rate": 8.539663513362021e-05, + "loss": 0.041997873783111574, + "step": 34870 + }, + { + "epoch": 0.14974712998978215, + "grad_norm": 1.7993477582931519, + "learning_rate": 8.539232341350259e-05, + "loss": 0.40750322341918943, + "step": 34880 + }, + { + "epoch": 0.14979006207980217, + "grad_norm": 6.537695407867432, + "learning_rate": 8.538801169338497e-05, + "loss": 0.5166696071624756, + "step": 34890 + }, + { + "epoch": 0.14983299416982218, + "grad_norm": 0.040551360696554184, + "learning_rate": 8.538369997326734e-05, + "loss": 0.3467602014541626, + "step": 34900 + }, + { + "epoch": 0.14987592625984217, + "grad_norm": 4.1986894607543945, + "learning_rate": 8.537938825314972e-05, + "loss": 0.42667579650878906, + "step": 34910 + }, + { + "epoch": 0.14991885834986218, + "grad_norm": 1.751068115234375, + "learning_rate": 8.53750765330321e-05, + "loss": 0.1723111629486084, + "step": 34920 + }, + { + "epoch": 0.1499617904398822, + "grad_norm": 1.7199437618255615, + "learning_rate": 8.537076481291447e-05, + "loss": 0.31918811798095703, + "step": 34930 + }, + { + "epoch": 0.1500047225299022, + "grad_norm": 0.026622159406542778, + "learning_rate": 8.536645309279685e-05, + "loss": 0.16448687314987182, + "step": 34940 + }, + { + "epoch": 0.1500476546199222, + "grad_norm": 0.0029959080275148153, + "learning_rate": 8.536214137267922e-05, + "loss": 0.4394689083099365, + "step": 34950 + }, + { + "epoch": 0.15009058670994221, + "grad_norm": 1.8133684396743774, + "learning_rate": 8.535782965256159e-05, + "loss": 0.29315714836120604, + "step": 34960 + }, + { + "epoch": 0.15013351879996223, + "grad_norm": 0.012649440206587315, + "learning_rate": 8.535351793244397e-05, + "loss": 0.06308199763298035, + "step": 34970 + }, + { + "epoch": 0.15017645088998222, + "grad_norm": 0.2371370643377304, + "learning_rate": 8.534920621232635e-05, + "loss": 0.14844993352890015, + "step": 34980 + }, + { + "epoch": 0.15021938298000223, + "grad_norm": 2.117079734802246, + "learning_rate": 8.534489449220872e-05, + "loss": 0.2953939914703369, + "step": 34990 + }, + { + "epoch": 0.15026231507002225, + "grad_norm": 0.02927054464817047, + "learning_rate": 8.53405827720911e-05, + "loss": 0.3870770215988159, + "step": 35000 + }, + { + "epoch": 0.15026231507002225, + "eval_loss": 0.47191593050956726, + "eval_runtime": 27.5864, + "eval_samples_per_second": 3.625, + "eval_steps_per_second": 3.625, + "step": 35000 + }, + { + "epoch": 0.15030524716004223, + "grad_norm": 0.3207988440990448, + "learning_rate": 8.533627105197348e-05, + "loss": 0.12940698862075806, + "step": 35010 + }, + { + "epoch": 0.15034817925006225, + "grad_norm": 0.08552608639001846, + "learning_rate": 8.533195933185586e-05, + "loss": 0.17130987644195556, + "step": 35020 + }, + { + "epoch": 0.15039111134008226, + "grad_norm": 2.546262741088867, + "learning_rate": 8.532764761173822e-05, + "loss": 0.20994532108306885, + "step": 35030 + }, + { + "epoch": 0.15043404343010228, + "grad_norm": 0.012148946523666382, + "learning_rate": 8.53233358916206e-05, + "loss": 0.3189948081970215, + "step": 35040 + }, + { + "epoch": 0.15047697552012226, + "grad_norm": 0.0012491026427596807, + "learning_rate": 8.531902417150298e-05, + "loss": 0.06121616363525391, + "step": 35050 + }, + { + "epoch": 0.15051990761014228, + "grad_norm": 2.2094056606292725, + "learning_rate": 8.531471245138535e-05, + "loss": 0.4135121822357178, + "step": 35060 + }, + { + "epoch": 0.1505628397001623, + "grad_norm": 1.279274344444275, + "learning_rate": 8.531040073126773e-05, + "loss": 0.2774710416793823, + "step": 35070 + }, + { + "epoch": 0.15060577179018228, + "grad_norm": 0.022777825593948364, + "learning_rate": 8.530608901115012e-05, + "loss": 0.15013327598571777, + "step": 35080 + }, + { + "epoch": 0.1506487038802023, + "grad_norm": 0.1570606827735901, + "learning_rate": 8.53017772910325e-05, + "loss": 0.679659652709961, + "step": 35090 + }, + { + "epoch": 0.1506916359702223, + "grad_norm": 0.04781366512179375, + "learning_rate": 8.529746557091488e-05, + "loss": 0.3544900417327881, + "step": 35100 + }, + { + "epoch": 0.1507345680602423, + "grad_norm": 0.9712049961090088, + "learning_rate": 8.529315385079724e-05, + "loss": 0.13232774734497071, + "step": 35110 + }, + { + "epoch": 0.1507775001502623, + "grad_norm": 2.364048719406128, + "learning_rate": 8.528884213067962e-05, + "loss": 0.2865861654281616, + "step": 35120 + }, + { + "epoch": 0.15082043224028233, + "grad_norm": 0.16181397438049316, + "learning_rate": 8.5284530410562e-05, + "loss": 0.10180512666702271, + "step": 35130 + }, + { + "epoch": 0.15086336433030234, + "grad_norm": 0.04067031294107437, + "learning_rate": 8.528021869044437e-05, + "loss": 0.18961187601089477, + "step": 35140 + }, + { + "epoch": 0.15090629642032233, + "grad_norm": 1.3849711418151855, + "learning_rate": 8.527590697032675e-05, + "loss": 0.36178433895111084, + "step": 35150 + }, + { + "epoch": 0.15094922851034234, + "grad_norm": 1.1178500652313232, + "learning_rate": 8.527159525020913e-05, + "loss": 0.5713705062866211, + "step": 35160 + }, + { + "epoch": 0.15099216060036236, + "grad_norm": 1.7924635410308838, + "learning_rate": 8.52672835300915e-05, + "loss": 0.2083521842956543, + "step": 35170 + }, + { + "epoch": 0.15103509269038234, + "grad_norm": 0.4806084632873535, + "learning_rate": 8.526297180997388e-05, + "loss": 0.06166212558746338, + "step": 35180 + }, + { + "epoch": 0.15107802478040236, + "grad_norm": 0.2358323186635971, + "learning_rate": 8.525866008985624e-05, + "loss": 0.2636121273040771, + "step": 35190 + }, + { + "epoch": 0.15112095687042237, + "grad_norm": 0.12955226004123688, + "learning_rate": 8.525434836973862e-05, + "loss": 0.2113489627838135, + "step": 35200 + }, + { + "epoch": 0.15116388896044236, + "grad_norm": 0.0469609797000885, + "learning_rate": 8.5250036649621e-05, + "loss": 0.2918891906738281, + "step": 35210 + }, + { + "epoch": 0.15120682105046238, + "grad_norm": 1.7934201955795288, + "learning_rate": 8.524572492950338e-05, + "loss": 0.32982540130615234, + "step": 35220 + }, + { + "epoch": 0.1512497531404824, + "grad_norm": 0.09866645187139511, + "learning_rate": 8.524141320938575e-05, + "loss": 0.3262962818145752, + "step": 35230 + }, + { + "epoch": 0.1512926852305024, + "grad_norm": 1.6658072471618652, + "learning_rate": 8.523710148926813e-05, + "loss": 0.35833468437194826, + "step": 35240 + }, + { + "epoch": 0.1513356173205224, + "grad_norm": 0.013539593666791916, + "learning_rate": 8.523278976915051e-05, + "loss": 0.16183923482894896, + "step": 35250 + }, + { + "epoch": 0.1513785494105424, + "grad_norm": 2.3311607837677, + "learning_rate": 8.522847804903289e-05, + "loss": 0.6225554466247558, + "step": 35260 + }, + { + "epoch": 0.15142148150056242, + "grad_norm": 1.097491979598999, + "learning_rate": 8.522416632891526e-05, + "loss": 0.2803528308868408, + "step": 35270 + }, + { + "epoch": 0.1514644135905824, + "grad_norm": 0.015554245561361313, + "learning_rate": 8.521985460879763e-05, + "loss": 0.23459622859954835, + "step": 35280 + }, + { + "epoch": 0.15150734568060242, + "grad_norm": 3.9107282161712646, + "learning_rate": 8.521554288868e-05, + "loss": 0.23622241020202636, + "step": 35290 + }, + { + "epoch": 0.15155027777062244, + "grad_norm": 0.09115851670503616, + "learning_rate": 8.52112311685624e-05, + "loss": 0.13579467535018921, + "step": 35300 + }, + { + "epoch": 0.15159320986064242, + "grad_norm": 0.28334158658981323, + "learning_rate": 8.520691944844477e-05, + "loss": 0.18963055610656737, + "step": 35310 + }, + { + "epoch": 0.15163614195066244, + "grad_norm": 25.784404754638672, + "learning_rate": 8.520260772832715e-05, + "loss": 0.2784431934356689, + "step": 35320 + }, + { + "epoch": 0.15167907404068245, + "grad_norm": 1.8637523651123047, + "learning_rate": 8.519829600820953e-05, + "loss": 0.21581711769104003, + "step": 35330 + }, + { + "epoch": 0.15172200613070244, + "grad_norm": 0.2694854736328125, + "learning_rate": 8.51939842880919e-05, + "loss": 0.3204442262649536, + "step": 35340 + }, + { + "epoch": 0.15176493822072246, + "grad_norm": 0.12213743478059769, + "learning_rate": 8.518967256797428e-05, + "loss": 0.2358182430267334, + "step": 35350 + }, + { + "epoch": 0.15180787031074247, + "grad_norm": 0.08006271719932556, + "learning_rate": 8.518536084785665e-05, + "loss": 0.10651001930236817, + "step": 35360 + }, + { + "epoch": 0.15185080240076249, + "grad_norm": 2.5007402896881104, + "learning_rate": 8.518104912773902e-05, + "loss": 0.1309428334236145, + "step": 35370 + }, + { + "epoch": 0.15189373449078247, + "grad_norm": 2.820734739303589, + "learning_rate": 8.51767374076214e-05, + "loss": 0.16440974473953246, + "step": 35380 + }, + { + "epoch": 0.1519366665808025, + "grad_norm": 0.014351708814501762, + "learning_rate": 8.517242568750378e-05, + "loss": 0.1614371657371521, + "step": 35390 + }, + { + "epoch": 0.1519795986708225, + "grad_norm": 2.705998420715332, + "learning_rate": 8.516811396738616e-05, + "loss": 0.21752235889434815, + "step": 35400 + }, + { + "epoch": 0.1520225307608425, + "grad_norm": 0.0937918946146965, + "learning_rate": 8.516380224726853e-05, + "loss": 0.4111818313598633, + "step": 35410 + }, + { + "epoch": 0.1520654628508625, + "grad_norm": 0.09626717865467072, + "learning_rate": 8.515949052715091e-05, + "loss": 0.20632579326629638, + "step": 35420 + }, + { + "epoch": 0.15210839494088252, + "grad_norm": 12.47982406616211, + "learning_rate": 8.515517880703329e-05, + "loss": 0.20486812591552733, + "step": 35430 + }, + { + "epoch": 0.1521513270309025, + "grad_norm": 4.971863746643066, + "learning_rate": 8.515086708691565e-05, + "loss": 0.30284805297851564, + "step": 35440 + }, + { + "epoch": 0.15219425912092252, + "grad_norm": 0.019148241728544235, + "learning_rate": 8.514655536679803e-05, + "loss": 0.24144928455352782, + "step": 35450 + }, + { + "epoch": 0.15223719121094254, + "grad_norm": 0.03687391057610512, + "learning_rate": 8.51422436466804e-05, + "loss": 0.25850441455841067, + "step": 35460 + }, + { + "epoch": 0.15228012330096255, + "grad_norm": 0.0728311687707901, + "learning_rate": 8.513793192656278e-05, + "loss": 0.1594499707221985, + "step": 35470 + }, + { + "epoch": 0.15232305539098254, + "grad_norm": 0.8129069209098816, + "learning_rate": 8.513362020644516e-05, + "loss": 0.313718581199646, + "step": 35480 + }, + { + "epoch": 0.15236598748100255, + "grad_norm": 0.1573277711868286, + "learning_rate": 8.512930848632754e-05, + "loss": 0.3664534091949463, + "step": 35490 + }, + { + "epoch": 0.15240891957102257, + "grad_norm": 27.10266876220703, + "learning_rate": 8.512499676620992e-05, + "loss": 0.2637137174606323, + "step": 35500 + }, + { + "epoch": 0.15245185166104255, + "grad_norm": 0.012873655185103416, + "learning_rate": 8.512068504609229e-05, + "loss": 0.1119425654411316, + "step": 35510 + }, + { + "epoch": 0.15249478375106257, + "grad_norm": 0.017580613493919373, + "learning_rate": 8.511637332597467e-05, + "loss": 0.14549771547317505, + "step": 35520 + }, + { + "epoch": 0.15253771584108258, + "grad_norm": 0.008063385263085365, + "learning_rate": 8.511206160585705e-05, + "loss": 0.11777844429016113, + "step": 35530 + }, + { + "epoch": 0.15258064793110257, + "grad_norm": 0.03290800005197525, + "learning_rate": 8.510774988573942e-05, + "loss": 0.4041603565216064, + "step": 35540 + }, + { + "epoch": 0.15262358002112258, + "grad_norm": 0.2318921834230423, + "learning_rate": 8.51034381656218e-05, + "loss": 0.44219579696655276, + "step": 35550 + }, + { + "epoch": 0.1526665121111426, + "grad_norm": 0.7928853631019592, + "learning_rate": 8.509912644550418e-05, + "loss": 0.48702564239501955, + "step": 35560 + }, + { + "epoch": 0.15270944420116261, + "grad_norm": 0.044648732990026474, + "learning_rate": 8.509481472538656e-05, + "loss": 0.2403254747390747, + "step": 35570 + }, + { + "epoch": 0.1527523762911826, + "grad_norm": 1.3173476457595825, + "learning_rate": 8.509050300526893e-05, + "loss": 0.5910422801971436, + "step": 35580 + }, + { + "epoch": 0.15279530838120262, + "grad_norm": 0.02605602703988552, + "learning_rate": 8.508619128515131e-05, + "loss": 0.20394675731658934, + "step": 35590 + }, + { + "epoch": 0.15283824047122263, + "grad_norm": 0.10889720916748047, + "learning_rate": 8.508187956503368e-05, + "loss": 0.3139126062393188, + "step": 35600 + }, + { + "epoch": 0.15288117256124262, + "grad_norm": 2.5993807315826416, + "learning_rate": 8.507756784491605e-05, + "loss": 0.31171181201934817, + "step": 35610 + }, + { + "epoch": 0.15292410465126263, + "grad_norm": 0.08966104686260223, + "learning_rate": 8.507325612479843e-05, + "loss": 0.06715420484542847, + "step": 35620 + }, + { + "epoch": 0.15296703674128265, + "grad_norm": 1.294317603111267, + "learning_rate": 8.506894440468081e-05, + "loss": 0.16832005977630615, + "step": 35630 + }, + { + "epoch": 0.15300996883130263, + "grad_norm": 0.05570885166525841, + "learning_rate": 8.506463268456318e-05, + "loss": 0.07923851013183594, + "step": 35640 + }, + { + "epoch": 0.15305290092132265, + "grad_norm": 0.770665168762207, + "learning_rate": 8.506032096444556e-05, + "loss": 0.1516018271446228, + "step": 35650 + }, + { + "epoch": 0.15309583301134266, + "grad_norm": 0.013801711611449718, + "learning_rate": 8.505600924432794e-05, + "loss": 0.21667106151580812, + "step": 35660 + }, + { + "epoch": 0.15313876510136268, + "grad_norm": 0.03809322789311409, + "learning_rate": 8.505169752421032e-05, + "loss": 0.23052852153778075, + "step": 35670 + }, + { + "epoch": 0.15318169719138267, + "grad_norm": 1.606998324394226, + "learning_rate": 8.50473858040927e-05, + "loss": 0.2474133014678955, + "step": 35680 + }, + { + "epoch": 0.15322462928140268, + "grad_norm": 1.4611258506774902, + "learning_rate": 8.504307408397506e-05, + "loss": 0.22607591152191162, + "step": 35690 + }, + { + "epoch": 0.1532675613714227, + "grad_norm": 0.01265826728194952, + "learning_rate": 8.503876236385744e-05, + "loss": 0.03770926296710968, + "step": 35700 + }, + { + "epoch": 0.15331049346144268, + "grad_norm": 1.7693620920181274, + "learning_rate": 8.503445064373981e-05, + "loss": 0.15623393058776855, + "step": 35710 + }, + { + "epoch": 0.1533534255514627, + "grad_norm": 0.03232761472463608, + "learning_rate": 8.503013892362219e-05, + "loss": 0.13219951391220092, + "step": 35720 + }, + { + "epoch": 0.1533963576414827, + "grad_norm": 0.002780086826533079, + "learning_rate": 8.502582720350457e-05, + "loss": 0.2331317663192749, + "step": 35730 + }, + { + "epoch": 0.1534392897315027, + "grad_norm": 1.7431129217147827, + "learning_rate": 8.502151548338694e-05, + "loss": 0.2198162317276001, + "step": 35740 + }, + { + "epoch": 0.1534822218215227, + "grad_norm": 0.009173325262963772, + "learning_rate": 8.501720376326932e-05, + "loss": 0.011808304488658905, + "step": 35750 + }, + { + "epoch": 0.15352515391154273, + "grad_norm": 1.4654935598373413, + "learning_rate": 8.50128920431517e-05, + "loss": 0.5061048984527587, + "step": 35760 + }, + { + "epoch": 0.15356808600156271, + "grad_norm": 0.0031481690239161253, + "learning_rate": 8.500858032303408e-05, + "loss": 0.3672648906707764, + "step": 35770 + }, + { + "epoch": 0.15361101809158273, + "grad_norm": 0.00255988840945065, + "learning_rate": 8.500426860291645e-05, + "loss": 0.33251783847808836, + "step": 35780 + }, + { + "epoch": 0.15365395018160274, + "grad_norm": 0.08080971240997314, + "learning_rate": 8.499995688279883e-05, + "loss": 0.44880151748657227, + "step": 35790 + }, + { + "epoch": 0.15369688227162276, + "grad_norm": 0.02960016205906868, + "learning_rate": 8.499564516268121e-05, + "loss": 0.1435869574546814, + "step": 35800 + }, + { + "epoch": 0.15373981436164275, + "grad_norm": 1.4224238395690918, + "learning_rate": 8.499133344256359e-05, + "loss": 0.24386231899261473, + "step": 35810 + }, + { + "epoch": 0.15378274645166276, + "grad_norm": 2.1380560398101807, + "learning_rate": 8.498702172244596e-05, + "loss": 0.22258059978485106, + "step": 35820 + }, + { + "epoch": 0.15382567854168278, + "grad_norm": 3.0995121002197266, + "learning_rate": 8.498271000232834e-05, + "loss": 0.269660496711731, + "step": 35830 + }, + { + "epoch": 0.15386861063170276, + "grad_norm": 0.09611407667398453, + "learning_rate": 8.497839828221072e-05, + "loss": 0.04883643090724945, + "step": 35840 + }, + { + "epoch": 0.15391154272172278, + "grad_norm": 1.2176029682159424, + "learning_rate": 8.497408656209308e-05, + "loss": 0.16574220657348632, + "step": 35850 + }, + { + "epoch": 0.1539544748117428, + "grad_norm": 2.6885287761688232, + "learning_rate": 8.496977484197546e-05, + "loss": 0.15115034580230713, + "step": 35860 + }, + { + "epoch": 0.15399740690176278, + "grad_norm": 0.02519896999001503, + "learning_rate": 8.496546312185784e-05, + "loss": 0.32552452087402345, + "step": 35870 + }, + { + "epoch": 0.1540403389917828, + "grad_norm": 2.205995798110962, + "learning_rate": 8.496115140174021e-05, + "loss": 0.3075399875640869, + "step": 35880 + }, + { + "epoch": 0.1540832710818028, + "grad_norm": 1.0606608390808105, + "learning_rate": 8.495683968162259e-05, + "loss": 0.22299137115478515, + "step": 35890 + }, + { + "epoch": 0.15412620317182282, + "grad_norm": 0.01584595814347267, + "learning_rate": 8.495252796150497e-05, + "loss": 0.1980022072792053, + "step": 35900 + }, + { + "epoch": 0.1541691352618428, + "grad_norm": 0.8658286929130554, + "learning_rate": 8.494821624138735e-05, + "loss": 0.045036503672599794, + "step": 35910 + }, + { + "epoch": 0.15421206735186282, + "grad_norm": 5.6871771812438965, + "learning_rate": 8.494390452126972e-05, + "loss": 0.2639256238937378, + "step": 35920 + }, + { + "epoch": 0.15425499944188284, + "grad_norm": 0.06883329898118973, + "learning_rate": 8.493959280115209e-05, + "loss": 0.07429475784301758, + "step": 35930 + }, + { + "epoch": 0.15429793153190283, + "grad_norm": 34.059383392333984, + "learning_rate": 8.493528108103446e-05, + "loss": 0.3349571228027344, + "step": 35940 + }, + { + "epoch": 0.15434086362192284, + "grad_norm": 0.40555959939956665, + "learning_rate": 8.493096936091684e-05, + "loss": 0.09937132000923157, + "step": 35950 + }, + { + "epoch": 0.15438379571194286, + "grad_norm": 3.8519086837768555, + "learning_rate": 8.492665764079922e-05, + "loss": 0.2603405714035034, + "step": 35960 + }, + { + "epoch": 0.15442672780196284, + "grad_norm": 0.5037367939949036, + "learning_rate": 8.49223459206816e-05, + "loss": 0.06734598278999329, + "step": 35970 + }, + { + "epoch": 0.15446965989198286, + "grad_norm": 0.23887856304645538, + "learning_rate": 8.491803420056397e-05, + "loss": 0.30143325328826903, + "step": 35980 + }, + { + "epoch": 0.15451259198200287, + "grad_norm": 0.14862442016601562, + "learning_rate": 8.491372248044635e-05, + "loss": 0.2484194278717041, + "step": 35990 + }, + { + "epoch": 0.1545555240720229, + "grad_norm": 0.24175690114498138, + "learning_rate": 8.490941076032873e-05, + "loss": 0.17390332221984864, + "step": 36000 + }, + { + "epoch": 0.1545555240720229, + "eval_loss": 0.5203341841697693, + "eval_runtime": 27.4513, + "eval_samples_per_second": 3.643, + "eval_steps_per_second": 3.643, + "step": 36000 + }, + { + "epoch": 0.15459845616204287, + "grad_norm": 0.9831352829933167, + "learning_rate": 8.49050990402111e-05, + "loss": 0.2074409008026123, + "step": 36010 + }, + { + "epoch": 0.1546413882520629, + "grad_norm": 0.041125647723674774, + "learning_rate": 8.490078732009348e-05, + "loss": 0.14144506454467773, + "step": 36020 + }, + { + "epoch": 0.1546843203420829, + "grad_norm": 0.06358243525028229, + "learning_rate": 8.489647559997586e-05, + "loss": 0.3165971040725708, + "step": 36030 + }, + { + "epoch": 0.1547272524321029, + "grad_norm": 1.3796640634536743, + "learning_rate": 8.489216387985824e-05, + "loss": 0.2835206985473633, + "step": 36040 + }, + { + "epoch": 0.1547701845221229, + "grad_norm": 0.26504936814308167, + "learning_rate": 8.488785215974061e-05, + "loss": 0.26969594955444337, + "step": 36050 + }, + { + "epoch": 0.15481311661214292, + "grad_norm": 0.022056257352232933, + "learning_rate": 8.488354043962299e-05, + "loss": 0.1651803135871887, + "step": 36060 + }, + { + "epoch": 0.1548560487021629, + "grad_norm": 0.06279142946004868, + "learning_rate": 8.487922871950537e-05, + "loss": 0.2755633592605591, + "step": 36070 + }, + { + "epoch": 0.15489898079218292, + "grad_norm": 0.0036553270183503628, + "learning_rate": 8.487491699938775e-05, + "loss": 0.29899609088897705, + "step": 36080 + }, + { + "epoch": 0.15494191288220294, + "grad_norm": 2.297783613204956, + "learning_rate": 8.487060527927012e-05, + "loss": 0.5786831378936768, + "step": 36090 + }, + { + "epoch": 0.15498484497222295, + "grad_norm": 7.233226299285889, + "learning_rate": 8.486629355915249e-05, + "loss": 0.5784254550933838, + "step": 36100 + }, + { + "epoch": 0.15502777706224294, + "grad_norm": 0.054397862404584885, + "learning_rate": 8.486198183903487e-05, + "loss": 0.16331673860549928, + "step": 36110 + }, + { + "epoch": 0.15507070915226295, + "grad_norm": 18.670793533325195, + "learning_rate": 8.485767011891724e-05, + "loss": 0.19945695400238037, + "step": 36120 + }, + { + "epoch": 0.15511364124228297, + "grad_norm": 0.08316559344530106, + "learning_rate": 8.485335839879962e-05, + "loss": 0.13774746656417847, + "step": 36130 + }, + { + "epoch": 0.15515657333230296, + "grad_norm": 5.581072807312012, + "learning_rate": 8.4849046678682e-05, + "loss": 0.25059173107147215, + "step": 36140 + }, + { + "epoch": 0.15519950542232297, + "grad_norm": 0.045821044594049454, + "learning_rate": 8.484473495856437e-05, + "loss": 0.1590253233909607, + "step": 36150 + }, + { + "epoch": 0.15524243751234298, + "grad_norm": 0.0009496421553194523, + "learning_rate": 8.484042323844675e-05, + "loss": 0.3221937894821167, + "step": 36160 + }, + { + "epoch": 0.15528536960236297, + "grad_norm": 0.12948010861873627, + "learning_rate": 8.483611151832913e-05, + "loss": 0.06770175695419312, + "step": 36170 + }, + { + "epoch": 0.155328301692383, + "grad_norm": 1.3551721572875977, + "learning_rate": 8.48317997982115e-05, + "loss": 0.45805158615112307, + "step": 36180 + }, + { + "epoch": 0.155371233782403, + "grad_norm": 1.7094125747680664, + "learning_rate": 8.482748807809387e-05, + "loss": 0.3378595352172852, + "step": 36190 + }, + { + "epoch": 0.155414165872423, + "grad_norm": 1.1882890462875366, + "learning_rate": 8.482317635797625e-05, + "loss": 0.22665467262268066, + "step": 36200 + }, + { + "epoch": 0.155457097962443, + "grad_norm": 0.08076924830675125, + "learning_rate": 8.481886463785863e-05, + "loss": 0.2971139669418335, + "step": 36210 + }, + { + "epoch": 0.15550003005246302, + "grad_norm": 3.9227700233459473, + "learning_rate": 8.4814552917741e-05, + "loss": 0.16980533599853515, + "step": 36220 + }, + { + "epoch": 0.15554296214248303, + "grad_norm": 1.2522578239440918, + "learning_rate": 8.481024119762338e-05, + "loss": 0.21915044784545898, + "step": 36230 + }, + { + "epoch": 0.15558589423250302, + "grad_norm": 17.51162338256836, + "learning_rate": 8.480592947750576e-05, + "loss": 0.3069744110107422, + "step": 36240 + }, + { + "epoch": 0.15562882632252303, + "grad_norm": 1.3388787508010864, + "learning_rate": 8.480161775738813e-05, + "loss": 0.22943878173828125, + "step": 36250 + }, + { + "epoch": 0.15567175841254305, + "grad_norm": 0.29865941405296326, + "learning_rate": 8.479730603727051e-05, + "loss": 0.2652363538742065, + "step": 36260 + }, + { + "epoch": 0.15571469050256304, + "grad_norm": 0.10734880715608597, + "learning_rate": 8.479299431715289e-05, + "loss": 0.11708941459655761, + "step": 36270 + }, + { + "epoch": 0.15575762259258305, + "grad_norm": 0.14861108362674713, + "learning_rate": 8.478868259703527e-05, + "loss": 0.24161868095397948, + "step": 36280 + }, + { + "epoch": 0.15580055468260307, + "grad_norm": 0.6159643530845642, + "learning_rate": 8.478437087691764e-05, + "loss": 0.3283845901489258, + "step": 36290 + }, + { + "epoch": 0.15584348677262305, + "grad_norm": 0.009295495226979256, + "learning_rate": 8.478005915680002e-05, + "loss": 0.16799557209014893, + "step": 36300 + }, + { + "epoch": 0.15588641886264307, + "grad_norm": 0.19504667818546295, + "learning_rate": 8.47757474366824e-05, + "loss": 0.06348141431808471, + "step": 36310 + }, + { + "epoch": 0.15592935095266308, + "grad_norm": 1.60503089427948, + "learning_rate": 8.477143571656478e-05, + "loss": 0.14590160846710204, + "step": 36320 + }, + { + "epoch": 0.1559722830426831, + "grad_norm": 0.0022120329085737467, + "learning_rate": 8.476712399644715e-05, + "loss": 0.33194334506988527, + "step": 36330 + }, + { + "epoch": 0.15601521513270308, + "grad_norm": 5.712146282196045, + "learning_rate": 8.476281227632953e-05, + "loss": 0.260235071182251, + "step": 36340 + }, + { + "epoch": 0.1560581472227231, + "grad_norm": 1.7155770063400269, + "learning_rate": 8.47585005562119e-05, + "loss": 0.3236240386962891, + "step": 36350 + }, + { + "epoch": 0.1561010793127431, + "grad_norm": 0.96340012550354, + "learning_rate": 8.475418883609427e-05, + "loss": 0.3302635669708252, + "step": 36360 + }, + { + "epoch": 0.1561440114027631, + "grad_norm": 0.015084980055689812, + "learning_rate": 8.474987711597665e-05, + "loss": 0.1243367314338684, + "step": 36370 + }, + { + "epoch": 0.15618694349278311, + "grad_norm": 7.593947410583496, + "learning_rate": 8.474556539585903e-05, + "loss": 0.24390408992767335, + "step": 36380 + }, + { + "epoch": 0.15622987558280313, + "grad_norm": 0.6490558981895447, + "learning_rate": 8.47412536757414e-05, + "loss": 0.25506789684295655, + "step": 36390 + }, + { + "epoch": 0.15627280767282312, + "grad_norm": 0.027163298800587654, + "learning_rate": 8.473694195562378e-05, + "loss": 0.3595500707626343, + "step": 36400 + }, + { + "epoch": 0.15631573976284313, + "grad_norm": 0.00042546645272523165, + "learning_rate": 8.473263023550616e-05, + "loss": 0.14694883823394775, + "step": 36410 + }, + { + "epoch": 0.15635867185286315, + "grad_norm": 0.024790508672595024, + "learning_rate": 8.472831851538854e-05, + "loss": 0.18631467819213868, + "step": 36420 + }, + { + "epoch": 0.15640160394288316, + "grad_norm": 0.0071322438307106495, + "learning_rate": 8.47240067952709e-05, + "loss": 0.42502856254577637, + "step": 36430 + }, + { + "epoch": 0.15644453603290315, + "grad_norm": 4.77712869644165, + "learning_rate": 8.471969507515328e-05, + "loss": 0.22539980411529542, + "step": 36440 + }, + { + "epoch": 0.15648746812292316, + "grad_norm": 0.6240074634552002, + "learning_rate": 8.471538335503565e-05, + "loss": 0.13043266534805298, + "step": 36450 + }, + { + "epoch": 0.15653040021294318, + "grad_norm": 2.7969493865966797, + "learning_rate": 8.471107163491803e-05, + "loss": 0.322530198097229, + "step": 36460 + }, + { + "epoch": 0.15657333230296316, + "grad_norm": 2.325531244277954, + "learning_rate": 8.470675991480041e-05, + "loss": 0.2518151044845581, + "step": 36470 + }, + { + "epoch": 0.15661626439298318, + "grad_norm": 0.013661663047969341, + "learning_rate": 8.470244819468279e-05, + "loss": 0.28728699684143066, + "step": 36480 + }, + { + "epoch": 0.1566591964830032, + "grad_norm": 2.1703040599823, + "learning_rate": 8.469813647456518e-05, + "loss": 0.1820298433303833, + "step": 36490 + }, + { + "epoch": 0.15670212857302318, + "grad_norm": 0.004863078705966473, + "learning_rate": 8.469382475444755e-05, + "loss": 0.43201313018798826, + "step": 36500 + }, + { + "epoch": 0.1567450606630432, + "grad_norm": 0.0009345727739855647, + "learning_rate": 8.468951303432992e-05, + "loss": 0.19721375703811644, + "step": 36510 + }, + { + "epoch": 0.1567879927530632, + "grad_norm": 0.022429587319493294, + "learning_rate": 8.46852013142123e-05, + "loss": 0.15839117765426636, + "step": 36520 + }, + { + "epoch": 0.15683092484308322, + "grad_norm": 0.006632617674767971, + "learning_rate": 8.468088959409467e-05, + "loss": 0.06467214822769166, + "step": 36530 + }, + { + "epoch": 0.1568738569331032, + "grad_norm": 0.013401179574429989, + "learning_rate": 8.467657787397705e-05, + "loss": 0.35011792182922363, + "step": 36540 + }, + { + "epoch": 0.15691678902312323, + "grad_norm": 0.0018219004850834608, + "learning_rate": 8.467226615385943e-05, + "loss": 0.2899348497390747, + "step": 36550 + }, + { + "epoch": 0.15695972111314324, + "grad_norm": 8.564081192016602, + "learning_rate": 8.46679544337418e-05, + "loss": 0.48504009246826174, + "step": 36560 + }, + { + "epoch": 0.15700265320316323, + "grad_norm": 0.031425461173057556, + "learning_rate": 8.466364271362418e-05, + "loss": 0.19798953533172609, + "step": 36570 + }, + { + "epoch": 0.15704558529318324, + "grad_norm": 1.5136810541152954, + "learning_rate": 8.465933099350656e-05, + "loss": 0.2838395595550537, + "step": 36580 + }, + { + "epoch": 0.15708851738320326, + "grad_norm": 0.0031062799971550703, + "learning_rate": 8.465501927338892e-05, + "loss": 0.1323293685913086, + "step": 36590 + }, + { + "epoch": 0.15713144947322324, + "grad_norm": 0.07706096768379211, + "learning_rate": 8.46507075532713e-05, + "loss": 0.10394009351730346, + "step": 36600 + }, + { + "epoch": 0.15717438156324326, + "grad_norm": 1.4084281921386719, + "learning_rate": 8.464639583315368e-05, + "loss": 0.30106706619262696, + "step": 36610 + }, + { + "epoch": 0.15721731365326327, + "grad_norm": 0.29943132400512695, + "learning_rate": 8.464208411303606e-05, + "loss": 0.19430623054504395, + "step": 36620 + }, + { + "epoch": 0.15726024574328326, + "grad_norm": 0.004392318427562714, + "learning_rate": 8.463777239291843e-05, + "loss": 0.2858541488647461, + "step": 36630 + }, + { + "epoch": 0.15730317783330328, + "grad_norm": 0.006837871856987476, + "learning_rate": 8.463346067280081e-05, + "loss": 0.3170685529708862, + "step": 36640 + }, + { + "epoch": 0.1573461099233233, + "grad_norm": 0.005509480368345976, + "learning_rate": 8.462914895268319e-05, + "loss": 0.1708204984664917, + "step": 36650 + }, + { + "epoch": 0.1573890420133433, + "grad_norm": 0.007557610981166363, + "learning_rate": 8.462483723256557e-05, + "loss": 0.05756177306175232, + "step": 36660 + }, + { + "epoch": 0.1574319741033633, + "grad_norm": 0.020103711634874344, + "learning_rate": 8.462052551244793e-05, + "loss": 0.1241947054862976, + "step": 36670 + }, + { + "epoch": 0.1574749061933833, + "grad_norm": 0.11283276975154877, + "learning_rate": 8.46162137923303e-05, + "loss": 0.4595289707183838, + "step": 36680 + }, + { + "epoch": 0.15751783828340332, + "grad_norm": 2.015280246734619, + "learning_rate": 8.461190207221268e-05, + "loss": 0.25780715942382815, + "step": 36690 + }, + { + "epoch": 0.1575607703734233, + "grad_norm": 0.0033210236579179764, + "learning_rate": 8.460759035209506e-05, + "loss": 0.22688713073730468, + "step": 36700 + }, + { + "epoch": 0.15760370246344332, + "grad_norm": 0.051066651940345764, + "learning_rate": 8.460327863197745e-05, + "loss": 0.153584885597229, + "step": 36710 + }, + { + "epoch": 0.15764663455346334, + "grad_norm": 0.017148710787296295, + "learning_rate": 8.459896691185983e-05, + "loss": 0.2938904523849487, + "step": 36720 + }, + { + "epoch": 0.15768956664348333, + "grad_norm": 0.14058199524879456, + "learning_rate": 8.45946551917422e-05, + "loss": 0.1530178666114807, + "step": 36730 + }, + { + "epoch": 0.15773249873350334, + "grad_norm": 0.011079216375946999, + "learning_rate": 8.459034347162458e-05, + "loss": 0.23626675605773925, + "step": 36740 + }, + { + "epoch": 0.15777543082352335, + "grad_norm": 1.6957927942276, + "learning_rate": 8.458603175150696e-05, + "loss": 0.2987285375595093, + "step": 36750 + }, + { + "epoch": 0.15781836291354337, + "grad_norm": 1.522979736328125, + "learning_rate": 8.458172003138932e-05, + "loss": 0.325986385345459, + "step": 36760 + }, + { + "epoch": 0.15786129500356336, + "grad_norm": 0.02568388730287552, + "learning_rate": 8.45774083112717e-05, + "loss": 0.16397476196289062, + "step": 36770 + }, + { + "epoch": 0.15790422709358337, + "grad_norm": 0.028842099010944366, + "learning_rate": 8.457309659115408e-05, + "loss": 0.16065794229507446, + "step": 36780 + }, + { + "epoch": 0.1579471591836034, + "grad_norm": 0.01377957034856081, + "learning_rate": 8.456878487103646e-05, + "loss": 0.1282724380493164, + "step": 36790 + }, + { + "epoch": 0.15799009127362337, + "grad_norm": 0.01206178404390812, + "learning_rate": 8.456447315091883e-05, + "loss": 0.15207911729812623, + "step": 36800 + }, + { + "epoch": 0.1580330233636434, + "grad_norm": 0.4078097343444824, + "learning_rate": 8.456016143080121e-05, + "loss": 0.21211161613464355, + "step": 36810 + }, + { + "epoch": 0.1580759554536634, + "grad_norm": 1.6632593870162964, + "learning_rate": 8.455584971068359e-05, + "loss": 0.14590305089950562, + "step": 36820 + }, + { + "epoch": 0.1581188875436834, + "grad_norm": 3.642359972000122, + "learning_rate": 8.455153799056597e-05, + "loss": 0.16237971782684327, + "step": 36830 + }, + { + "epoch": 0.1581618196337034, + "grad_norm": 0.7893156409263611, + "learning_rate": 8.454722627044833e-05, + "loss": 0.25860013961791994, + "step": 36840 + }, + { + "epoch": 0.15820475172372342, + "grad_norm": 0.0552891306579113, + "learning_rate": 8.454291455033071e-05, + "loss": 0.12260349988937377, + "step": 36850 + }, + { + "epoch": 0.15824768381374343, + "grad_norm": 0.0010891527635976672, + "learning_rate": 8.453860283021308e-05, + "loss": 0.1928635835647583, + "step": 36860 + }, + { + "epoch": 0.15829061590376342, + "grad_norm": 0.009116712026298046, + "learning_rate": 8.453429111009546e-05, + "loss": 0.39304156303405763, + "step": 36870 + }, + { + "epoch": 0.15833354799378344, + "grad_norm": 1.2738218307495117, + "learning_rate": 8.452997938997784e-05, + "loss": 0.2733743190765381, + "step": 36880 + }, + { + "epoch": 0.15837648008380345, + "grad_norm": 0.06841573864221573, + "learning_rate": 8.452566766986022e-05, + "loss": 0.286100172996521, + "step": 36890 + }, + { + "epoch": 0.15841941217382344, + "grad_norm": 0.003039463423192501, + "learning_rate": 8.45213559497426e-05, + "loss": 0.28062286376953127, + "step": 36900 + }, + { + "epoch": 0.15846234426384345, + "grad_norm": 3.5420756340026855, + "learning_rate": 8.451704422962497e-05, + "loss": 0.3182435274124146, + "step": 36910 + }, + { + "epoch": 0.15850527635386347, + "grad_norm": 0.7639907002449036, + "learning_rate": 8.451273250950734e-05, + "loss": 0.13674689531326295, + "step": 36920 + }, + { + "epoch": 0.15854820844388345, + "grad_norm": 3.4631237983703613, + "learning_rate": 8.450842078938973e-05, + "loss": 0.505620002746582, + "step": 36930 + }, + { + "epoch": 0.15859114053390347, + "grad_norm": 0.16935129463672638, + "learning_rate": 8.45041090692721e-05, + "loss": 0.3636377573013306, + "step": 36940 + }, + { + "epoch": 0.15863407262392348, + "grad_norm": 0.040034808218479156, + "learning_rate": 8.449979734915448e-05, + "loss": 0.2877780914306641, + "step": 36950 + }, + { + "epoch": 0.1586770047139435, + "grad_norm": 1.0555989742279053, + "learning_rate": 8.449548562903686e-05, + "loss": 0.40925869941711424, + "step": 36960 + }, + { + "epoch": 0.15871993680396349, + "grad_norm": 0.3184961676597595, + "learning_rate": 8.449117390891924e-05, + "loss": 0.27372145652770996, + "step": 36970 + }, + { + "epoch": 0.1587628688939835, + "grad_norm": 3.1204659938812256, + "learning_rate": 8.448686218880161e-05, + "loss": 0.08157891035079956, + "step": 36980 + }, + { + "epoch": 0.15880580098400351, + "grad_norm": 1.3315865993499756, + "learning_rate": 8.448255046868399e-05, + "loss": 0.3175251245498657, + "step": 36990 + }, + { + "epoch": 0.1588487330740235, + "grad_norm": 0.01965898834168911, + "learning_rate": 8.447823874856635e-05, + "loss": 0.17869733572006224, + "step": 37000 + }, + { + "epoch": 0.1588487330740235, + "eval_loss": 0.4670685827732086, + "eval_runtime": 27.4012, + "eval_samples_per_second": 3.649, + "eval_steps_per_second": 3.649, + "step": 37000 + }, + { + "epoch": 0.15889166516404352, + "grad_norm": 9.808270454406738, + "learning_rate": 8.447392702844873e-05, + "loss": 0.2566872835159302, + "step": 37010 + }, + { + "epoch": 0.15893459725406353, + "grad_norm": 0.01820688508450985, + "learning_rate": 8.446961530833111e-05, + "loss": 0.1685408592224121, + "step": 37020 + }, + { + "epoch": 0.15897752934408352, + "grad_norm": 5.085228443145752, + "learning_rate": 8.446530358821349e-05, + "loss": 0.13185917139053344, + "step": 37030 + }, + { + "epoch": 0.15902046143410353, + "grad_norm": 0.7252869606018066, + "learning_rate": 8.446099186809586e-05, + "loss": 0.0749538004398346, + "step": 37040 + }, + { + "epoch": 0.15906339352412355, + "grad_norm": 0.09191256016492844, + "learning_rate": 8.445668014797824e-05, + "loss": 0.30454657077789304, + "step": 37050 + }, + { + "epoch": 0.15910632561414353, + "grad_norm": 0.0034016177523881197, + "learning_rate": 8.445236842786062e-05, + "loss": 0.22394905090332032, + "step": 37060 + }, + { + "epoch": 0.15914925770416355, + "grad_norm": 0.07419943064451218, + "learning_rate": 8.4448056707743e-05, + "loss": 0.24617114067077636, + "step": 37070 + }, + { + "epoch": 0.15919218979418356, + "grad_norm": 19.793487548828125, + "learning_rate": 8.444374498762537e-05, + "loss": 0.3588016748428345, + "step": 37080 + }, + { + "epoch": 0.15923512188420358, + "grad_norm": 0.8470166921615601, + "learning_rate": 8.443943326750774e-05, + "loss": 0.15342183113098146, + "step": 37090 + }, + { + "epoch": 0.15927805397422357, + "grad_norm": 0.09765598922967911, + "learning_rate": 8.443512154739011e-05, + "loss": 0.395892071723938, + "step": 37100 + }, + { + "epoch": 0.15932098606424358, + "grad_norm": 0.03148749843239784, + "learning_rate": 8.443080982727249e-05, + "loss": 0.14055685997009276, + "step": 37110 + }, + { + "epoch": 0.1593639181542636, + "grad_norm": 0.9235115647315979, + "learning_rate": 8.442649810715487e-05, + "loss": 0.04926349222660065, + "step": 37120 + }, + { + "epoch": 0.15940685024428358, + "grad_norm": 2.640244483947754, + "learning_rate": 8.442218638703725e-05, + "loss": 0.11464877128601074, + "step": 37130 + }, + { + "epoch": 0.1594497823343036, + "grad_norm": 0.016359224915504456, + "learning_rate": 8.441787466691962e-05, + "loss": 0.2091724157333374, + "step": 37140 + }, + { + "epoch": 0.1594927144243236, + "grad_norm": 1.4536552429199219, + "learning_rate": 8.4413562946802e-05, + "loss": 0.45433721542358396, + "step": 37150 + }, + { + "epoch": 0.1595356465143436, + "grad_norm": 3.6631672382354736, + "learning_rate": 8.440925122668438e-05, + "loss": 0.3959526300430298, + "step": 37160 + }, + { + "epoch": 0.1595785786043636, + "grad_norm": 0.037146102637052536, + "learning_rate": 8.440493950656676e-05, + "loss": 0.11330243349075317, + "step": 37170 + }, + { + "epoch": 0.15962151069438363, + "grad_norm": 0.1929967701435089, + "learning_rate": 8.440062778644913e-05, + "loss": 0.2217888355255127, + "step": 37180 + }, + { + "epoch": 0.15966444278440364, + "grad_norm": 0.16690918803215027, + "learning_rate": 8.439631606633151e-05, + "loss": 0.26970853805541994, + "step": 37190 + }, + { + "epoch": 0.15970737487442363, + "grad_norm": 0.018210075795650482, + "learning_rate": 8.439200434621389e-05, + "loss": 0.00593365877866745, + "step": 37200 + }, + { + "epoch": 0.15975030696444364, + "grad_norm": 0.005669247359037399, + "learning_rate": 8.438769262609626e-05, + "loss": 0.35240318775177004, + "step": 37210 + }, + { + "epoch": 0.15979323905446366, + "grad_norm": 8.740744590759277, + "learning_rate": 8.438338090597864e-05, + "loss": 0.14758760929107667, + "step": 37220 + }, + { + "epoch": 0.15983617114448365, + "grad_norm": 0.017681747674942017, + "learning_rate": 8.437906918586102e-05, + "loss": 0.1936264991760254, + "step": 37230 + }, + { + "epoch": 0.15987910323450366, + "grad_norm": 2.3267531394958496, + "learning_rate": 8.43747574657434e-05, + "loss": 0.25931923389434813, + "step": 37240 + }, + { + "epoch": 0.15992203532452368, + "grad_norm": 0.01451034378260374, + "learning_rate": 8.437044574562576e-05, + "loss": 0.2892030954360962, + "step": 37250 + }, + { + "epoch": 0.15996496741454366, + "grad_norm": 0.5280787348747253, + "learning_rate": 8.436613402550814e-05, + "loss": 0.35807051658630373, + "step": 37260 + }, + { + "epoch": 0.16000789950456368, + "grad_norm": 1.7871356010437012, + "learning_rate": 8.436182230539052e-05, + "loss": 0.4931039333343506, + "step": 37270 + }, + { + "epoch": 0.1600508315945837, + "grad_norm": 0.7012388706207275, + "learning_rate": 8.435751058527289e-05, + "loss": 0.07530275583267212, + "step": 37280 + }, + { + "epoch": 0.1600937636846037, + "grad_norm": 1.1680361032485962, + "learning_rate": 8.435319886515527e-05, + "loss": 0.23157844543457032, + "step": 37290 + }, + { + "epoch": 0.1601366957746237, + "grad_norm": 0.012162178754806519, + "learning_rate": 8.434888714503765e-05, + "loss": 0.19087698459625244, + "step": 37300 + }, + { + "epoch": 0.1601796278646437, + "grad_norm": 2.447385787963867, + "learning_rate": 8.434457542492002e-05, + "loss": 0.21148381233215333, + "step": 37310 + }, + { + "epoch": 0.16022255995466372, + "grad_norm": 3.5636308193206787, + "learning_rate": 8.43402637048024e-05, + "loss": 0.4460421085357666, + "step": 37320 + }, + { + "epoch": 0.1602654920446837, + "grad_norm": 1.0085793733596802, + "learning_rate": 8.433595198468477e-05, + "loss": 0.21883647441864013, + "step": 37330 + }, + { + "epoch": 0.16030842413470373, + "grad_norm": 0.15825310349464417, + "learning_rate": 8.433164026456714e-05, + "loss": 0.25668346881866455, + "step": 37340 + }, + { + "epoch": 0.16035135622472374, + "grad_norm": 0.021424664184451103, + "learning_rate": 8.432732854444952e-05, + "loss": 0.3583883285522461, + "step": 37350 + }, + { + "epoch": 0.16039428831474373, + "grad_norm": 0.020336715504527092, + "learning_rate": 8.43230168243319e-05, + "loss": 0.09908640384674072, + "step": 37360 + }, + { + "epoch": 0.16043722040476374, + "grad_norm": 0.04126065596938133, + "learning_rate": 8.431870510421428e-05, + "loss": 0.3000126123428345, + "step": 37370 + }, + { + "epoch": 0.16048015249478376, + "grad_norm": 0.26878777146339417, + "learning_rate": 8.431439338409665e-05, + "loss": 0.18765181303024292, + "step": 37380 + }, + { + "epoch": 0.16052308458480377, + "grad_norm": 0.025135841220617294, + "learning_rate": 8.431008166397903e-05, + "loss": 0.03208061158657074, + "step": 37390 + }, + { + "epoch": 0.16056601667482376, + "grad_norm": 0.005874186288565397, + "learning_rate": 8.430576994386141e-05, + "loss": 0.11698212623596191, + "step": 37400 + }, + { + "epoch": 0.16060894876484377, + "grad_norm": 2.410247325897217, + "learning_rate": 8.430145822374378e-05, + "loss": 0.41802539825439455, + "step": 37410 + }, + { + "epoch": 0.1606518808548638, + "grad_norm": 0.014765270985662937, + "learning_rate": 8.429714650362616e-05, + "loss": 0.19299309253692626, + "step": 37420 + }, + { + "epoch": 0.16069481294488377, + "grad_norm": 0.8710483312606812, + "learning_rate": 8.429283478350854e-05, + "loss": 0.14042598009109497, + "step": 37430 + }, + { + "epoch": 0.1607377450349038, + "grad_norm": 0.06303207576274872, + "learning_rate": 8.428852306339092e-05, + "loss": 0.08297204971313477, + "step": 37440 + }, + { + "epoch": 0.1607806771249238, + "grad_norm": 0.020654955878853798, + "learning_rate": 8.42842113432733e-05, + "loss": 0.2506140470504761, + "step": 37450 + }, + { + "epoch": 0.1608236092149438, + "grad_norm": 0.2789359390735626, + "learning_rate": 8.427989962315567e-05, + "loss": 0.20466957092285157, + "step": 37460 + }, + { + "epoch": 0.1608665413049638, + "grad_norm": 4.288355350494385, + "learning_rate": 8.427558790303805e-05, + "loss": 0.2857557773590088, + "step": 37470 + }, + { + "epoch": 0.16090947339498382, + "grad_norm": 3.2750890254974365, + "learning_rate": 8.427127618292043e-05, + "loss": 0.2011713981628418, + "step": 37480 + }, + { + "epoch": 0.1609524054850038, + "grad_norm": 3.755577564239502, + "learning_rate": 8.42669644628028e-05, + "loss": 0.30210537910461427, + "step": 37490 + }, + { + "epoch": 0.16099533757502382, + "grad_norm": 2.026909112930298, + "learning_rate": 8.426265274268517e-05, + "loss": 0.41617717742919924, + "step": 37500 + }, + { + "epoch": 0.16103826966504384, + "grad_norm": 4.035932540893555, + "learning_rate": 8.425834102256754e-05, + "loss": 0.263014030456543, + "step": 37510 + }, + { + "epoch": 0.16108120175506385, + "grad_norm": 0.35158804059028625, + "learning_rate": 8.425402930244992e-05, + "loss": 0.007732333242893219, + "step": 37520 + }, + { + "epoch": 0.16112413384508384, + "grad_norm": 0.1560453325510025, + "learning_rate": 8.42497175823323e-05, + "loss": 0.1600959300994873, + "step": 37530 + }, + { + "epoch": 0.16116706593510385, + "grad_norm": 0.06052204594016075, + "learning_rate": 8.424540586221468e-05, + "loss": 0.2086103916168213, + "step": 37540 + }, + { + "epoch": 0.16120999802512387, + "grad_norm": 0.007203094661235809, + "learning_rate": 8.424109414209705e-05, + "loss": 0.11548494100570679, + "step": 37550 + }, + { + "epoch": 0.16125293011514386, + "grad_norm": 0.02814324013888836, + "learning_rate": 8.423678242197943e-05, + "loss": 0.09334410429000854, + "step": 37560 + }, + { + "epoch": 0.16129586220516387, + "grad_norm": 0.9473418593406677, + "learning_rate": 8.423247070186181e-05, + "loss": 0.09891886115074158, + "step": 37570 + }, + { + "epoch": 0.16133879429518388, + "grad_norm": 1.5969558954238892, + "learning_rate": 8.422815898174417e-05, + "loss": 0.3447112560272217, + "step": 37580 + }, + { + "epoch": 0.16138172638520387, + "grad_norm": 0.9284400343894958, + "learning_rate": 8.422384726162655e-05, + "loss": 0.36273190975189207, + "step": 37590 + }, + { + "epoch": 0.1614246584752239, + "grad_norm": 0.7826418280601501, + "learning_rate": 8.421953554150893e-05, + "loss": 0.06588080525398254, + "step": 37600 + }, + { + "epoch": 0.1614675905652439, + "grad_norm": 1.0751705169677734, + "learning_rate": 8.42152238213913e-05, + "loss": 0.20182514190673828, + "step": 37610 + }, + { + "epoch": 0.16151052265526392, + "grad_norm": 1.7379590272903442, + "learning_rate": 8.421091210127368e-05, + "loss": 0.35259897708892823, + "step": 37620 + }, + { + "epoch": 0.1615534547452839, + "grad_norm": 1.1306309700012207, + "learning_rate": 8.420660038115606e-05, + "loss": 0.3186183452606201, + "step": 37630 + }, + { + "epoch": 0.16159638683530392, + "grad_norm": 0.6245974898338318, + "learning_rate": 8.420228866103844e-05, + "loss": 0.19649020433425904, + "step": 37640 + }, + { + "epoch": 0.16163931892532393, + "grad_norm": 1.8757081031799316, + "learning_rate": 8.419797694092081e-05, + "loss": 0.16638264656066895, + "step": 37650 + }, + { + "epoch": 0.16168225101534392, + "grad_norm": 1.2016990184783936, + "learning_rate": 8.419366522080319e-05, + "loss": 0.34046337604522703, + "step": 37660 + }, + { + "epoch": 0.16172518310536393, + "grad_norm": 18.265628814697266, + "learning_rate": 8.418935350068557e-05, + "loss": 0.13246519565582277, + "step": 37670 + }, + { + "epoch": 0.16176811519538395, + "grad_norm": 2.4141156673431396, + "learning_rate": 8.418504178056795e-05, + "loss": 0.24419615268707276, + "step": 37680 + }, + { + "epoch": 0.16181104728540394, + "grad_norm": 1.8383233547210693, + "learning_rate": 8.418073006045032e-05, + "loss": 0.23851385116577148, + "step": 37690 + }, + { + "epoch": 0.16185397937542395, + "grad_norm": 0.0010904427617788315, + "learning_rate": 8.41764183403327e-05, + "loss": 0.03627316951751709, + "step": 37700 + }, + { + "epoch": 0.16189691146544397, + "grad_norm": 1.3042843341827393, + "learning_rate": 8.417210662021508e-05, + "loss": 0.12120405435562134, + "step": 37710 + }, + { + "epoch": 0.16193984355546398, + "grad_norm": 0.03553265333175659, + "learning_rate": 8.416779490009745e-05, + "loss": 0.29834039211273194, + "step": 37720 + }, + { + "epoch": 0.16198277564548397, + "grad_norm": 0.03280557319521904, + "learning_rate": 8.416348317997983e-05, + "loss": 0.22968535423278807, + "step": 37730 + }, + { + "epoch": 0.16202570773550398, + "grad_norm": 0.3235560357570648, + "learning_rate": 8.41591714598622e-05, + "loss": 0.22959692478179933, + "step": 37740 + }, + { + "epoch": 0.162068639825524, + "grad_norm": 3.005444288253784, + "learning_rate": 8.415485973974457e-05, + "loss": 0.22586019039154054, + "step": 37750 + }, + { + "epoch": 0.16211157191554398, + "grad_norm": 0.9470056891441345, + "learning_rate": 8.415054801962695e-05, + "loss": 0.27291393280029297, + "step": 37760 + }, + { + "epoch": 0.162154504005564, + "grad_norm": 0.06368622928857803, + "learning_rate": 8.414623629950933e-05, + "loss": 0.3721860647201538, + "step": 37770 + }, + { + "epoch": 0.162197436095584, + "grad_norm": 0.3762516379356384, + "learning_rate": 8.41419245793917e-05, + "loss": 0.14585323333740235, + "step": 37780 + }, + { + "epoch": 0.162240368185604, + "grad_norm": 4.130314826965332, + "learning_rate": 8.413761285927408e-05, + "loss": 0.3997512340545654, + "step": 37790 + }, + { + "epoch": 0.16228330027562402, + "grad_norm": 0.42971429228782654, + "learning_rate": 8.413330113915646e-05, + "loss": 0.238565993309021, + "step": 37800 + }, + { + "epoch": 0.16232623236564403, + "grad_norm": 0.07225003093481064, + "learning_rate": 8.412898941903884e-05, + "loss": 0.3309208154678345, + "step": 37810 + }, + { + "epoch": 0.16236916445566402, + "grad_norm": 3.640429735183716, + "learning_rate": 8.412467769892121e-05, + "loss": 0.280135440826416, + "step": 37820 + }, + { + "epoch": 0.16241209654568403, + "grad_norm": 0.04398081824183464, + "learning_rate": 8.412036597880358e-05, + "loss": 0.35233240127563475, + "step": 37830 + }, + { + "epoch": 0.16245502863570405, + "grad_norm": 2.9799294471740723, + "learning_rate": 8.411605425868596e-05, + "loss": 0.5887829780578613, + "step": 37840 + }, + { + "epoch": 0.16249796072572406, + "grad_norm": 1.3944103717803955, + "learning_rate": 8.411174253856833e-05, + "loss": 0.37382643222808837, + "step": 37850 + }, + { + "epoch": 0.16254089281574405, + "grad_norm": 0.040403928607702255, + "learning_rate": 8.410743081845071e-05, + "loss": 0.11647080183029175, + "step": 37860 + }, + { + "epoch": 0.16258382490576406, + "grad_norm": 1.098812222480774, + "learning_rate": 8.410311909833309e-05, + "loss": 0.19743818044662476, + "step": 37870 + }, + { + "epoch": 0.16262675699578408, + "grad_norm": 3.2215824127197266, + "learning_rate": 8.409880737821547e-05, + "loss": 0.24659082889556885, + "step": 37880 + }, + { + "epoch": 0.16266968908580406, + "grad_norm": 0.6110808253288269, + "learning_rate": 8.409449565809784e-05, + "loss": 0.28290505409240724, + "step": 37890 + }, + { + "epoch": 0.16271262117582408, + "grad_norm": 0.11528348922729492, + "learning_rate": 8.409018393798023e-05, + "loss": 0.36647610664367675, + "step": 37900 + }, + { + "epoch": 0.1627555532658441, + "grad_norm": 2.565178155899048, + "learning_rate": 8.40858722178626e-05, + "loss": 0.29935436248779296, + "step": 37910 + }, + { + "epoch": 0.16279848535586408, + "grad_norm": 0.04275639355182648, + "learning_rate": 8.408156049774497e-05, + "loss": 0.2516483783721924, + "step": 37920 + }, + { + "epoch": 0.1628414174458841, + "grad_norm": 2.4341046810150146, + "learning_rate": 8.407724877762735e-05, + "loss": 0.39244976043701174, + "step": 37930 + }, + { + "epoch": 0.1628843495359041, + "grad_norm": 0.3288959562778473, + "learning_rate": 8.407293705750973e-05, + "loss": 0.1974055528640747, + "step": 37940 + }, + { + "epoch": 0.16292728162592413, + "grad_norm": 0.2797980010509491, + "learning_rate": 8.406862533739211e-05, + "loss": 0.2573603391647339, + "step": 37950 + }, + { + "epoch": 0.1629702137159441, + "grad_norm": 2.3312742710113525, + "learning_rate": 8.406431361727448e-05, + "loss": 0.4253364562988281, + "step": 37960 + }, + { + "epoch": 0.16301314580596413, + "grad_norm": 0.15798674523830414, + "learning_rate": 8.406000189715686e-05, + "loss": 0.2994741201400757, + "step": 37970 + }, + { + "epoch": 0.16305607789598414, + "grad_norm": 0.6870970726013184, + "learning_rate": 8.405569017703924e-05, + "loss": 0.3028193712234497, + "step": 37980 + }, + { + "epoch": 0.16309900998600413, + "grad_norm": 1.180202603340149, + "learning_rate": 8.40513784569216e-05, + "loss": 0.20432660579681397, + "step": 37990 + }, + { + "epoch": 0.16314194207602414, + "grad_norm": 0.05655556917190552, + "learning_rate": 8.404706673680398e-05, + "loss": 0.08881938457489014, + "step": 38000 + }, + { + "epoch": 0.16314194207602414, + "eval_loss": 0.4883147180080414, + "eval_runtime": 27.6249, + "eval_samples_per_second": 3.62, + "eval_steps_per_second": 3.62, + "step": 38000 + }, + { + "epoch": 0.16318487416604416, + "grad_norm": 1.2886759042739868, + "learning_rate": 8.404275501668636e-05, + "loss": 0.30479917526245115, + "step": 38010 + }, + { + "epoch": 0.16322780625606415, + "grad_norm": 1.9093838930130005, + "learning_rate": 8.403844329656873e-05, + "loss": 0.33849563598632815, + "step": 38020 + }, + { + "epoch": 0.16327073834608416, + "grad_norm": 0.09240783751010895, + "learning_rate": 8.403413157645111e-05, + "loss": 0.3744537353515625, + "step": 38030 + }, + { + "epoch": 0.16331367043610417, + "grad_norm": 0.06679513305425644, + "learning_rate": 8.402981985633349e-05, + "loss": 0.24076201915740966, + "step": 38040 + }, + { + "epoch": 0.1633566025261242, + "grad_norm": 1.3797893524169922, + "learning_rate": 8.402550813621587e-05, + "loss": 0.30988612174987795, + "step": 38050 + }, + { + "epoch": 0.16339953461614418, + "grad_norm": 3.352512836456299, + "learning_rate": 8.402119641609824e-05, + "loss": 0.3587747097015381, + "step": 38060 + }, + { + "epoch": 0.1634424667061642, + "grad_norm": 17.774965286254883, + "learning_rate": 8.401688469598061e-05, + "loss": 0.402712345123291, + "step": 38070 + }, + { + "epoch": 0.1634853987961842, + "grad_norm": 3.710422992706299, + "learning_rate": 8.401257297586299e-05, + "loss": 0.2656731128692627, + "step": 38080 + }, + { + "epoch": 0.1635283308862042, + "grad_norm": 0.02095325104892254, + "learning_rate": 8.400826125574536e-05, + "loss": 0.04868249893188477, + "step": 38090 + }, + { + "epoch": 0.1635712629762242, + "grad_norm": 0.013746113516390324, + "learning_rate": 8.400394953562774e-05, + "loss": 0.2931448221206665, + "step": 38100 + }, + { + "epoch": 0.16361419506624422, + "grad_norm": 0.6826333999633789, + "learning_rate": 8.399963781551012e-05, + "loss": 0.015343394875526429, + "step": 38110 + }, + { + "epoch": 0.1636571271562642, + "grad_norm": 1.5140215158462524, + "learning_rate": 8.399532609539251e-05, + "loss": 0.372647762298584, + "step": 38120 + }, + { + "epoch": 0.16370005924628422, + "grad_norm": 0.013481021858751774, + "learning_rate": 8.399101437527489e-05, + "loss": 0.20894341468811034, + "step": 38130 + }, + { + "epoch": 0.16374299133630424, + "grad_norm": 1.5430631637573242, + "learning_rate": 8.398670265515726e-05, + "loss": 0.11156511306762695, + "step": 38140 + }, + { + "epoch": 0.16378592342632425, + "grad_norm": 0.005721495021134615, + "learning_rate": 8.398239093503963e-05, + "loss": 0.32973394393920896, + "step": 38150 + }, + { + "epoch": 0.16382885551634424, + "grad_norm": 0.028001677244901657, + "learning_rate": 8.3978079214922e-05, + "loss": 0.2010807991027832, + "step": 38160 + }, + { + "epoch": 0.16387178760636426, + "grad_norm": 2.5055196285247803, + "learning_rate": 8.397376749480438e-05, + "loss": 0.3040961742401123, + "step": 38170 + }, + { + "epoch": 0.16391471969638427, + "grad_norm": 0.2771667540073395, + "learning_rate": 8.396945577468676e-05, + "loss": 0.47368788719177246, + "step": 38180 + }, + { + "epoch": 0.16395765178640426, + "grad_norm": 0.07175099849700928, + "learning_rate": 8.396514405456914e-05, + "loss": 0.27983407974243163, + "step": 38190 + }, + { + "epoch": 0.16400058387642427, + "grad_norm": 4.176136493682861, + "learning_rate": 8.396083233445151e-05, + "loss": 0.2169114828109741, + "step": 38200 + }, + { + "epoch": 0.1640435159664443, + "grad_norm": 3.844069242477417, + "learning_rate": 8.395652061433389e-05, + "loss": 0.21165781021118163, + "step": 38210 + }, + { + "epoch": 0.16408644805646427, + "grad_norm": 46.019081115722656, + "learning_rate": 8.395220889421627e-05, + "loss": 0.25102906227111815, + "step": 38220 + }, + { + "epoch": 0.1641293801464843, + "grad_norm": 0.062254827469587326, + "learning_rate": 8.394789717409865e-05, + "loss": 0.20149946212768555, + "step": 38230 + }, + { + "epoch": 0.1641723122365043, + "grad_norm": 2.3644673824310303, + "learning_rate": 8.394358545398101e-05, + "loss": 0.20966775417327882, + "step": 38240 + }, + { + "epoch": 0.1642152443265243, + "grad_norm": 0.4895040988922119, + "learning_rate": 8.393927373386339e-05, + "loss": 0.4971171855926514, + "step": 38250 + }, + { + "epoch": 0.1642581764165443, + "grad_norm": 0.3563346862792969, + "learning_rate": 8.393496201374576e-05, + "loss": 0.15563546419143676, + "step": 38260 + }, + { + "epoch": 0.16430110850656432, + "grad_norm": 0.06369837373495102, + "learning_rate": 8.393065029362814e-05, + "loss": 0.39025804996490476, + "step": 38270 + }, + { + "epoch": 0.16434404059658433, + "grad_norm": 0.1078178659081459, + "learning_rate": 8.392633857351052e-05, + "loss": 0.2438561201095581, + "step": 38280 + }, + { + "epoch": 0.16438697268660432, + "grad_norm": 0.012385016307234764, + "learning_rate": 8.39220268533929e-05, + "loss": 0.15948692560195923, + "step": 38290 + }, + { + "epoch": 0.16442990477662434, + "grad_norm": 1.3638696670532227, + "learning_rate": 8.391771513327527e-05, + "loss": 0.13995628356933593, + "step": 38300 + }, + { + "epoch": 0.16447283686664435, + "grad_norm": 0.22712171077728271, + "learning_rate": 8.391340341315765e-05, + "loss": 0.18561434745788574, + "step": 38310 + }, + { + "epoch": 0.16451576895666434, + "grad_norm": 0.051490023732185364, + "learning_rate": 8.390909169304001e-05, + "loss": 0.2995032548904419, + "step": 38320 + }, + { + "epoch": 0.16455870104668435, + "grad_norm": 0.010072890669107437, + "learning_rate": 8.390477997292239e-05, + "loss": 0.29950876235961915, + "step": 38330 + }, + { + "epoch": 0.16460163313670437, + "grad_norm": 0.013207241892814636, + "learning_rate": 8.390046825280478e-05, + "loss": 0.3185295820236206, + "step": 38340 + }, + { + "epoch": 0.16464456522672435, + "grad_norm": 0.010572721250355244, + "learning_rate": 8.389615653268716e-05, + "loss": 0.16593341827392577, + "step": 38350 + }, + { + "epoch": 0.16468749731674437, + "grad_norm": 2.061591386795044, + "learning_rate": 8.389184481256954e-05, + "loss": 0.02529054880142212, + "step": 38360 + }, + { + "epoch": 0.16473042940676438, + "grad_norm": 0.02285430207848549, + "learning_rate": 8.388753309245191e-05, + "loss": 0.2505360126495361, + "step": 38370 + }, + { + "epoch": 0.1647733614967844, + "grad_norm": 4.048598766326904, + "learning_rate": 8.388322137233429e-05, + "loss": 0.3513182640075684, + "step": 38380 + }, + { + "epoch": 0.16481629358680439, + "grad_norm": 0.014832216314971447, + "learning_rate": 8.387890965221667e-05, + "loss": 0.04674802124500275, + "step": 38390 + }, + { + "epoch": 0.1648592256768244, + "grad_norm": 9.298293113708496, + "learning_rate": 8.387459793209903e-05, + "loss": 0.16453347206115723, + "step": 38400 + }, + { + "epoch": 0.16490215776684441, + "grad_norm": 0.04931933432817459, + "learning_rate": 8.387028621198141e-05, + "loss": 0.3382518768310547, + "step": 38410 + }, + { + "epoch": 0.1649450898568644, + "grad_norm": 0.01446615345776081, + "learning_rate": 8.386597449186379e-05, + "loss": 0.2328251838684082, + "step": 38420 + }, + { + "epoch": 0.16498802194688442, + "grad_norm": 0.0058226981200277805, + "learning_rate": 8.386166277174617e-05, + "loss": 0.24504258632659912, + "step": 38430 + }, + { + "epoch": 0.16503095403690443, + "grad_norm": 0.8236973881721497, + "learning_rate": 8.385735105162854e-05, + "loss": 0.35173616409301756, + "step": 38440 + }, + { + "epoch": 0.16507388612692442, + "grad_norm": 0.08230235427618027, + "learning_rate": 8.385303933151092e-05, + "loss": 0.12847490310668946, + "step": 38450 + }, + { + "epoch": 0.16511681821694443, + "grad_norm": 0.006980022415518761, + "learning_rate": 8.38487276113933e-05, + "loss": 0.3361195087432861, + "step": 38460 + }, + { + "epoch": 0.16515975030696445, + "grad_norm": 4.36317253112793, + "learning_rate": 8.384441589127567e-05, + "loss": 0.44304499626159666, + "step": 38470 + }, + { + "epoch": 0.16520268239698446, + "grad_norm": 0.03942442685365677, + "learning_rate": 8.384010417115804e-05, + "loss": 0.23771591186523439, + "step": 38480 + }, + { + "epoch": 0.16524561448700445, + "grad_norm": 8.045804977416992, + "learning_rate": 8.383579245104042e-05, + "loss": 0.24896998405456544, + "step": 38490 + }, + { + "epoch": 0.16528854657702446, + "grad_norm": 0.01741672493517399, + "learning_rate": 8.383148073092279e-05, + "loss": 0.038726434111595154, + "step": 38500 + }, + { + "epoch": 0.16533147866704448, + "grad_norm": 20.042821884155273, + "learning_rate": 8.382716901080517e-05, + "loss": 0.28770525455474855, + "step": 38510 + }, + { + "epoch": 0.16537441075706447, + "grad_norm": 0.5119468569755554, + "learning_rate": 8.382285729068755e-05, + "loss": 0.2299135446548462, + "step": 38520 + }, + { + "epoch": 0.16541734284708448, + "grad_norm": 0.4313623011112213, + "learning_rate": 8.381854557056992e-05, + "loss": 0.16424697637557983, + "step": 38530 + }, + { + "epoch": 0.1654602749371045, + "grad_norm": 2.6440560817718506, + "learning_rate": 8.38142338504523e-05, + "loss": 0.3694345712661743, + "step": 38540 + }, + { + "epoch": 0.16550320702712448, + "grad_norm": 0.9811276793479919, + "learning_rate": 8.380992213033468e-05, + "loss": 0.300356388092041, + "step": 38550 + }, + { + "epoch": 0.1655461391171445, + "grad_norm": 0.2382563352584839, + "learning_rate": 8.380561041021706e-05, + "loss": 0.44238839149475095, + "step": 38560 + }, + { + "epoch": 0.1655890712071645, + "grad_norm": 0.06049331650137901, + "learning_rate": 8.380129869009943e-05, + "loss": 0.13134862184524537, + "step": 38570 + }, + { + "epoch": 0.16563200329718453, + "grad_norm": 0.07067941129207611, + "learning_rate": 8.379698696998181e-05, + "loss": 0.26463468074798585, + "step": 38580 + }, + { + "epoch": 0.1656749353872045, + "grad_norm": 0.07888341695070267, + "learning_rate": 8.379267524986419e-05, + "loss": 0.17368587255477905, + "step": 38590 + }, + { + "epoch": 0.16571786747722453, + "grad_norm": 0.030739109963178635, + "learning_rate": 8.378836352974657e-05, + "loss": 0.2229764461517334, + "step": 38600 + }, + { + "epoch": 0.16576079956724454, + "grad_norm": 3.1724841594696045, + "learning_rate": 8.378405180962894e-05, + "loss": 0.4950582504272461, + "step": 38610 + }, + { + "epoch": 0.16580373165726453, + "grad_norm": 0.7055134773254395, + "learning_rate": 8.377974008951132e-05, + "loss": 0.32541618347167967, + "step": 38620 + }, + { + "epoch": 0.16584666374728455, + "grad_norm": 6.156464576721191, + "learning_rate": 8.37754283693937e-05, + "loss": 0.2963061809539795, + "step": 38630 + }, + { + "epoch": 0.16588959583730456, + "grad_norm": 0.15203474462032318, + "learning_rate": 8.377111664927608e-05, + "loss": 0.12441049814224243, + "step": 38640 + }, + { + "epoch": 0.16593252792732455, + "grad_norm": 1.184449553489685, + "learning_rate": 8.376680492915844e-05, + "loss": 0.23802320957183837, + "step": 38650 + }, + { + "epoch": 0.16597546001734456, + "grad_norm": 0.4919390380382538, + "learning_rate": 8.376249320904082e-05, + "loss": 0.18030476570129395, + "step": 38660 + }, + { + "epoch": 0.16601839210736458, + "grad_norm": 5.8653669357299805, + "learning_rate": 8.37581814889232e-05, + "loss": 0.0840020477771759, + "step": 38670 + }, + { + "epoch": 0.16606132419738456, + "grad_norm": 0.10958441346883774, + "learning_rate": 8.375386976880557e-05, + "loss": 0.1868067502975464, + "step": 38680 + }, + { + "epoch": 0.16610425628740458, + "grad_norm": 0.012140369974076748, + "learning_rate": 8.374955804868795e-05, + "loss": 0.19965635538101195, + "step": 38690 + }, + { + "epoch": 0.1661471883774246, + "grad_norm": 5.1828932762146, + "learning_rate": 8.374524632857033e-05, + "loss": 0.24122245311737062, + "step": 38700 + }, + { + "epoch": 0.1661901204674446, + "grad_norm": 0.11190202832221985, + "learning_rate": 8.37409346084527e-05, + "loss": 0.2018735408782959, + "step": 38710 + }, + { + "epoch": 0.1662330525574646, + "grad_norm": 0.09691668301820755, + "learning_rate": 8.373662288833508e-05, + "loss": 0.17452099323272705, + "step": 38720 + }, + { + "epoch": 0.1662759846474846, + "grad_norm": 1.8215162754058838, + "learning_rate": 8.373231116821744e-05, + "loss": 0.37970480918884275, + "step": 38730 + }, + { + "epoch": 0.16631891673750462, + "grad_norm": 0.8948038220405579, + "learning_rate": 8.372799944809982e-05, + "loss": 0.3769418001174927, + "step": 38740 + }, + { + "epoch": 0.1663618488275246, + "grad_norm": 2.60607647895813, + "learning_rate": 8.37236877279822e-05, + "loss": 0.42996745109558104, + "step": 38750 + }, + { + "epoch": 0.16640478091754463, + "grad_norm": 0.06351848691701889, + "learning_rate": 8.371937600786458e-05, + "loss": 0.2707476854324341, + "step": 38760 + }, + { + "epoch": 0.16644771300756464, + "grad_norm": 0.2907825708389282, + "learning_rate": 8.371506428774695e-05, + "loss": 0.22936019897460938, + "step": 38770 + }, + { + "epoch": 0.16649064509758463, + "grad_norm": 0.14782923460006714, + "learning_rate": 8.371075256762933e-05, + "loss": 0.3442514657974243, + "step": 38780 + }, + { + "epoch": 0.16653357718760464, + "grad_norm": 0.015982678160071373, + "learning_rate": 8.370644084751171e-05, + "loss": 0.2651229381561279, + "step": 38790 + }, + { + "epoch": 0.16657650927762466, + "grad_norm": 1.0819584131240845, + "learning_rate": 8.370212912739409e-05, + "loss": 0.5299183845520019, + "step": 38800 + }, + { + "epoch": 0.16661944136764467, + "grad_norm": 0.06313298642635345, + "learning_rate": 8.369781740727646e-05, + "loss": 0.042073649168014524, + "step": 38810 + }, + { + "epoch": 0.16666237345766466, + "grad_norm": 1.9854984283447266, + "learning_rate": 8.369350568715884e-05, + "loss": 0.15298646688461304, + "step": 38820 + }, + { + "epoch": 0.16670530554768467, + "grad_norm": 0.0021584900096058846, + "learning_rate": 8.368919396704122e-05, + "loss": 0.19425376653671264, + "step": 38830 + }, + { + "epoch": 0.1667482376377047, + "grad_norm": 0.0018256878247484565, + "learning_rate": 8.36848822469236e-05, + "loss": 0.08114975094795226, + "step": 38840 + }, + { + "epoch": 0.16679116972772468, + "grad_norm": 1.6823513507843018, + "learning_rate": 8.368057052680597e-05, + "loss": 0.3098435401916504, + "step": 38850 + }, + { + "epoch": 0.1668341018177447, + "grad_norm": 1.3453181982040405, + "learning_rate": 8.367625880668835e-05, + "loss": 0.3250315189361572, + "step": 38860 + }, + { + "epoch": 0.1668770339077647, + "grad_norm": 0.5118316411972046, + "learning_rate": 8.367194708657073e-05, + "loss": 0.22069883346557617, + "step": 38870 + }, + { + "epoch": 0.1669199659977847, + "grad_norm": 0.005584300495684147, + "learning_rate": 8.36676353664531e-05, + "loss": 0.4313997268676758, + "step": 38880 + }, + { + "epoch": 0.1669628980878047, + "grad_norm": 0.02512713335454464, + "learning_rate": 8.366332364633548e-05, + "loss": 0.22731406688690187, + "step": 38890 + }, + { + "epoch": 0.16700583017782472, + "grad_norm": 0.18230260908603668, + "learning_rate": 8.365901192621785e-05, + "loss": 0.19736795425415038, + "step": 38900 + }, + { + "epoch": 0.16704876226784474, + "grad_norm": 0.20787113904953003, + "learning_rate": 8.365470020610022e-05, + "loss": 0.11477090120315551, + "step": 38910 + }, + { + "epoch": 0.16709169435786472, + "grad_norm": 0.20564806461334229, + "learning_rate": 8.36503884859826e-05, + "loss": 0.20185043811798095, + "step": 38920 + }, + { + "epoch": 0.16713462644788474, + "grad_norm": 1.9511644840240479, + "learning_rate": 8.364607676586498e-05, + "loss": 0.27272036075592043, + "step": 38930 + }, + { + "epoch": 0.16717755853790475, + "grad_norm": 0.10560095310211182, + "learning_rate": 8.364176504574736e-05, + "loss": 0.11860344409942628, + "step": 38940 + }, + { + "epoch": 0.16722049062792474, + "grad_norm": 2.4232141971588135, + "learning_rate": 8.363745332562973e-05, + "loss": 0.3961954116821289, + "step": 38950 + }, + { + "epoch": 0.16726342271794475, + "grad_norm": 1.6023025512695312, + "learning_rate": 8.363314160551211e-05, + "loss": 0.1828877091407776, + "step": 38960 + }, + { + "epoch": 0.16730635480796477, + "grad_norm": 0.0037863650359213352, + "learning_rate": 8.362882988539449e-05, + "loss": 0.2468169927597046, + "step": 38970 + }, + { + "epoch": 0.16734928689798476, + "grad_norm": 0.004623674787580967, + "learning_rate": 8.362451816527685e-05, + "loss": 0.2642094135284424, + "step": 38980 + }, + { + "epoch": 0.16739221898800477, + "grad_norm": 0.8750384449958801, + "learning_rate": 8.362020644515923e-05, + "loss": 0.4149921894073486, + "step": 38990 + }, + { + "epoch": 0.16743515107802479, + "grad_norm": 6.293197154998779, + "learning_rate": 8.36158947250416e-05, + "loss": 0.1499064326286316, + "step": 39000 + }, + { + "epoch": 0.16743515107802479, + "eval_loss": 0.47232383489608765, + "eval_runtime": 27.4652, + "eval_samples_per_second": 3.641, + "eval_steps_per_second": 3.641, + "step": 39000 + }, + { + "epoch": 0.1674780831680448, + "grad_norm": 0.1681136190891266, + "learning_rate": 8.361158300492398e-05, + "loss": 0.2481478214263916, + "step": 39010 + }, + { + "epoch": 0.1675210152580648, + "grad_norm": 0.026456119492650032, + "learning_rate": 8.360727128480636e-05, + "loss": 0.08642295002937317, + "step": 39020 + }, + { + "epoch": 0.1675639473480848, + "grad_norm": 5.313982963562012, + "learning_rate": 8.360295956468874e-05, + "loss": 0.25217013359069823, + "step": 39030 + }, + { + "epoch": 0.16760687943810482, + "grad_norm": 1.552626609802246, + "learning_rate": 8.359864784457112e-05, + "loss": 0.4480263710021973, + "step": 39040 + }, + { + "epoch": 0.1676498115281248, + "grad_norm": 5.05755615234375, + "learning_rate": 8.359433612445349e-05, + "loss": 0.43097610473632814, + "step": 39050 + }, + { + "epoch": 0.16769274361814482, + "grad_norm": 0.3707757890224457, + "learning_rate": 8.359002440433587e-05, + "loss": 0.20354764461517333, + "step": 39060 + }, + { + "epoch": 0.16773567570816483, + "grad_norm": 11.889803886413574, + "learning_rate": 8.358571268421825e-05, + "loss": 0.3243248701095581, + "step": 39070 + }, + { + "epoch": 0.16777860779818482, + "grad_norm": 0.4324072599411011, + "learning_rate": 8.358140096410062e-05, + "loss": 0.1808680295944214, + "step": 39080 + }, + { + "epoch": 0.16782153988820483, + "grad_norm": 0.42824888229370117, + "learning_rate": 8.3577089243983e-05, + "loss": 0.14043327569961547, + "step": 39090 + }, + { + "epoch": 0.16786447197822485, + "grad_norm": 0.08431851863861084, + "learning_rate": 8.357277752386538e-05, + "loss": 0.3430315017700195, + "step": 39100 + }, + { + "epoch": 0.16790740406824484, + "grad_norm": 0.020337456837296486, + "learning_rate": 8.356846580374776e-05, + "loss": 0.1652182936668396, + "step": 39110 + }, + { + "epoch": 0.16795033615826485, + "grad_norm": 14.583819389343262, + "learning_rate": 8.356415408363013e-05, + "loss": 0.11169205904006958, + "step": 39120 + }, + { + "epoch": 0.16799326824828487, + "grad_norm": 1.863498330116272, + "learning_rate": 8.355984236351251e-05, + "loss": 0.10965640544891357, + "step": 39130 + }, + { + "epoch": 0.16803620033830488, + "grad_norm": 0.011555775068700314, + "learning_rate": 8.355553064339488e-05, + "loss": 0.4207490921020508, + "step": 39140 + }, + { + "epoch": 0.16807913242832487, + "grad_norm": 0.7759541869163513, + "learning_rate": 8.355121892327725e-05, + "loss": 0.3634244441986084, + "step": 39150 + }, + { + "epoch": 0.16812206451834488, + "grad_norm": 0.04319796711206436, + "learning_rate": 8.354690720315963e-05, + "loss": 0.22236130237579346, + "step": 39160 + }, + { + "epoch": 0.1681649966083649, + "grad_norm": 0.022113563492894173, + "learning_rate": 8.354259548304201e-05, + "loss": 0.22561120986938477, + "step": 39170 + }, + { + "epoch": 0.16820792869838488, + "grad_norm": 0.5064992308616638, + "learning_rate": 8.353828376292438e-05, + "loss": 0.2085291862487793, + "step": 39180 + }, + { + "epoch": 0.1682508607884049, + "grad_norm": 4.303263187408447, + "learning_rate": 8.353397204280676e-05, + "loss": 0.1187258243560791, + "step": 39190 + }, + { + "epoch": 0.1682937928784249, + "grad_norm": 0.09465790539979935, + "learning_rate": 8.352966032268914e-05, + "loss": 0.20660836696624757, + "step": 39200 + }, + { + "epoch": 0.1683367249684449, + "grad_norm": 0.19095900654792786, + "learning_rate": 8.352534860257152e-05, + "loss": 0.2394178867340088, + "step": 39210 + }, + { + "epoch": 0.16837965705846492, + "grad_norm": 0.12276028841733932, + "learning_rate": 8.352103688245388e-05, + "loss": 0.35126869678497313, + "step": 39220 + }, + { + "epoch": 0.16842258914848493, + "grad_norm": 5.452217102050781, + "learning_rate": 8.351672516233626e-05, + "loss": 0.41213340759277345, + "step": 39230 + }, + { + "epoch": 0.16846552123850494, + "grad_norm": 0.2814752459526062, + "learning_rate": 8.351241344221863e-05, + "loss": 0.1482291579246521, + "step": 39240 + }, + { + "epoch": 0.16850845332852493, + "grad_norm": 0.04191487655043602, + "learning_rate": 8.350810172210101e-05, + "loss": 0.24670803546905518, + "step": 39250 + }, + { + "epoch": 0.16855138541854495, + "grad_norm": 0.04874122142791748, + "learning_rate": 8.350379000198339e-05, + "loss": 0.05260342955589294, + "step": 39260 + }, + { + "epoch": 0.16859431750856496, + "grad_norm": 0.040154241025447845, + "learning_rate": 8.349947828186577e-05, + "loss": 0.22768948078155518, + "step": 39270 + }, + { + "epoch": 0.16863724959858495, + "grad_norm": 0.3587961792945862, + "learning_rate": 8.349516656174814e-05, + "loss": 0.17796025276184083, + "step": 39280 + }, + { + "epoch": 0.16868018168860496, + "grad_norm": 0.0058316877111792564, + "learning_rate": 8.349085484163052e-05, + "loss": 0.3597349405288696, + "step": 39290 + }, + { + "epoch": 0.16872311377862498, + "grad_norm": 0.5243772864341736, + "learning_rate": 8.34865431215129e-05, + "loss": 0.2949937582015991, + "step": 39300 + }, + { + "epoch": 0.16876604586864496, + "grad_norm": 1.1572577953338623, + "learning_rate": 8.348223140139528e-05, + "loss": 0.28983776569366454, + "step": 39310 + }, + { + "epoch": 0.16880897795866498, + "grad_norm": 5.315773010253906, + "learning_rate": 8.347791968127765e-05, + "loss": 0.24558615684509277, + "step": 39320 + }, + { + "epoch": 0.168851910048685, + "grad_norm": 3.1546738147735596, + "learning_rate": 8.347360796116003e-05, + "loss": 0.427794361114502, + "step": 39330 + }, + { + "epoch": 0.168894842138705, + "grad_norm": 14.885223388671875, + "learning_rate": 8.346929624104241e-05, + "loss": 0.04551592469215393, + "step": 39340 + }, + { + "epoch": 0.168937774228725, + "grad_norm": 0.022690167650580406, + "learning_rate": 8.346498452092479e-05, + "loss": 0.21496713161468506, + "step": 39350 + }, + { + "epoch": 0.168980706318745, + "grad_norm": 0.11703800410032272, + "learning_rate": 8.346067280080716e-05, + "loss": 0.1862417697906494, + "step": 39360 + }, + { + "epoch": 0.16902363840876503, + "grad_norm": 1.971624493598938, + "learning_rate": 8.345636108068954e-05, + "loss": 0.26860053539276124, + "step": 39370 + }, + { + "epoch": 0.169066570498785, + "grad_norm": 1.7773724794387817, + "learning_rate": 8.345204936057192e-05, + "loss": 0.5067049026489258, + "step": 39380 + }, + { + "epoch": 0.16910950258880503, + "grad_norm": 0.8315603733062744, + "learning_rate": 8.344773764045428e-05, + "loss": 0.27385969161987306, + "step": 39390 + }, + { + "epoch": 0.16915243467882504, + "grad_norm": 6.316635608673096, + "learning_rate": 8.344342592033666e-05, + "loss": 0.16933102607727052, + "step": 39400 + }, + { + "epoch": 0.16919536676884503, + "grad_norm": 0.08984553068876266, + "learning_rate": 8.343911420021904e-05, + "loss": 0.2484668970108032, + "step": 39410 + }, + { + "epoch": 0.16923829885886504, + "grad_norm": 3.962224245071411, + "learning_rate": 8.343480248010141e-05, + "loss": 0.3198500633239746, + "step": 39420 + }, + { + "epoch": 0.16928123094888506, + "grad_norm": 1.6144697666168213, + "learning_rate": 8.343049075998379e-05, + "loss": 0.1846301794052124, + "step": 39430 + }, + { + "epoch": 0.16932416303890507, + "grad_norm": 0.06883388757705688, + "learning_rate": 8.342617903986617e-05, + "loss": 0.31305603981018065, + "step": 39440 + }, + { + "epoch": 0.16936709512892506, + "grad_norm": 0.22652801871299744, + "learning_rate": 8.342186731974855e-05, + "loss": 0.12272937297821045, + "step": 39450 + }, + { + "epoch": 0.16941002721894508, + "grad_norm": 0.03398356959223747, + "learning_rate": 8.341755559963092e-05, + "loss": 0.2238314390182495, + "step": 39460 + }, + { + "epoch": 0.1694529593089651, + "grad_norm": 0.009827976115047932, + "learning_rate": 8.341324387951329e-05, + "loss": 0.16203200817108154, + "step": 39470 + }, + { + "epoch": 0.16949589139898508, + "grad_norm": 2.6871118545532227, + "learning_rate": 8.340893215939566e-05, + "loss": 0.41886377334594727, + "step": 39480 + }, + { + "epoch": 0.1695388234890051, + "grad_norm": 2.0361788272857666, + "learning_rate": 8.340462043927804e-05, + "loss": 0.2878458023071289, + "step": 39490 + }, + { + "epoch": 0.1695817555790251, + "grad_norm": 0.22583113610744476, + "learning_rate": 8.340030871916042e-05, + "loss": 0.33467190265655516, + "step": 39500 + }, + { + "epoch": 0.1696246876690451, + "grad_norm": 2.315829277038574, + "learning_rate": 8.33959969990428e-05, + "loss": 0.20003714561462402, + "step": 39510 + }, + { + "epoch": 0.1696676197590651, + "grad_norm": 1.986151099205017, + "learning_rate": 8.339168527892517e-05, + "loss": 0.1555501937866211, + "step": 39520 + }, + { + "epoch": 0.16971055184908512, + "grad_norm": 0.007451371289789677, + "learning_rate": 8.338737355880756e-05, + "loss": 0.07800998091697693, + "step": 39530 + }, + { + "epoch": 0.1697534839391051, + "grad_norm": 21.262826919555664, + "learning_rate": 8.338306183868994e-05, + "loss": 0.16026850938796997, + "step": 39540 + }, + { + "epoch": 0.16979641602912512, + "grad_norm": 0.18673354387283325, + "learning_rate": 8.33787501185723e-05, + "loss": 0.19166061878204346, + "step": 39550 + }, + { + "epoch": 0.16983934811914514, + "grad_norm": 0.9280945658683777, + "learning_rate": 8.337443839845468e-05, + "loss": 0.10031145811080933, + "step": 39560 + }, + { + "epoch": 0.16988228020916515, + "grad_norm": 0.27290984988212585, + "learning_rate": 8.337012667833706e-05, + "loss": 0.3064888000488281, + "step": 39570 + }, + { + "epoch": 0.16992521229918514, + "grad_norm": 3.447659730911255, + "learning_rate": 8.336581495821944e-05, + "loss": 0.21117255687713624, + "step": 39580 + }, + { + "epoch": 0.16996814438920516, + "grad_norm": 0.550126314163208, + "learning_rate": 8.336150323810181e-05, + "loss": 0.26300458908081054, + "step": 39590 + }, + { + "epoch": 0.17001107647922517, + "grad_norm": 0.05337730422616005, + "learning_rate": 8.335719151798419e-05, + "loss": 0.14356883764266967, + "step": 39600 + }, + { + "epoch": 0.17005400856924516, + "grad_norm": 2.290454626083374, + "learning_rate": 8.335287979786657e-05, + "loss": 0.34690542221069337, + "step": 39610 + }, + { + "epoch": 0.17009694065926517, + "grad_norm": 0.43621301651000977, + "learning_rate": 8.334856807774895e-05, + "loss": 0.31757278442382814, + "step": 39620 + }, + { + "epoch": 0.1701398727492852, + "grad_norm": 0.0348827950656414, + "learning_rate": 8.334425635763132e-05, + "loss": 0.4615351676940918, + "step": 39630 + }, + { + "epoch": 0.17018280483930517, + "grad_norm": 1.76190984249115, + "learning_rate": 8.333994463751369e-05, + "loss": 0.31025004386901855, + "step": 39640 + }, + { + "epoch": 0.1702257369293252, + "grad_norm": 0.04441544786095619, + "learning_rate": 8.333563291739607e-05, + "loss": 0.1971571445465088, + "step": 39650 + }, + { + "epoch": 0.1702686690193452, + "grad_norm": 0.07046888023614883, + "learning_rate": 8.333132119727844e-05, + "loss": 0.27866973876953127, + "step": 39660 + }, + { + "epoch": 0.17031160110936522, + "grad_norm": 0.023150363937020302, + "learning_rate": 8.332700947716082e-05, + "loss": 0.14343875646591187, + "step": 39670 + }, + { + "epoch": 0.1703545331993852, + "grad_norm": 2.077636480331421, + "learning_rate": 8.33226977570432e-05, + "loss": 0.2821050643920898, + "step": 39680 + }, + { + "epoch": 0.17039746528940522, + "grad_norm": 1.0875072479248047, + "learning_rate": 8.331838603692557e-05, + "loss": 0.3242486953735352, + "step": 39690 + }, + { + "epoch": 0.17044039737942523, + "grad_norm": 2.452104330062866, + "learning_rate": 8.331407431680795e-05, + "loss": 0.12137858867645264, + "step": 39700 + }, + { + "epoch": 0.17048332946944522, + "grad_norm": 0.6530572772026062, + "learning_rate": 8.330976259669033e-05, + "loss": 0.13224581480026246, + "step": 39710 + }, + { + "epoch": 0.17052626155946524, + "grad_norm": 0.2018723040819168, + "learning_rate": 8.330545087657269e-05, + "loss": 0.16498252153396606, + "step": 39720 + }, + { + "epoch": 0.17056919364948525, + "grad_norm": 5.927464962005615, + "learning_rate": 8.330113915645507e-05, + "loss": 0.37170138359069826, + "step": 39730 + }, + { + "epoch": 0.17061212573950524, + "grad_norm": 0.03498421981930733, + "learning_rate": 8.329682743633745e-05, + "loss": 0.28318626880645753, + "step": 39740 + }, + { + "epoch": 0.17065505782952525, + "grad_norm": 0.20146240293979645, + "learning_rate": 8.329251571621984e-05, + "loss": 0.3529873609542847, + "step": 39750 + }, + { + "epoch": 0.17069798991954527, + "grad_norm": 0.021076412871479988, + "learning_rate": 8.328820399610222e-05, + "loss": 0.20006706714630126, + "step": 39760 + }, + { + "epoch": 0.17074092200956528, + "grad_norm": 1.7591829299926758, + "learning_rate": 8.32838922759846e-05, + "loss": 0.2442645788192749, + "step": 39770 + }, + { + "epoch": 0.17078385409958527, + "grad_norm": 0.038296714425086975, + "learning_rate": 8.327958055586697e-05, + "loss": 0.06047918200492859, + "step": 39780 + }, + { + "epoch": 0.17082678618960528, + "grad_norm": 2.6467275619506836, + "learning_rate": 8.327526883574935e-05, + "loss": 0.2018986940383911, + "step": 39790 + }, + { + "epoch": 0.1708697182796253, + "grad_norm": 0.1143501028418541, + "learning_rate": 8.327095711563171e-05, + "loss": 0.18430989980697632, + "step": 39800 + }, + { + "epoch": 0.17091265036964529, + "grad_norm": 0.04112870246171951, + "learning_rate": 8.326664539551409e-05, + "loss": 0.448679256439209, + "step": 39810 + }, + { + "epoch": 0.1709555824596653, + "grad_norm": 1.7692079544067383, + "learning_rate": 8.326233367539647e-05, + "loss": 0.40442686080932616, + "step": 39820 + }, + { + "epoch": 0.17099851454968532, + "grad_norm": 33.07231140136719, + "learning_rate": 8.325802195527884e-05, + "loss": 0.10954853296279907, + "step": 39830 + }, + { + "epoch": 0.1710414466397053, + "grad_norm": 0.02489142306149006, + "learning_rate": 8.325371023516122e-05, + "loss": 0.1942327857017517, + "step": 39840 + }, + { + "epoch": 0.17108437872972532, + "grad_norm": 0.01788198947906494, + "learning_rate": 8.32493985150436e-05, + "loss": 0.22420060634613037, + "step": 39850 + }, + { + "epoch": 0.17112731081974533, + "grad_norm": 0.19262222945690155, + "learning_rate": 8.324508679492598e-05, + "loss": 0.18279441595077514, + "step": 39860 + }, + { + "epoch": 0.17117024290976535, + "grad_norm": 0.37432318925857544, + "learning_rate": 8.324077507480835e-05, + "loss": 0.0887138843536377, + "step": 39870 + }, + { + "epoch": 0.17121317499978533, + "grad_norm": 0.007091057952493429, + "learning_rate": 8.323646335469072e-05, + "loss": 0.3868217945098877, + "step": 39880 + }, + { + "epoch": 0.17125610708980535, + "grad_norm": 2.7712514400482178, + "learning_rate": 8.32321516345731e-05, + "loss": 0.34295656681060793, + "step": 39890 + }, + { + "epoch": 0.17129903917982536, + "grad_norm": 0.012256702408194542, + "learning_rate": 8.322783991445547e-05, + "loss": 0.08540812134742737, + "step": 39900 + }, + { + "epoch": 0.17134197126984535, + "grad_norm": 2.801025867462158, + "learning_rate": 8.322352819433785e-05, + "loss": 0.30571622848510743, + "step": 39910 + }, + { + "epoch": 0.17138490335986536, + "grad_norm": 1.7197296619415283, + "learning_rate": 8.321921647422023e-05, + "loss": 0.5131281852722168, + "step": 39920 + }, + { + "epoch": 0.17142783544988538, + "grad_norm": 0.019641146063804626, + "learning_rate": 8.32149047541026e-05, + "loss": 0.22599990367889405, + "step": 39930 + }, + { + "epoch": 0.17147076753990537, + "grad_norm": 0.03997062146663666, + "learning_rate": 8.321059303398498e-05, + "loss": 0.15376033782958984, + "step": 39940 + }, + { + "epoch": 0.17151369962992538, + "grad_norm": 0.41821911931037903, + "learning_rate": 8.320628131386736e-05, + "loss": 0.23149895668029785, + "step": 39950 + }, + { + "epoch": 0.1715566317199454, + "grad_norm": 0.30758556723594666, + "learning_rate": 8.320196959374972e-05, + "loss": 0.14392002820968627, + "step": 39960 + }, + { + "epoch": 0.17159956380996538, + "grad_norm": 0.23203805088996887, + "learning_rate": 8.319765787363211e-05, + "loss": 0.5086146831512451, + "step": 39970 + }, + { + "epoch": 0.1716424958999854, + "grad_norm": 0.42892053723335266, + "learning_rate": 8.319334615351449e-05, + "loss": 0.20939264297485352, + "step": 39980 + }, + { + "epoch": 0.1716854279900054, + "grad_norm": 0.9444646239280701, + "learning_rate": 8.318903443339687e-05, + "loss": 0.2334125280380249, + "step": 39990 + }, + { + "epoch": 0.17172836008002543, + "grad_norm": 4.157931804656982, + "learning_rate": 8.318472271327925e-05, + "loss": 0.3207542419433594, + "step": 40000 + }, + { + "epoch": 0.17172836008002543, + "eval_loss": 0.4570688307285309, + "eval_runtime": 27.607, + "eval_samples_per_second": 3.622, + "eval_steps_per_second": 3.622, + "step": 40000 + }, + { + "epoch": 0.17177129217004541, + "grad_norm": 4.318746566772461, + "learning_rate": 8.318041099316162e-05, + "loss": 0.05203125476837158, + "step": 40010 + }, + { + "epoch": 0.17181422426006543, + "grad_norm": 1.4116801023483276, + "learning_rate": 8.3176099273044e-05, + "loss": 0.4147486686706543, + "step": 40020 + }, + { + "epoch": 0.17185715635008544, + "grad_norm": 2.940032958984375, + "learning_rate": 8.317178755292638e-05, + "loss": 0.28691074848175047, + "step": 40030 + }, + { + "epoch": 0.17190008844010543, + "grad_norm": 0.46757587790489197, + "learning_rate": 8.316747583280875e-05, + "loss": 0.25227696895599366, + "step": 40040 + }, + { + "epoch": 0.17194302053012545, + "grad_norm": 0.18634115159511566, + "learning_rate": 8.316316411269112e-05, + "loss": 0.007025787234306335, + "step": 40050 + }, + { + "epoch": 0.17198595262014546, + "grad_norm": 0.1440853774547577, + "learning_rate": 8.31588523925735e-05, + "loss": 0.23718724250793458, + "step": 40060 + }, + { + "epoch": 0.17202888471016545, + "grad_norm": 2.4190330505371094, + "learning_rate": 8.315454067245587e-05, + "loss": 0.39030263423919676, + "step": 40070 + }, + { + "epoch": 0.17207181680018546, + "grad_norm": 1.9734594821929932, + "learning_rate": 8.315022895233825e-05, + "loss": 0.3922515630722046, + "step": 40080 + }, + { + "epoch": 0.17211474889020548, + "grad_norm": 0.6959921717643738, + "learning_rate": 8.314591723222063e-05, + "loss": 0.3082964181900024, + "step": 40090 + }, + { + "epoch": 0.1721576809802255, + "grad_norm": 0.06165720522403717, + "learning_rate": 8.3141605512103e-05, + "loss": 0.17087411880493164, + "step": 40100 + }, + { + "epoch": 0.17220061307024548, + "grad_norm": 5.1847100257873535, + "learning_rate": 8.313729379198538e-05, + "loss": 0.1909353256225586, + "step": 40110 + }, + { + "epoch": 0.1722435451602655, + "grad_norm": 0.07140224426984787, + "learning_rate": 8.313298207186776e-05, + "loss": 0.3157165050506592, + "step": 40120 + }, + { + "epoch": 0.1722864772502855, + "grad_norm": 2.0928823947906494, + "learning_rate": 8.312867035175012e-05, + "loss": 0.22499537467956543, + "step": 40130 + }, + { + "epoch": 0.1723294093403055, + "grad_norm": 0.9970523118972778, + "learning_rate": 8.31243586316325e-05, + "loss": 0.3301548004150391, + "step": 40140 + }, + { + "epoch": 0.1723723414303255, + "grad_norm": 0.13911788165569305, + "learning_rate": 8.312004691151488e-05, + "loss": 0.07332115769386291, + "step": 40150 + }, + { + "epoch": 0.17241527352034552, + "grad_norm": 1.103356122970581, + "learning_rate": 8.311573519139726e-05, + "loss": 0.34607686996459963, + "step": 40160 + }, + { + "epoch": 0.1724582056103655, + "grad_norm": 0.013140438124537468, + "learning_rate": 8.311142347127963e-05, + "loss": 0.3559520483016968, + "step": 40170 + }, + { + "epoch": 0.17250113770038553, + "grad_norm": 3.9121856689453125, + "learning_rate": 8.310711175116201e-05, + "loss": 0.2703645944595337, + "step": 40180 + }, + { + "epoch": 0.17254406979040554, + "grad_norm": 1.812775731086731, + "learning_rate": 8.310280003104439e-05, + "loss": 0.3910071849822998, + "step": 40190 + }, + { + "epoch": 0.17258700188042556, + "grad_norm": 2.7580201625823975, + "learning_rate": 8.309848831092676e-05, + "loss": 0.16809067726135254, + "step": 40200 + }, + { + "epoch": 0.17262993397044554, + "grad_norm": 0.010955448262393475, + "learning_rate": 8.309417659080914e-05, + "loss": 0.2514191150665283, + "step": 40210 + }, + { + "epoch": 0.17267286606046556, + "grad_norm": 3.1789658069610596, + "learning_rate": 8.308986487069152e-05, + "loss": 0.19436899423599244, + "step": 40220 + }, + { + "epoch": 0.17271579815048557, + "grad_norm": 4.4677252769470215, + "learning_rate": 8.30855531505739e-05, + "loss": 0.4041276454925537, + "step": 40230 + }, + { + "epoch": 0.17275873024050556, + "grad_norm": 0.10957568138837814, + "learning_rate": 8.308124143045627e-05, + "loss": 0.22359180450439453, + "step": 40240 + }, + { + "epoch": 0.17280166233052557, + "grad_norm": 0.10355505347251892, + "learning_rate": 8.307692971033865e-05, + "loss": 0.1507003903388977, + "step": 40250 + }, + { + "epoch": 0.1728445944205456, + "grad_norm": 1.1672568321228027, + "learning_rate": 8.307261799022103e-05, + "loss": 0.2168051481246948, + "step": 40260 + }, + { + "epoch": 0.17288752651056558, + "grad_norm": 1.6299830675125122, + "learning_rate": 8.30683062701034e-05, + "loss": 0.29830398559570315, + "step": 40270 + }, + { + "epoch": 0.1729304586005856, + "grad_norm": 0.09983634203672409, + "learning_rate": 8.306399454998578e-05, + "loss": 0.24507391452789307, + "step": 40280 + }, + { + "epoch": 0.1729733906906056, + "grad_norm": 1.581100583076477, + "learning_rate": 8.305968282986815e-05, + "loss": 0.2604495048522949, + "step": 40290 + }, + { + "epoch": 0.17301632278062562, + "grad_norm": 2.6971969604492188, + "learning_rate": 8.305537110975052e-05, + "loss": 0.23511652946472167, + "step": 40300 + }, + { + "epoch": 0.1730592548706456, + "grad_norm": 0.9278387427330017, + "learning_rate": 8.30510593896329e-05, + "loss": 0.2202451229095459, + "step": 40310 + }, + { + "epoch": 0.17310218696066562, + "grad_norm": 1.2856556177139282, + "learning_rate": 8.304674766951528e-05, + "loss": 0.09628130197525024, + "step": 40320 + }, + { + "epoch": 0.17314511905068564, + "grad_norm": 0.8613991141319275, + "learning_rate": 8.304243594939766e-05, + "loss": 0.17015516757965088, + "step": 40330 + }, + { + "epoch": 0.17318805114070562, + "grad_norm": 0.4966510236263275, + "learning_rate": 8.303812422928003e-05, + "loss": 0.2915595293045044, + "step": 40340 + }, + { + "epoch": 0.17323098323072564, + "grad_norm": 3.0629260540008545, + "learning_rate": 8.303381250916241e-05, + "loss": 0.26964516639709474, + "step": 40350 + }, + { + "epoch": 0.17327391532074565, + "grad_norm": 0.004435913171619177, + "learning_rate": 8.302950078904479e-05, + "loss": 0.13189687728881835, + "step": 40360 + }, + { + "epoch": 0.17331684741076564, + "grad_norm": 0.012355843558907509, + "learning_rate": 8.302518906892717e-05, + "loss": 0.12814369201660156, + "step": 40370 + }, + { + "epoch": 0.17335977950078565, + "grad_norm": 0.001818435383029282, + "learning_rate": 8.302087734880953e-05, + "loss": 0.15914521217346192, + "step": 40380 + }, + { + "epoch": 0.17340271159080567, + "grad_norm": 2.8527755737304688, + "learning_rate": 8.301656562869191e-05, + "loss": 0.3045322418212891, + "step": 40390 + }, + { + "epoch": 0.17344564368082566, + "grad_norm": 4.89120626449585, + "learning_rate": 8.301225390857428e-05, + "loss": 0.14866328239440918, + "step": 40400 + }, + { + "epoch": 0.17348857577084567, + "grad_norm": 1.227921962738037, + "learning_rate": 8.300794218845666e-05, + "loss": 0.2188586711883545, + "step": 40410 + }, + { + "epoch": 0.17353150786086569, + "grad_norm": 0.14513230323791504, + "learning_rate": 8.300363046833904e-05, + "loss": 0.31578736305236815, + "step": 40420 + }, + { + "epoch": 0.1735744399508857, + "grad_norm": 0.004886118695139885, + "learning_rate": 8.299931874822142e-05, + "loss": 0.1223494529724121, + "step": 40430 + }, + { + "epoch": 0.1736173720409057, + "grad_norm": 0.0034907562658190727, + "learning_rate": 8.29950070281038e-05, + "loss": 0.35157797336578367, + "step": 40440 + }, + { + "epoch": 0.1736603041309257, + "grad_norm": 0.002745399484410882, + "learning_rate": 8.299069530798617e-05, + "loss": 0.3651834011077881, + "step": 40450 + }, + { + "epoch": 0.17370323622094572, + "grad_norm": 1.5831266641616821, + "learning_rate": 8.298638358786855e-05, + "loss": 0.19076406955718994, + "step": 40460 + }, + { + "epoch": 0.1737461683109657, + "grad_norm": 0.0015953868860378861, + "learning_rate": 8.298207186775093e-05, + "loss": 0.25166780948638917, + "step": 40470 + }, + { + "epoch": 0.17378910040098572, + "grad_norm": 0.03768213838338852, + "learning_rate": 8.29777601476333e-05, + "loss": 0.32109010219573975, + "step": 40480 + }, + { + "epoch": 0.17383203249100573, + "grad_norm": 0.5360517501831055, + "learning_rate": 8.297344842751568e-05, + "loss": 0.23133792877197265, + "step": 40490 + }, + { + "epoch": 0.17387496458102572, + "grad_norm": 1.6855971813201904, + "learning_rate": 8.296913670739806e-05, + "loss": 0.22416160106658936, + "step": 40500 + }, + { + "epoch": 0.17391789667104574, + "grad_norm": 2.100095510482788, + "learning_rate": 8.296482498728044e-05, + "loss": 0.3099149942398071, + "step": 40510 + }, + { + "epoch": 0.17396082876106575, + "grad_norm": 0.001875672023743391, + "learning_rate": 8.296051326716281e-05, + "loss": 0.16080970764160157, + "step": 40520 + }, + { + "epoch": 0.17400376085108576, + "grad_norm": 0.03504222258925438, + "learning_rate": 8.295620154704519e-05, + "loss": 0.16347267627716064, + "step": 40530 + }, + { + "epoch": 0.17404669294110575, + "grad_norm": 0.009600917808711529, + "learning_rate": 8.295188982692755e-05, + "loss": 0.39733908176422117, + "step": 40540 + }, + { + "epoch": 0.17408962503112577, + "grad_norm": 2.063370704650879, + "learning_rate": 8.294757810680993e-05, + "loss": 0.31655449867248536, + "step": 40550 + }, + { + "epoch": 0.17413255712114578, + "grad_norm": 1.423431634902954, + "learning_rate": 8.294326638669231e-05, + "loss": 0.42395715713500975, + "step": 40560 + }, + { + "epoch": 0.17417548921116577, + "grad_norm": 0.05649884417653084, + "learning_rate": 8.293895466657469e-05, + "loss": 0.06664473414421082, + "step": 40570 + }, + { + "epoch": 0.17421842130118578, + "grad_norm": 0.06601168215274811, + "learning_rate": 8.293464294645706e-05, + "loss": 0.2564336538314819, + "step": 40580 + }, + { + "epoch": 0.1742613533912058, + "grad_norm": 1.3635640144348145, + "learning_rate": 8.293033122633944e-05, + "loss": 0.2436471700668335, + "step": 40590 + }, + { + "epoch": 0.17430428548122578, + "grad_norm": 0.6892813444137573, + "learning_rate": 8.292601950622182e-05, + "loss": 0.1424673914909363, + "step": 40600 + }, + { + "epoch": 0.1743472175712458, + "grad_norm": 0.30439263582229614, + "learning_rate": 8.29217077861042e-05, + "loss": 0.2444373369216919, + "step": 40610 + }, + { + "epoch": 0.17439014966126581, + "grad_norm": 0.3621372580528259, + "learning_rate": 8.291739606598656e-05, + "loss": 0.12778749465942382, + "step": 40620 + }, + { + "epoch": 0.17443308175128583, + "grad_norm": 0.1329677700996399, + "learning_rate": 8.291308434586894e-05, + "loss": 0.18559612035751344, + "step": 40630 + }, + { + "epoch": 0.17447601384130582, + "grad_norm": 1.1921926736831665, + "learning_rate": 8.290877262575131e-05, + "loss": 0.2326160192489624, + "step": 40640 + }, + { + "epoch": 0.17451894593132583, + "grad_norm": 0.9672372937202454, + "learning_rate": 8.290446090563369e-05, + "loss": 0.4107320308685303, + "step": 40650 + }, + { + "epoch": 0.17456187802134585, + "grad_norm": 1.4626498222351074, + "learning_rate": 8.290014918551607e-05, + "loss": 0.2904258489608765, + "step": 40660 + }, + { + "epoch": 0.17460481011136583, + "grad_norm": 0.010346812196075916, + "learning_rate": 8.289583746539845e-05, + "loss": 0.10135916471481324, + "step": 40670 + }, + { + "epoch": 0.17464774220138585, + "grad_norm": 1.992686152458191, + "learning_rate": 8.289152574528082e-05, + "loss": 0.2161275863647461, + "step": 40680 + }, + { + "epoch": 0.17469067429140586, + "grad_norm": 0.03004583902657032, + "learning_rate": 8.28872140251632e-05, + "loss": 0.19164289236068727, + "step": 40690 + }, + { + "epoch": 0.17473360638142585, + "grad_norm": 0.106281578540802, + "learning_rate": 8.288290230504558e-05, + "loss": 0.21553289890289307, + "step": 40700 + }, + { + "epoch": 0.17477653847144586, + "grad_norm": 0.8848221898078918, + "learning_rate": 8.287859058492796e-05, + "loss": 0.17538909912109374, + "step": 40710 + }, + { + "epoch": 0.17481947056146588, + "grad_norm": 0.006868957541882992, + "learning_rate": 8.287427886481033e-05, + "loss": 0.26656954288482665, + "step": 40720 + }, + { + "epoch": 0.1748624026514859, + "grad_norm": 15.10798454284668, + "learning_rate": 8.286996714469271e-05, + "loss": 0.4881472587585449, + "step": 40730 + }, + { + "epoch": 0.17490533474150588, + "grad_norm": 0.7054204344749451, + "learning_rate": 8.286565542457509e-05, + "loss": 0.2571603536605835, + "step": 40740 + }, + { + "epoch": 0.1749482668315259, + "grad_norm": 0.016062183305621147, + "learning_rate": 8.286134370445746e-05, + "loss": 0.2579342365264893, + "step": 40750 + }, + { + "epoch": 0.1749911989215459, + "grad_norm": 0.19596177339553833, + "learning_rate": 8.285703198433984e-05, + "loss": 0.29378952980041506, + "step": 40760 + }, + { + "epoch": 0.1750341310115659, + "grad_norm": 2.484384059906006, + "learning_rate": 8.285272026422222e-05, + "loss": 0.24021799564361573, + "step": 40770 + }, + { + "epoch": 0.1750770631015859, + "grad_norm": 0.2761439085006714, + "learning_rate": 8.28484085441046e-05, + "loss": 0.1571637988090515, + "step": 40780 + }, + { + "epoch": 0.17511999519160593, + "grad_norm": 2.089289903640747, + "learning_rate": 8.284409682398696e-05, + "loss": 0.21136255264282228, + "step": 40790 + }, + { + "epoch": 0.1751629272816259, + "grad_norm": 0.3317461311817169, + "learning_rate": 8.283978510386934e-05, + "loss": 0.3460866928100586, + "step": 40800 + }, + { + "epoch": 0.17520585937164593, + "grad_norm": 10.414310455322266, + "learning_rate": 8.283547338375172e-05, + "loss": 0.2301114559173584, + "step": 40810 + }, + { + "epoch": 0.17524879146166594, + "grad_norm": 0.1468975692987442, + "learning_rate": 8.283116166363409e-05, + "loss": 0.32068285942077634, + "step": 40820 + }, + { + "epoch": 0.17529172355168593, + "grad_norm": 0.01950320042669773, + "learning_rate": 8.282684994351647e-05, + "loss": 0.10026562213897705, + "step": 40830 + }, + { + "epoch": 0.17533465564170594, + "grad_norm": 1.38692045211792, + "learning_rate": 8.282253822339885e-05, + "loss": 0.3668211936950684, + "step": 40840 + }, + { + "epoch": 0.17537758773172596, + "grad_norm": 0.07779578119516373, + "learning_rate": 8.281822650328122e-05, + "loss": 0.4057168960571289, + "step": 40850 + }, + { + "epoch": 0.17542051982174597, + "grad_norm": 0.011414109729230404, + "learning_rate": 8.28139147831636e-05, + "loss": 0.08641638159751892, + "step": 40860 + }, + { + "epoch": 0.17546345191176596, + "grad_norm": 0.026972273364663124, + "learning_rate": 8.280960306304597e-05, + "loss": 0.1714036226272583, + "step": 40870 + }, + { + "epoch": 0.17550638400178598, + "grad_norm": 0.11325936764478683, + "learning_rate": 8.280529134292834e-05, + "loss": 0.18394806385040283, + "step": 40880 + }, + { + "epoch": 0.175549316091806, + "grad_norm": 1.1557916402816772, + "learning_rate": 8.280097962281072e-05, + "loss": 0.26514892578125, + "step": 40890 + }, + { + "epoch": 0.17559224818182598, + "grad_norm": 3.6013333797454834, + "learning_rate": 8.27966679026931e-05, + "loss": 0.19369306564331054, + "step": 40900 + }, + { + "epoch": 0.175635180271846, + "grad_norm": 1.8386772871017456, + "learning_rate": 8.279235618257548e-05, + "loss": 0.3296739816665649, + "step": 40910 + }, + { + "epoch": 0.175678112361866, + "grad_norm": 0.2031298577785492, + "learning_rate": 8.278804446245785e-05, + "loss": 0.42958965301513674, + "step": 40920 + }, + { + "epoch": 0.175721044451886, + "grad_norm": 0.014757785946130753, + "learning_rate": 8.278373274234023e-05, + "loss": 0.10978200435638427, + "step": 40930 + }, + { + "epoch": 0.175763976541906, + "grad_norm": 0.0178249292075634, + "learning_rate": 8.277942102222262e-05, + "loss": 0.21592988967895507, + "step": 40940 + }, + { + "epoch": 0.17580690863192602, + "grad_norm": 0.13518759608268738, + "learning_rate": 8.277510930210498e-05, + "loss": 0.08380707502365112, + "step": 40950 + }, + { + "epoch": 0.17584984072194604, + "grad_norm": 4.348465442657471, + "learning_rate": 8.277079758198736e-05, + "loss": 0.20458722114562988, + "step": 40960 + }, + { + "epoch": 0.17589277281196602, + "grad_norm": 0.040076449513435364, + "learning_rate": 8.276648586186974e-05, + "loss": 0.2710048913955688, + "step": 40970 + }, + { + "epoch": 0.17593570490198604, + "grad_norm": 7.410130977630615, + "learning_rate": 8.276217414175212e-05, + "loss": 0.584017276763916, + "step": 40980 + }, + { + "epoch": 0.17597863699200605, + "grad_norm": 0.06982174515724182, + "learning_rate": 8.27578624216345e-05, + "loss": 0.008809458464384079, + "step": 40990 + }, + { + "epoch": 0.17602156908202604, + "grad_norm": 0.008431232534348965, + "learning_rate": 8.275355070151687e-05, + "loss": 0.3506460189819336, + "step": 41000 + }, + { + "epoch": 0.17602156908202604, + "eval_loss": 0.4541700780391693, + "eval_runtime": 27.5582, + "eval_samples_per_second": 3.629, + "eval_steps_per_second": 3.629, + "step": 41000 + }, + { + "epoch": 0.17606450117204606, + "grad_norm": 0.009548353031277657, + "learning_rate": 8.274923898139925e-05, + "loss": 0.2820699453353882, + "step": 41010 + }, + { + "epoch": 0.17610743326206607, + "grad_norm": 0.009989765472710133, + "learning_rate": 8.274492726128163e-05, + "loss": 0.25205843448638915, + "step": 41020 + }, + { + "epoch": 0.17615036535208606, + "grad_norm": 0.23707497119903564, + "learning_rate": 8.274061554116399e-05, + "loss": 0.263662314414978, + "step": 41030 + }, + { + "epoch": 0.17619329744210607, + "grad_norm": 0.0967523604631424, + "learning_rate": 8.273630382104637e-05, + "loss": 0.23872857093811034, + "step": 41040 + }, + { + "epoch": 0.1762362295321261, + "grad_norm": 12.732280731201172, + "learning_rate": 8.273199210092874e-05, + "loss": 0.1568984270095825, + "step": 41050 + }, + { + "epoch": 0.1762791616221461, + "grad_norm": 0.02078324370086193, + "learning_rate": 8.272768038081112e-05, + "loss": 0.18609501123428346, + "step": 41060 + }, + { + "epoch": 0.1763220937121661, + "grad_norm": 0.015798605978488922, + "learning_rate": 8.27233686606935e-05, + "loss": 0.30507678985595704, + "step": 41070 + }, + { + "epoch": 0.1763650258021861, + "grad_norm": 1.2599973678588867, + "learning_rate": 8.271905694057588e-05, + "loss": 0.311283016204834, + "step": 41080 + }, + { + "epoch": 0.17640795789220612, + "grad_norm": 0.40799635648727417, + "learning_rate": 8.271474522045825e-05, + "loss": 0.3520747423171997, + "step": 41090 + }, + { + "epoch": 0.1764508899822261, + "grad_norm": 1.6615732908248901, + "learning_rate": 8.271043350034063e-05, + "loss": 0.2939105987548828, + "step": 41100 + }, + { + "epoch": 0.17649382207224612, + "grad_norm": 1.2815062999725342, + "learning_rate": 8.270612178022301e-05, + "loss": 0.28693690299987795, + "step": 41110 + }, + { + "epoch": 0.17653675416226614, + "grad_norm": 0.01296562422066927, + "learning_rate": 8.270181006010537e-05, + "loss": 0.12417706251144409, + "step": 41120 + }, + { + "epoch": 0.17657968625228612, + "grad_norm": 0.043650198727846146, + "learning_rate": 8.269749833998775e-05, + "loss": 0.48966689109802247, + "step": 41130 + }, + { + "epoch": 0.17662261834230614, + "grad_norm": 1.0135577917099, + "learning_rate": 8.269318661987013e-05, + "loss": 0.11902866363525391, + "step": 41140 + }, + { + "epoch": 0.17666555043232615, + "grad_norm": 0.048548776656389236, + "learning_rate": 8.26888748997525e-05, + "loss": 0.21979255676269532, + "step": 41150 + }, + { + "epoch": 0.17670848252234617, + "grad_norm": 8.79423713684082, + "learning_rate": 8.26845631796349e-05, + "loss": 0.09572447538375854, + "step": 41160 + }, + { + "epoch": 0.17675141461236615, + "grad_norm": 1.3120108842849731, + "learning_rate": 8.268025145951727e-05, + "loss": 0.3123744487762451, + "step": 41170 + }, + { + "epoch": 0.17679434670238617, + "grad_norm": 0.3405528962612152, + "learning_rate": 8.267593973939965e-05, + "loss": 0.14704158306121826, + "step": 41180 + }, + { + "epoch": 0.17683727879240618, + "grad_norm": 0.2074926495552063, + "learning_rate": 8.267162801928203e-05, + "loss": 0.28987486362457277, + "step": 41190 + }, + { + "epoch": 0.17688021088242617, + "grad_norm": 0.663354754447937, + "learning_rate": 8.266731629916439e-05, + "loss": 0.07300347089767456, + "step": 41200 + }, + { + "epoch": 0.17692314297244618, + "grad_norm": 0.36614349484443665, + "learning_rate": 8.266300457904677e-05, + "loss": 0.2700439214706421, + "step": 41210 + }, + { + "epoch": 0.1769660750624662, + "grad_norm": 2.989150047302246, + "learning_rate": 8.265869285892915e-05, + "loss": 0.1963904857635498, + "step": 41220 + }, + { + "epoch": 0.1770090071524862, + "grad_norm": 0.02262544445693493, + "learning_rate": 8.265438113881152e-05, + "loss": 0.1728546142578125, + "step": 41230 + }, + { + "epoch": 0.1770519392425062, + "grad_norm": 3.4345920085906982, + "learning_rate": 8.26500694186939e-05, + "loss": 0.4857645988464355, + "step": 41240 + }, + { + "epoch": 0.17709487133252622, + "grad_norm": 0.12471339851617813, + "learning_rate": 8.264575769857628e-05, + "loss": 0.24559836387634276, + "step": 41250 + }, + { + "epoch": 0.1771378034225462, + "grad_norm": 2.662829875946045, + "learning_rate": 8.264144597845865e-05, + "loss": 0.25598394870758057, + "step": 41260 + }, + { + "epoch": 0.17718073551256622, + "grad_norm": 1.2927974462509155, + "learning_rate": 8.263713425834103e-05, + "loss": 0.31288459300994875, + "step": 41270 + }, + { + "epoch": 0.17722366760258623, + "grad_norm": 1.086735486984253, + "learning_rate": 8.26328225382234e-05, + "loss": 0.4813650608062744, + "step": 41280 + }, + { + "epoch": 0.17726659969260625, + "grad_norm": 3.4481236934661865, + "learning_rate": 8.262851081810577e-05, + "loss": 0.204862380027771, + "step": 41290 + }, + { + "epoch": 0.17730953178262623, + "grad_norm": 0.015600155107676983, + "learning_rate": 8.262419909798815e-05, + "loss": 0.31542177200317384, + "step": 41300 + }, + { + "epoch": 0.17735246387264625, + "grad_norm": 1.3220545053482056, + "learning_rate": 8.261988737787053e-05, + "loss": 0.3501570224761963, + "step": 41310 + }, + { + "epoch": 0.17739539596266626, + "grad_norm": 0.05844166874885559, + "learning_rate": 8.26155756577529e-05, + "loss": 0.10675265789031982, + "step": 41320 + }, + { + "epoch": 0.17743832805268625, + "grad_norm": 0.009374409914016724, + "learning_rate": 8.261126393763528e-05, + "loss": 0.21986827850341797, + "step": 41330 + }, + { + "epoch": 0.17748126014270627, + "grad_norm": 0.1090593934059143, + "learning_rate": 8.260695221751766e-05, + "loss": 0.28274610042572024, + "step": 41340 + }, + { + "epoch": 0.17752419223272628, + "grad_norm": 4.326338768005371, + "learning_rate": 8.260264049740004e-05, + "loss": 0.14196935892105103, + "step": 41350 + }, + { + "epoch": 0.17756712432274627, + "grad_norm": 0.08671343326568604, + "learning_rate": 8.25983287772824e-05, + "loss": 0.43413190841674804, + "step": 41360 + }, + { + "epoch": 0.17761005641276628, + "grad_norm": 1.5184999704360962, + "learning_rate": 8.259401705716478e-05, + "loss": 0.2877436637878418, + "step": 41370 + }, + { + "epoch": 0.1776529885027863, + "grad_norm": 0.7909638285636902, + "learning_rate": 8.258970533704717e-05, + "loss": 0.26583013534545896, + "step": 41380 + }, + { + "epoch": 0.1776959205928063, + "grad_norm": 2.740144729614258, + "learning_rate": 8.258539361692955e-05, + "loss": 0.27145915031433104, + "step": 41390 + }, + { + "epoch": 0.1777388526828263, + "grad_norm": 2.0741074085235596, + "learning_rate": 8.258108189681192e-05, + "loss": 0.28565273284912107, + "step": 41400 + }, + { + "epoch": 0.1777817847728463, + "grad_norm": 0.8548224568367004, + "learning_rate": 8.25767701766943e-05, + "loss": 0.15239741802215576, + "step": 41410 + }, + { + "epoch": 0.17782471686286633, + "grad_norm": 0.21049945056438446, + "learning_rate": 8.257245845657668e-05, + "loss": 0.1537050724029541, + "step": 41420 + }, + { + "epoch": 0.17786764895288631, + "grad_norm": 0.023179372772574425, + "learning_rate": 8.256814673645906e-05, + "loss": 0.19055650234222413, + "step": 41430 + }, + { + "epoch": 0.17791058104290633, + "grad_norm": 1.333984613418579, + "learning_rate": 8.256383501634143e-05, + "loss": 0.45396127700805666, + "step": 41440 + }, + { + "epoch": 0.17795351313292634, + "grad_norm": 0.789638102054596, + "learning_rate": 8.25595232962238e-05, + "loss": 0.19562928676605223, + "step": 41450 + }, + { + "epoch": 0.17799644522294633, + "grad_norm": 0.018816199153661728, + "learning_rate": 8.255521157610617e-05, + "loss": 0.20530054569244385, + "step": 41460 + }, + { + "epoch": 0.17803937731296635, + "grad_norm": 0.03177690878510475, + "learning_rate": 8.255089985598855e-05, + "loss": 0.2804241180419922, + "step": 41470 + }, + { + "epoch": 0.17808230940298636, + "grad_norm": 0.057153262197971344, + "learning_rate": 8.254658813587093e-05, + "loss": 0.0474990576505661, + "step": 41480 + }, + { + "epoch": 0.17812524149300638, + "grad_norm": 0.4023728370666504, + "learning_rate": 8.25422764157533e-05, + "loss": 0.011729182302951812, + "step": 41490 + }, + { + "epoch": 0.17816817358302636, + "grad_norm": 0.3172079622745514, + "learning_rate": 8.253796469563568e-05, + "loss": 0.29478228092193604, + "step": 41500 + }, + { + "epoch": 0.17821110567304638, + "grad_norm": 0.0051256525330245495, + "learning_rate": 8.253365297551806e-05, + "loss": 0.1952364444732666, + "step": 41510 + }, + { + "epoch": 0.1782540377630664, + "grad_norm": 0.07210695743560791, + "learning_rate": 8.252934125540044e-05, + "loss": 0.17813553810119628, + "step": 41520 + }, + { + "epoch": 0.17829696985308638, + "grad_norm": 1.6913539171218872, + "learning_rate": 8.25250295352828e-05, + "loss": 0.29789347648620607, + "step": 41530 + }, + { + "epoch": 0.1783399019431064, + "grad_norm": 0.7843422293663025, + "learning_rate": 8.252071781516518e-05, + "loss": 0.07679291367530823, + "step": 41540 + }, + { + "epoch": 0.1783828340331264, + "grad_norm": 0.1356504112482071, + "learning_rate": 8.251640609504756e-05, + "loss": 0.14666395187377929, + "step": 41550 + }, + { + "epoch": 0.1784257661231464, + "grad_norm": 4.225996971130371, + "learning_rate": 8.251209437492993e-05, + "loss": 0.26961095333099366, + "step": 41560 + }, + { + "epoch": 0.1784686982131664, + "grad_norm": 5.507353782653809, + "learning_rate": 8.250778265481231e-05, + "loss": 0.42926778793334963, + "step": 41570 + }, + { + "epoch": 0.17851163030318642, + "grad_norm": 0.029965883120894432, + "learning_rate": 8.250347093469469e-05, + "loss": 0.15851476192474365, + "step": 41580 + }, + { + "epoch": 0.17855456239320644, + "grad_norm": 4.527895450592041, + "learning_rate": 8.249915921457707e-05, + "loss": 0.3978370428085327, + "step": 41590 + }, + { + "epoch": 0.17859749448322643, + "grad_norm": 0.023621153086423874, + "learning_rate": 8.249484749445944e-05, + "loss": 0.04133687913417816, + "step": 41600 + }, + { + "epoch": 0.17864042657324644, + "grad_norm": 0.011967599391937256, + "learning_rate": 8.249053577434182e-05, + "loss": 0.20162932872772216, + "step": 41610 + }, + { + "epoch": 0.17868335866326646, + "grad_norm": 0.05433309078216553, + "learning_rate": 8.24862240542242e-05, + "loss": 0.21610791683197023, + "step": 41620 + }, + { + "epoch": 0.17872629075328644, + "grad_norm": 0.5024532675743103, + "learning_rate": 8.248191233410658e-05, + "loss": 0.25946621894836425, + "step": 41630 + }, + { + "epoch": 0.17876922284330646, + "grad_norm": 0.04332248494029045, + "learning_rate": 8.247760061398895e-05, + "loss": 0.2946115255355835, + "step": 41640 + }, + { + "epoch": 0.17881215493332647, + "grad_norm": 0.04508750140666962, + "learning_rate": 8.247328889387133e-05, + "loss": 0.27277469635009766, + "step": 41650 + }, + { + "epoch": 0.17885508702334646, + "grad_norm": 0.09932596236467361, + "learning_rate": 8.246897717375371e-05, + "loss": 0.13800952434539795, + "step": 41660 + }, + { + "epoch": 0.17889801911336647, + "grad_norm": 2.0395140647888184, + "learning_rate": 8.246466545363609e-05, + "loss": 0.4774670124053955, + "step": 41670 + }, + { + "epoch": 0.1789409512033865, + "grad_norm": 0.00492170499637723, + "learning_rate": 8.246035373351846e-05, + "loss": 0.06394680142402649, + "step": 41680 + }, + { + "epoch": 0.17898388329340648, + "grad_norm": 5.37308931350708, + "learning_rate": 8.245604201340083e-05, + "loss": 0.22611804008483888, + "step": 41690 + }, + { + "epoch": 0.1790268153834265, + "grad_norm": 0.35247814655303955, + "learning_rate": 8.24517302932832e-05, + "loss": 0.133196222782135, + "step": 41700 + }, + { + "epoch": 0.1790697474734465, + "grad_norm": 0.07962910085916519, + "learning_rate": 8.244741857316558e-05, + "loss": 0.02865527868270874, + "step": 41710 + }, + { + "epoch": 0.17911267956346652, + "grad_norm": 0.22916433215141296, + "learning_rate": 8.244310685304796e-05, + "loss": 0.3877936124801636, + "step": 41720 + }, + { + "epoch": 0.1791556116534865, + "grad_norm": 0.14095786213874817, + "learning_rate": 8.243879513293034e-05, + "loss": 0.15569915771484374, + "step": 41730 + }, + { + "epoch": 0.17919854374350652, + "grad_norm": 2.432192325592041, + "learning_rate": 8.243448341281271e-05, + "loss": 0.13069474697113037, + "step": 41740 + }, + { + "epoch": 0.17924147583352654, + "grad_norm": 0.006407095119357109, + "learning_rate": 8.243017169269509e-05, + "loss": 0.36841678619384766, + "step": 41750 + }, + { + "epoch": 0.17928440792354652, + "grad_norm": 2.003530263900757, + "learning_rate": 8.242585997257747e-05, + "loss": 0.3995458364486694, + "step": 41760 + }, + { + "epoch": 0.17932734001356654, + "grad_norm": 2.8348910808563232, + "learning_rate": 8.242154825245983e-05, + "loss": 0.33652119636535643, + "step": 41770 + }, + { + "epoch": 0.17937027210358655, + "grad_norm": 2.0822720527648926, + "learning_rate": 8.241723653234221e-05, + "loss": 0.3451982498168945, + "step": 41780 + }, + { + "epoch": 0.17941320419360654, + "grad_norm": 1.157434105873108, + "learning_rate": 8.241292481222459e-05, + "loss": 0.2674999475479126, + "step": 41790 + }, + { + "epoch": 0.17945613628362656, + "grad_norm": 0.003249003551900387, + "learning_rate": 8.240861309210696e-05, + "loss": 0.4022883415222168, + "step": 41800 + }, + { + "epoch": 0.17949906837364657, + "grad_norm": 0.005270634777843952, + "learning_rate": 8.240430137198934e-05, + "loss": 0.17502715587615966, + "step": 41810 + }, + { + "epoch": 0.17954200046366658, + "grad_norm": 0.00106601242441684, + "learning_rate": 8.239998965187172e-05, + "loss": 0.19975433349609376, + "step": 41820 + }, + { + "epoch": 0.17958493255368657, + "grad_norm": 0.0015999526949599385, + "learning_rate": 8.23956779317541e-05, + "loss": 0.07730457186698914, + "step": 41830 + }, + { + "epoch": 0.1796278646437066, + "grad_norm": 1.591696858406067, + "learning_rate": 8.239136621163647e-05, + "loss": 0.2634559631347656, + "step": 41840 + }, + { + "epoch": 0.1796707967337266, + "grad_norm": 0.1443047970533371, + "learning_rate": 8.238705449151885e-05, + "loss": 0.09159661531448364, + "step": 41850 + }, + { + "epoch": 0.1797137288237466, + "grad_norm": 2.106316089630127, + "learning_rate": 8.238274277140123e-05, + "loss": 0.34165334701538086, + "step": 41860 + }, + { + "epoch": 0.1797566609137666, + "grad_norm": 0.41624924540519714, + "learning_rate": 8.23784310512836e-05, + "loss": 0.16944279670715331, + "step": 41870 + }, + { + "epoch": 0.17979959300378662, + "grad_norm": 0.09014492481946945, + "learning_rate": 8.237411933116598e-05, + "loss": 0.32366485595703126, + "step": 41880 + }, + { + "epoch": 0.1798425250938066, + "grad_norm": 3.917996883392334, + "learning_rate": 8.236980761104836e-05, + "loss": 0.3025381565093994, + "step": 41890 + }, + { + "epoch": 0.17988545718382662, + "grad_norm": 0.019549962133169174, + "learning_rate": 8.236549589093074e-05, + "loss": 0.13743315935134887, + "step": 41900 + }, + { + "epoch": 0.17992838927384663, + "grad_norm": 0.008352905511856079, + "learning_rate": 8.236118417081311e-05, + "loss": 0.16161361932754517, + "step": 41910 + }, + { + "epoch": 0.17997132136386665, + "grad_norm": 0.2613341808319092, + "learning_rate": 8.235687245069549e-05, + "loss": 0.1621633529663086, + "step": 41920 + }, + { + "epoch": 0.18001425345388664, + "grad_norm": 0.011575527489185333, + "learning_rate": 8.235256073057787e-05, + "loss": 0.3463058233261108, + "step": 41930 + }, + { + "epoch": 0.18005718554390665, + "grad_norm": 0.03169770911335945, + "learning_rate": 8.234824901046023e-05, + "loss": 0.22944414615631104, + "step": 41940 + }, + { + "epoch": 0.18010011763392667, + "grad_norm": 2.3105247020721436, + "learning_rate": 8.234393729034261e-05, + "loss": 0.33264400959014895, + "step": 41950 + }, + { + "epoch": 0.18014304972394665, + "grad_norm": 2.5481674671173096, + "learning_rate": 8.233962557022499e-05, + "loss": 0.1760340452194214, + "step": 41960 + }, + { + "epoch": 0.18018598181396667, + "grad_norm": 0.025155572220683098, + "learning_rate": 8.233531385010736e-05, + "loss": 0.17142497301101683, + "step": 41970 + }, + { + "epoch": 0.18022891390398668, + "grad_norm": 0.008635635487735271, + "learning_rate": 8.233100212998974e-05, + "loss": 0.06564597487449646, + "step": 41980 + }, + { + "epoch": 0.18027184599400667, + "grad_norm": 5.458888530731201, + "learning_rate": 8.232669040987212e-05, + "loss": 0.2626389741897583, + "step": 41990 + }, + { + "epoch": 0.18031477808402668, + "grad_norm": 0.04839470610022545, + "learning_rate": 8.23223786897545e-05, + "loss": 0.10072095394134521, + "step": 42000 + }, + { + "epoch": 0.18031477808402668, + "eval_loss": 0.4729278087615967, + "eval_runtime": 27.4307, + "eval_samples_per_second": 3.646, + "eval_steps_per_second": 3.646, + "step": 42000 + }, + { + "epoch": 0.1803577101740467, + "grad_norm": 1.669901967048645, + "learning_rate": 8.231806696963687e-05, + "loss": 0.20919899940490722, + "step": 42010 + }, + { + "epoch": 0.1804006422640667, + "grad_norm": 0.21869534254074097, + "learning_rate": 8.231375524951924e-05, + "loss": 0.42443008422851564, + "step": 42020 + }, + { + "epoch": 0.1804435743540867, + "grad_norm": 0.06950627267360687, + "learning_rate": 8.230944352940162e-05, + "loss": 0.2566273927688599, + "step": 42030 + }, + { + "epoch": 0.18048650644410671, + "grad_norm": 0.002647142857313156, + "learning_rate": 8.230513180928399e-05, + "loss": 0.3426970958709717, + "step": 42040 + }, + { + "epoch": 0.18052943853412673, + "grad_norm": 0.2864302396774292, + "learning_rate": 8.230082008916637e-05, + "loss": 0.18864575624465943, + "step": 42050 + }, + { + "epoch": 0.18057237062414672, + "grad_norm": 0.0666997879743576, + "learning_rate": 8.229650836904875e-05, + "loss": 0.49730625152587893, + "step": 42060 + }, + { + "epoch": 0.18061530271416673, + "grad_norm": 0.04434436559677124, + "learning_rate": 8.229219664893112e-05, + "loss": 0.22104194164276122, + "step": 42070 + }, + { + "epoch": 0.18065823480418675, + "grad_norm": 0.15641604363918304, + "learning_rate": 8.22878849288135e-05, + "loss": 0.22287838459014891, + "step": 42080 + }, + { + "epoch": 0.18070116689420673, + "grad_norm": 0.672537088394165, + "learning_rate": 8.228357320869588e-05, + "loss": 0.3182207584381104, + "step": 42090 + }, + { + "epoch": 0.18074409898422675, + "grad_norm": 0.039332207292318344, + "learning_rate": 8.227926148857826e-05, + "loss": 0.27091445922851565, + "step": 42100 + }, + { + "epoch": 0.18078703107424676, + "grad_norm": 0.060980744659900665, + "learning_rate": 8.227494976846063e-05, + "loss": 0.11896921396255493, + "step": 42110 + }, + { + "epoch": 0.18082996316426675, + "grad_norm": 0.03762351721525192, + "learning_rate": 8.227063804834301e-05, + "loss": 0.15179141759872436, + "step": 42120 + }, + { + "epoch": 0.18087289525428676, + "grad_norm": 0.38810792565345764, + "learning_rate": 8.226632632822539e-05, + "loss": 0.19824562072753907, + "step": 42130 + }, + { + "epoch": 0.18091582734430678, + "grad_norm": 0.01573784649372101, + "learning_rate": 8.226201460810777e-05, + "loss": 0.30882740020751953, + "step": 42140 + }, + { + "epoch": 0.1809587594343268, + "grad_norm": 0.029129816219210625, + "learning_rate": 8.225770288799014e-05, + "loss": 0.2297649383544922, + "step": 42150 + }, + { + "epoch": 0.18100169152434678, + "grad_norm": 6.418994426727295, + "learning_rate": 8.225339116787252e-05, + "loss": 0.3056930065155029, + "step": 42160 + }, + { + "epoch": 0.1810446236143668, + "grad_norm": 0.07934199273586273, + "learning_rate": 8.22490794477549e-05, + "loss": 0.30078830718994143, + "step": 42170 + }, + { + "epoch": 0.1810875557043868, + "grad_norm": 1.3679813146591187, + "learning_rate": 8.224476772763728e-05, + "loss": 0.2010509490966797, + "step": 42180 + }, + { + "epoch": 0.1811304877944068, + "grad_norm": 2.9077062606811523, + "learning_rate": 8.224045600751964e-05, + "loss": 0.11555584669113159, + "step": 42190 + }, + { + "epoch": 0.1811734198844268, + "grad_norm": 0.609990119934082, + "learning_rate": 8.223614428740202e-05, + "loss": 0.14299447536468507, + "step": 42200 + }, + { + "epoch": 0.18121635197444683, + "grad_norm": 0.04955404996871948, + "learning_rate": 8.22318325672844e-05, + "loss": 0.15406620502471924, + "step": 42210 + }, + { + "epoch": 0.1812592840644668, + "grad_norm": 2.1297833919525146, + "learning_rate": 8.222752084716677e-05, + "loss": 0.31810736656188965, + "step": 42220 + }, + { + "epoch": 0.18130221615448683, + "grad_norm": 0.11107069253921509, + "learning_rate": 8.222320912704915e-05, + "loss": 0.2613961696624756, + "step": 42230 + }, + { + "epoch": 0.18134514824450684, + "grad_norm": 0.04765243083238602, + "learning_rate": 8.221889740693153e-05, + "loss": 0.12535251379013063, + "step": 42240 + }, + { + "epoch": 0.18138808033452686, + "grad_norm": 22.591459274291992, + "learning_rate": 8.22145856868139e-05, + "loss": 0.2527450084686279, + "step": 42250 + }, + { + "epoch": 0.18143101242454684, + "grad_norm": 0.5352850556373596, + "learning_rate": 8.221027396669628e-05, + "loss": 0.43744745254516604, + "step": 42260 + }, + { + "epoch": 0.18147394451456686, + "grad_norm": 0.6301557421684265, + "learning_rate": 8.220596224657864e-05, + "loss": 0.1265001654624939, + "step": 42270 + }, + { + "epoch": 0.18151687660458687, + "grad_norm": 0.7245357036590576, + "learning_rate": 8.220165052646102e-05, + "loss": 0.137465763092041, + "step": 42280 + }, + { + "epoch": 0.18155980869460686, + "grad_norm": 0.04410898685455322, + "learning_rate": 8.21973388063434e-05, + "loss": 0.353134298324585, + "step": 42290 + }, + { + "epoch": 0.18160274078462688, + "grad_norm": 0.023124389350414276, + "learning_rate": 8.219302708622578e-05, + "loss": 0.08382171988487244, + "step": 42300 + }, + { + "epoch": 0.1816456728746469, + "grad_norm": 0.9598243236541748, + "learning_rate": 8.218871536610815e-05, + "loss": 0.09931755065917969, + "step": 42310 + }, + { + "epoch": 0.18168860496466688, + "grad_norm": 0.006847964599728584, + "learning_rate": 8.218440364599053e-05, + "loss": 0.18975690603256226, + "step": 42320 + }, + { + "epoch": 0.1817315370546869, + "grad_norm": 1.4635645151138306, + "learning_rate": 8.218009192587291e-05, + "loss": 0.18958462476730348, + "step": 42330 + }, + { + "epoch": 0.1817744691447069, + "grad_norm": 6.285202503204346, + "learning_rate": 8.21757802057553e-05, + "loss": 0.4254147052764893, + "step": 42340 + }, + { + "epoch": 0.18181740123472692, + "grad_norm": 1.0863999128341675, + "learning_rate": 8.217146848563766e-05, + "loss": 0.4314442157745361, + "step": 42350 + }, + { + "epoch": 0.1818603333247469, + "grad_norm": 0.8314317464828491, + "learning_rate": 8.216715676552004e-05, + "loss": 0.3709148168563843, + "step": 42360 + }, + { + "epoch": 0.18190326541476692, + "grad_norm": 0.7620775699615479, + "learning_rate": 8.216284504540242e-05, + "loss": 0.19915242195129396, + "step": 42370 + }, + { + "epoch": 0.18194619750478694, + "grad_norm": 1.8169580698013306, + "learning_rate": 8.21585333252848e-05, + "loss": 0.39284143447875974, + "step": 42380 + }, + { + "epoch": 0.18198912959480693, + "grad_norm": 0.15456518530845642, + "learning_rate": 8.215422160516717e-05, + "loss": 0.2430192708969116, + "step": 42390 + }, + { + "epoch": 0.18203206168482694, + "grad_norm": 0.012154005467891693, + "learning_rate": 8.214990988504955e-05, + "loss": 0.22760634422302245, + "step": 42400 + }, + { + "epoch": 0.18207499377484695, + "grad_norm": 0.19953225553035736, + "learning_rate": 8.214559816493193e-05, + "loss": 0.17155539989471436, + "step": 42410 + }, + { + "epoch": 0.18211792586486694, + "grad_norm": 0.028711501508951187, + "learning_rate": 8.21412864448143e-05, + "loss": 0.12059812545776367, + "step": 42420 + }, + { + "epoch": 0.18216085795488696, + "grad_norm": 5.916935443878174, + "learning_rate": 8.213697472469667e-05, + "loss": 0.11507797241210938, + "step": 42430 + }, + { + "epoch": 0.18220379004490697, + "grad_norm": 0.39378035068511963, + "learning_rate": 8.213266300457905e-05, + "loss": 0.6166230201721191, + "step": 42440 + }, + { + "epoch": 0.182246722134927, + "grad_norm": 1.32924222946167, + "learning_rate": 8.212835128446142e-05, + "loss": 0.2679696798324585, + "step": 42450 + }, + { + "epoch": 0.18228965422494697, + "grad_norm": 0.06272288411855698, + "learning_rate": 8.21240395643438e-05, + "loss": 0.29037282466888426, + "step": 42460 + }, + { + "epoch": 0.182332586314967, + "grad_norm": 0.163888618350029, + "learning_rate": 8.211972784422618e-05, + "loss": 0.0659745216369629, + "step": 42470 + }, + { + "epoch": 0.182375518404987, + "grad_norm": 0.491892546415329, + "learning_rate": 8.211541612410856e-05, + "loss": 0.2176523208618164, + "step": 42480 + }, + { + "epoch": 0.182418450495007, + "grad_norm": 0.5934542417526245, + "learning_rate": 8.211110440399093e-05, + "loss": 0.24393646717071532, + "step": 42490 + }, + { + "epoch": 0.182461382585027, + "grad_norm": 2.1584486961364746, + "learning_rate": 8.210679268387331e-05, + "loss": 0.3037613868713379, + "step": 42500 + }, + { + "epoch": 0.18250431467504702, + "grad_norm": 0.009108933620154858, + "learning_rate": 8.210248096375567e-05, + "loss": 0.15713260173797608, + "step": 42510 + }, + { + "epoch": 0.182547246765067, + "grad_norm": 0.4322991669178009, + "learning_rate": 8.209816924363805e-05, + "loss": 0.17076945304870605, + "step": 42520 + }, + { + "epoch": 0.18259017885508702, + "grad_norm": 0.08237133920192719, + "learning_rate": 8.209385752352043e-05, + "loss": 0.21980366706848145, + "step": 42530 + }, + { + "epoch": 0.18263311094510704, + "grad_norm": 8.105067253112793, + "learning_rate": 8.20895458034028e-05, + "loss": 0.2191450834274292, + "step": 42540 + }, + { + "epoch": 0.18267604303512702, + "grad_norm": 0.0717390924692154, + "learning_rate": 8.208523408328518e-05, + "loss": 0.39773619174957275, + "step": 42550 + }, + { + "epoch": 0.18271897512514704, + "grad_norm": 4.677486896514893, + "learning_rate": 8.208092236316757e-05, + "loss": 0.15335140228271485, + "step": 42560 + }, + { + "epoch": 0.18276190721516705, + "grad_norm": 4.2845377922058105, + "learning_rate": 8.207661064304995e-05, + "loss": 0.37812421321868894, + "step": 42570 + }, + { + "epoch": 0.18280483930518707, + "grad_norm": 10.023009300231934, + "learning_rate": 8.207229892293233e-05, + "loss": 0.22293882369995116, + "step": 42580 + }, + { + "epoch": 0.18284777139520705, + "grad_norm": 0.15922299027442932, + "learning_rate": 8.20679872028147e-05, + "loss": 0.21837432384490968, + "step": 42590 + }, + { + "epoch": 0.18289070348522707, + "grad_norm": 2.1236989498138428, + "learning_rate": 8.206367548269707e-05, + "loss": 0.24211366176605226, + "step": 42600 + }, + { + "epoch": 0.18293363557524708, + "grad_norm": 0.8039203882217407, + "learning_rate": 8.205936376257945e-05, + "loss": 0.28853487968444824, + "step": 42610 + }, + { + "epoch": 0.18297656766526707, + "grad_norm": 0.020852619782090187, + "learning_rate": 8.205505204246182e-05, + "loss": 0.23580570220947267, + "step": 42620 + }, + { + "epoch": 0.18301949975528709, + "grad_norm": 0.0024098677095025778, + "learning_rate": 8.20507403223442e-05, + "loss": 0.03820265829563141, + "step": 42630 + }, + { + "epoch": 0.1830624318453071, + "grad_norm": 0.018851445987820625, + "learning_rate": 8.204642860222658e-05, + "loss": 0.15569354295730592, + "step": 42640 + }, + { + "epoch": 0.1831053639353271, + "grad_norm": 1.3270905017852783, + "learning_rate": 8.204211688210896e-05, + "loss": 0.3316626071929932, + "step": 42650 + }, + { + "epoch": 0.1831482960253471, + "grad_norm": 0.7571421265602112, + "learning_rate": 8.203780516199133e-05, + "loss": 0.1260097622871399, + "step": 42660 + }, + { + "epoch": 0.18319122811536712, + "grad_norm": 0.11175458878278732, + "learning_rate": 8.203349344187371e-05, + "loss": 0.2821631908416748, + "step": 42670 + }, + { + "epoch": 0.18323416020538713, + "grad_norm": 2.5466408729553223, + "learning_rate": 8.202918172175607e-05, + "loss": 0.1871044158935547, + "step": 42680 + }, + { + "epoch": 0.18327709229540712, + "grad_norm": 1.2957346439361572, + "learning_rate": 8.202487000163845e-05, + "loss": 0.30394940376281737, + "step": 42690 + }, + { + "epoch": 0.18332002438542713, + "grad_norm": 0.023146262392401695, + "learning_rate": 8.202055828152083e-05, + "loss": 0.1928611397743225, + "step": 42700 + }, + { + "epoch": 0.18336295647544715, + "grad_norm": 0.003472856944426894, + "learning_rate": 8.201624656140321e-05, + "loss": 0.2598503351211548, + "step": 42710 + }, + { + "epoch": 0.18340588856546713, + "grad_norm": 0.004443190060555935, + "learning_rate": 8.201193484128558e-05, + "loss": 0.20475189685821532, + "step": 42720 + }, + { + "epoch": 0.18344882065548715, + "grad_norm": 0.41619807481765747, + "learning_rate": 8.200762312116796e-05, + "loss": 0.2871143102645874, + "step": 42730 + }, + { + "epoch": 0.18349175274550716, + "grad_norm": 9.765052795410156, + "learning_rate": 8.200331140105034e-05, + "loss": 0.4204115867614746, + "step": 42740 + }, + { + "epoch": 0.18353468483552715, + "grad_norm": 0.0008733683498576283, + "learning_rate": 8.199899968093272e-05, + "loss": 0.2748641729354858, + "step": 42750 + }, + { + "epoch": 0.18357761692554717, + "grad_norm": 0.3988295793533325, + "learning_rate": 8.199468796081508e-05, + "loss": 0.3808812141418457, + "step": 42760 + }, + { + "epoch": 0.18362054901556718, + "grad_norm": 0.009889461100101471, + "learning_rate": 8.199037624069746e-05, + "loss": 0.16420258283615113, + "step": 42770 + }, + { + "epoch": 0.1836634811055872, + "grad_norm": 0.004972816910594702, + "learning_rate": 8.198606452057985e-05, + "loss": 0.19041136503219605, + "step": 42780 + }, + { + "epoch": 0.18370641319560718, + "grad_norm": 0.03542853519320488, + "learning_rate": 8.198175280046223e-05, + "loss": 0.10970422029495239, + "step": 42790 + }, + { + "epoch": 0.1837493452856272, + "grad_norm": 2.3165123462677, + "learning_rate": 8.19774410803446e-05, + "loss": 0.18427956104278564, + "step": 42800 + }, + { + "epoch": 0.1837922773756472, + "grad_norm": 0.012739105150103569, + "learning_rate": 8.197312936022698e-05, + "loss": 0.1309381127357483, + "step": 42810 + }, + { + "epoch": 0.1838352094656672, + "grad_norm": 0.005955998320132494, + "learning_rate": 8.196881764010936e-05, + "loss": 0.20765841007232666, + "step": 42820 + }, + { + "epoch": 0.1838781415556872, + "grad_norm": 0.0022688531316816807, + "learning_rate": 8.196450591999174e-05, + "loss": 0.3057706356048584, + "step": 42830 + }, + { + "epoch": 0.18392107364570723, + "grad_norm": 0.004292634781450033, + "learning_rate": 8.19601941998741e-05, + "loss": 0.2799015283584595, + "step": 42840 + }, + { + "epoch": 0.18396400573572722, + "grad_norm": 2.1287357807159424, + "learning_rate": 8.195588247975648e-05, + "loss": 0.17402925491333007, + "step": 42850 + }, + { + "epoch": 0.18400693782574723, + "grad_norm": 0.19822724163532257, + "learning_rate": 8.195157075963885e-05, + "loss": 0.3075894594192505, + "step": 42860 + }, + { + "epoch": 0.18404986991576724, + "grad_norm": 0.3585273325443268, + "learning_rate": 8.194725903952123e-05, + "loss": 0.07946454286575318, + "step": 42870 + }, + { + "epoch": 0.18409280200578726, + "grad_norm": 2.302215814590454, + "learning_rate": 8.194294731940361e-05, + "loss": 0.36985416412353517, + "step": 42880 + }, + { + "epoch": 0.18413573409580725, + "grad_norm": 0.5702426433563232, + "learning_rate": 8.193863559928599e-05, + "loss": 0.371164870262146, + "step": 42890 + }, + { + "epoch": 0.18417866618582726, + "grad_norm": 3.3670296669006348, + "learning_rate": 8.193432387916836e-05, + "loss": 0.2882568359375, + "step": 42900 + }, + { + "epoch": 0.18422159827584728, + "grad_norm": 0.10460800677537918, + "learning_rate": 8.193001215905074e-05, + "loss": 0.15762473344802858, + "step": 42910 + }, + { + "epoch": 0.18426453036586726, + "grad_norm": 0.09293296188116074, + "learning_rate": 8.192570043893312e-05, + "loss": 0.28543522357940676, + "step": 42920 + }, + { + "epoch": 0.18430746245588728, + "grad_norm": 1.2589787244796753, + "learning_rate": 8.192138871881548e-05, + "loss": 0.11235880851745605, + "step": 42930 + }, + { + "epoch": 0.1843503945459073, + "grad_norm": 0.021063808351755142, + "learning_rate": 8.191707699869786e-05, + "loss": 0.3688133478164673, + "step": 42940 + }, + { + "epoch": 0.18439332663592728, + "grad_norm": 4.861030101776123, + "learning_rate": 8.191276527858024e-05, + "loss": 0.16458606719970703, + "step": 42950 + }, + { + "epoch": 0.1844362587259473, + "grad_norm": 0.6773809790611267, + "learning_rate": 8.190845355846261e-05, + "loss": 0.4682401180267334, + "step": 42960 + }, + { + "epoch": 0.1844791908159673, + "grad_norm": 0.4810944199562073, + "learning_rate": 8.190414183834499e-05, + "loss": 0.28884458541870117, + "step": 42970 + }, + { + "epoch": 0.1845221229059873, + "grad_norm": 2.6145782470703125, + "learning_rate": 8.189983011822737e-05, + "loss": 0.05625054836273193, + "step": 42980 + }, + { + "epoch": 0.1845650549960073, + "grad_norm": 2.071237564086914, + "learning_rate": 8.189551839810975e-05, + "loss": 0.20640103816986083, + "step": 42990 + }, + { + "epoch": 0.18460798708602733, + "grad_norm": 0.013447938486933708, + "learning_rate": 8.189120667799212e-05, + "loss": 0.2571903944015503, + "step": 43000 + }, + { + "epoch": 0.18460798708602733, + "eval_loss": 0.4624760150909424, + "eval_runtime": 27.5883, + "eval_samples_per_second": 3.625, + "eval_steps_per_second": 3.625, + "step": 43000 + }, + { + "epoch": 0.18465091917604734, + "grad_norm": 0.09999194741249084, + "learning_rate": 8.18868949578745e-05, + "loss": 0.0944497048854828, + "step": 43010 + }, + { + "epoch": 0.18469385126606733, + "grad_norm": 2.868917942047119, + "learning_rate": 8.188258323775688e-05, + "loss": 0.26001758575439454, + "step": 43020 + }, + { + "epoch": 0.18473678335608734, + "grad_norm": 3.3130545616149902, + "learning_rate": 8.187827151763925e-05, + "loss": 0.3481125354766846, + "step": 43030 + }, + { + "epoch": 0.18477971544610736, + "grad_norm": 0.002941107377409935, + "learning_rate": 8.187395979752163e-05, + "loss": 0.19944119453430176, + "step": 43040 + }, + { + "epoch": 0.18482264753612734, + "grad_norm": 14.443581581115723, + "learning_rate": 8.186964807740401e-05, + "loss": 0.2885287761688232, + "step": 43050 + }, + { + "epoch": 0.18486557962614736, + "grad_norm": 3.258791446685791, + "learning_rate": 8.186533635728639e-05, + "loss": 0.17024463415145874, + "step": 43060 + }, + { + "epoch": 0.18490851171616737, + "grad_norm": 0.04481988027691841, + "learning_rate": 8.186102463716876e-05, + "loss": 0.1365174889564514, + "step": 43070 + }, + { + "epoch": 0.18495144380618736, + "grad_norm": 0.05688230320811272, + "learning_rate": 8.185671291705114e-05, + "loss": 0.427608060836792, + "step": 43080 + }, + { + "epoch": 0.18499437589620737, + "grad_norm": 3.2461934089660645, + "learning_rate": 8.18524011969335e-05, + "loss": 0.19203683137893676, + "step": 43090 + }, + { + "epoch": 0.1850373079862274, + "grad_norm": 0.08931227028369904, + "learning_rate": 8.184808947681588e-05, + "loss": 0.31490564346313477, + "step": 43100 + }, + { + "epoch": 0.1850802400762474, + "grad_norm": 1.140378713607788, + "learning_rate": 8.184377775669826e-05, + "loss": 0.1009800910949707, + "step": 43110 + }, + { + "epoch": 0.1851231721662674, + "grad_norm": 0.06171561777591705, + "learning_rate": 8.183946603658064e-05, + "loss": 0.07267990708351135, + "step": 43120 + }, + { + "epoch": 0.1851661042562874, + "grad_norm": 0.047086216509342194, + "learning_rate": 8.183515431646301e-05, + "loss": 0.3527937650680542, + "step": 43130 + }, + { + "epoch": 0.18520903634630742, + "grad_norm": 0.013647548854351044, + "learning_rate": 8.183084259634539e-05, + "loss": 0.26084198951721194, + "step": 43140 + }, + { + "epoch": 0.1852519684363274, + "grad_norm": 0.07848212122917175, + "learning_rate": 8.182653087622777e-05, + "loss": 0.39730684757232665, + "step": 43150 + }, + { + "epoch": 0.18529490052634742, + "grad_norm": 0.002793788444250822, + "learning_rate": 8.182221915611015e-05, + "loss": 0.23335707187652588, + "step": 43160 + }, + { + "epoch": 0.18533783261636744, + "grad_norm": 0.018101299181580544, + "learning_rate": 8.181790743599251e-05, + "loss": 0.2064742088317871, + "step": 43170 + }, + { + "epoch": 0.18538076470638742, + "grad_norm": 0.9697562456130981, + "learning_rate": 8.181359571587489e-05, + "loss": 0.2400217294692993, + "step": 43180 + }, + { + "epoch": 0.18542369679640744, + "grad_norm": 0.07094231992959976, + "learning_rate": 8.180928399575727e-05, + "loss": 0.3039929151535034, + "step": 43190 + }, + { + "epoch": 0.18546662888642745, + "grad_norm": 0.013909654691815376, + "learning_rate": 8.180497227563964e-05, + "loss": 0.27940926551818845, + "step": 43200 + }, + { + "epoch": 0.18550956097644747, + "grad_norm": 0.03464217483997345, + "learning_rate": 8.180066055552202e-05, + "loss": 0.06249539852142334, + "step": 43210 + }, + { + "epoch": 0.18555249306646746, + "grad_norm": 0.022050119936466217, + "learning_rate": 8.17963488354044e-05, + "loss": 0.2281078577041626, + "step": 43220 + }, + { + "epoch": 0.18559542515648747, + "grad_norm": 1.9440895318984985, + "learning_rate": 8.179203711528677e-05, + "loss": 0.5420735836029053, + "step": 43230 + }, + { + "epoch": 0.18563835724650748, + "grad_norm": 3.1730079650878906, + "learning_rate": 8.178772539516915e-05, + "loss": 0.1774951696395874, + "step": 43240 + }, + { + "epoch": 0.18568128933652747, + "grad_norm": 0.05383119732141495, + "learning_rate": 8.178341367505153e-05, + "loss": 0.11135185956954956, + "step": 43250 + }, + { + "epoch": 0.1857242214265475, + "grad_norm": 0.0012574224965646863, + "learning_rate": 8.17791019549339e-05, + "loss": 0.2155439853668213, + "step": 43260 + }, + { + "epoch": 0.1857671535165675, + "grad_norm": 0.07369806617498398, + "learning_rate": 8.177479023481628e-05, + "loss": 0.313634467124939, + "step": 43270 + }, + { + "epoch": 0.1858100856065875, + "grad_norm": 0.4887748062610626, + "learning_rate": 8.177047851469866e-05, + "loss": 0.3466495037078857, + "step": 43280 + }, + { + "epoch": 0.1858530176966075, + "grad_norm": 0.11614210903644562, + "learning_rate": 8.176616679458104e-05, + "loss": 0.16023856401443481, + "step": 43290 + }, + { + "epoch": 0.18589594978662752, + "grad_norm": 0.01644720695912838, + "learning_rate": 8.176185507446342e-05, + "loss": 0.14743173122406006, + "step": 43300 + }, + { + "epoch": 0.18593888187664753, + "grad_norm": 2.1997272968292236, + "learning_rate": 8.17575433543458e-05, + "loss": 0.15039613246917724, + "step": 43310 + }, + { + "epoch": 0.18598181396666752, + "grad_norm": 0.0693756565451622, + "learning_rate": 8.175323163422817e-05, + "loss": 0.3908696174621582, + "step": 43320 + }, + { + "epoch": 0.18602474605668753, + "grad_norm": 0.02574329636991024, + "learning_rate": 8.174891991411055e-05, + "loss": 0.15041611194610596, + "step": 43330 + }, + { + "epoch": 0.18606767814670755, + "grad_norm": 3.0219974517822266, + "learning_rate": 8.174460819399291e-05, + "loss": 0.37598633766174316, + "step": 43340 + }, + { + "epoch": 0.18611061023672754, + "grad_norm": 6.371044635772705, + "learning_rate": 8.174029647387529e-05, + "loss": 0.24004921913146973, + "step": 43350 + }, + { + "epoch": 0.18615354232674755, + "grad_norm": 0.007055271882563829, + "learning_rate": 8.173598475375767e-05, + "loss": 0.1907490611076355, + "step": 43360 + }, + { + "epoch": 0.18619647441676757, + "grad_norm": 0.8306063413619995, + "learning_rate": 8.173167303364004e-05, + "loss": 0.2930524587631226, + "step": 43370 + }, + { + "epoch": 0.18623940650678755, + "grad_norm": 0.6232219934463501, + "learning_rate": 8.172736131352242e-05, + "loss": 0.27646067142486574, + "step": 43380 + }, + { + "epoch": 0.18628233859680757, + "grad_norm": 5.046137809753418, + "learning_rate": 8.17230495934048e-05, + "loss": 0.2907963752746582, + "step": 43390 + }, + { + "epoch": 0.18632527068682758, + "grad_norm": 0.12482646852731705, + "learning_rate": 8.171873787328718e-05, + "loss": 0.2309260129928589, + "step": 43400 + }, + { + "epoch": 0.18636820277684757, + "grad_norm": 1.5691694021224976, + "learning_rate": 8.171442615316955e-05, + "loss": 0.267235255241394, + "step": 43410 + }, + { + "epoch": 0.18641113486686758, + "grad_norm": 0.054734162986278534, + "learning_rate": 8.171011443305192e-05, + "loss": 0.5326226711273193, + "step": 43420 + }, + { + "epoch": 0.1864540669568876, + "grad_norm": 0.022658541798591614, + "learning_rate": 8.17058027129343e-05, + "loss": 0.3653179883956909, + "step": 43430 + }, + { + "epoch": 0.1864969990469076, + "grad_norm": 1.6763198375701904, + "learning_rate": 8.170149099281667e-05, + "loss": 0.38337364196777346, + "step": 43440 + }, + { + "epoch": 0.1865399311369276, + "grad_norm": 5.016429424285889, + "learning_rate": 8.169717927269905e-05, + "loss": 0.19746410846710205, + "step": 43450 + }, + { + "epoch": 0.18658286322694762, + "grad_norm": 1.6247531175613403, + "learning_rate": 8.169286755258143e-05, + "loss": 0.10856088399887084, + "step": 43460 + }, + { + "epoch": 0.18662579531696763, + "grad_norm": 0.14577797055244446, + "learning_rate": 8.16885558324638e-05, + "loss": 0.29924936294555665, + "step": 43470 + }, + { + "epoch": 0.18666872740698762, + "grad_norm": 0.13083939254283905, + "learning_rate": 8.168424411234618e-05, + "loss": 0.2384188652038574, + "step": 43480 + }, + { + "epoch": 0.18671165949700763, + "grad_norm": 7.798047065734863, + "learning_rate": 8.167993239222856e-05, + "loss": 0.30132806301116943, + "step": 43490 + }, + { + "epoch": 0.18675459158702765, + "grad_norm": 0.5570793151855469, + "learning_rate": 8.167562067211094e-05, + "loss": 0.27629728317260743, + "step": 43500 + }, + { + "epoch": 0.18679752367704763, + "grad_norm": 0.05040042847394943, + "learning_rate": 8.167130895199331e-05, + "loss": 0.060433989763259886, + "step": 43510 + }, + { + "epoch": 0.18684045576706765, + "grad_norm": 0.03875486180186272, + "learning_rate": 8.166699723187569e-05, + "loss": 0.22466940879821778, + "step": 43520 + }, + { + "epoch": 0.18688338785708766, + "grad_norm": 2.4116384983062744, + "learning_rate": 8.166268551175807e-05, + "loss": 0.35434484481811523, + "step": 43530 + }, + { + "epoch": 0.18692631994710768, + "grad_norm": 4.129846096038818, + "learning_rate": 8.165837379164045e-05, + "loss": 0.46059222221374513, + "step": 43540 + }, + { + "epoch": 0.18696925203712766, + "grad_norm": 0.02307182177901268, + "learning_rate": 8.165406207152282e-05, + "loss": 0.17703438997268678, + "step": 43550 + }, + { + "epoch": 0.18701218412714768, + "grad_norm": 3.282421827316284, + "learning_rate": 8.16497503514052e-05, + "loss": 0.20219864845275878, + "step": 43560 + }, + { + "epoch": 0.1870551162171677, + "grad_norm": 2.4306914806365967, + "learning_rate": 8.164543863128758e-05, + "loss": 0.3165069341659546, + "step": 43570 + }, + { + "epoch": 0.18709804830718768, + "grad_norm": 3.8549060821533203, + "learning_rate": 8.164112691116994e-05, + "loss": 0.20757479667663575, + "step": 43580 + }, + { + "epoch": 0.1871409803972077, + "grad_norm": 0.05286364257335663, + "learning_rate": 8.163681519105232e-05, + "loss": 0.20257492065429689, + "step": 43590 + }, + { + "epoch": 0.1871839124872277, + "grad_norm": 1.4247897863388062, + "learning_rate": 8.16325034709347e-05, + "loss": 0.1439652681350708, + "step": 43600 + }, + { + "epoch": 0.1872268445772477, + "grad_norm": 1.548986792564392, + "learning_rate": 8.162819175081707e-05, + "loss": 0.18846890926361085, + "step": 43610 + }, + { + "epoch": 0.1872697766672677, + "grad_norm": 0.008255302906036377, + "learning_rate": 8.162388003069945e-05, + "loss": 0.20841715335845948, + "step": 43620 + }, + { + "epoch": 0.18731270875728773, + "grad_norm": 0.42507603764533997, + "learning_rate": 8.161956831058183e-05, + "loss": 0.2078617811203003, + "step": 43630 + }, + { + "epoch": 0.18735564084730774, + "grad_norm": 0.04114179313182831, + "learning_rate": 8.16152565904642e-05, + "loss": 0.04051432609558105, + "step": 43640 + }, + { + "epoch": 0.18739857293732773, + "grad_norm": 0.010206053033471107, + "learning_rate": 8.161094487034658e-05, + "loss": 0.35066268444061277, + "step": 43650 + }, + { + "epoch": 0.18744150502734774, + "grad_norm": 0.012079720385372639, + "learning_rate": 8.160663315022896e-05, + "loss": 0.17734284400939943, + "step": 43660 + }, + { + "epoch": 0.18748443711736776, + "grad_norm": 0.00811173114925623, + "learning_rate": 8.160232143011132e-05, + "loss": 0.1144661545753479, + "step": 43670 + }, + { + "epoch": 0.18752736920738775, + "grad_norm": 0.3730086088180542, + "learning_rate": 8.15980097099937e-05, + "loss": 0.16818068027496338, + "step": 43680 + }, + { + "epoch": 0.18757030129740776, + "grad_norm": 0.024107687175273895, + "learning_rate": 8.159369798987608e-05, + "loss": 0.28643581867218015, + "step": 43690 + }, + { + "epoch": 0.18761323338742777, + "grad_norm": 0.023587489500641823, + "learning_rate": 8.158938626975846e-05, + "loss": 0.27016143798828124, + "step": 43700 + }, + { + "epoch": 0.18765616547744776, + "grad_norm": 1.4881666898727417, + "learning_rate": 8.158507454964083e-05, + "loss": 0.2365088939666748, + "step": 43710 + }, + { + "epoch": 0.18769909756746778, + "grad_norm": 0.2194991409778595, + "learning_rate": 8.158076282952321e-05, + "loss": 0.31122922897338867, + "step": 43720 + }, + { + "epoch": 0.1877420296574878, + "grad_norm": 3.471586227416992, + "learning_rate": 8.157645110940559e-05, + "loss": 0.30818378925323486, + "step": 43730 + }, + { + "epoch": 0.1877849617475078, + "grad_norm": 3.3636248111724854, + "learning_rate": 8.157213938928796e-05, + "loss": 0.13375380039215087, + "step": 43740 + }, + { + "epoch": 0.1878278938375278, + "grad_norm": 3.003509521484375, + "learning_rate": 8.156782766917034e-05, + "loss": 0.17285083532333373, + "step": 43750 + }, + { + "epoch": 0.1878708259275478, + "grad_norm": 0.09808559715747833, + "learning_rate": 8.156351594905272e-05, + "loss": 0.18395047187805175, + "step": 43760 + }, + { + "epoch": 0.18791375801756782, + "grad_norm": 0.08174111694097519, + "learning_rate": 8.15592042289351e-05, + "loss": 0.31794826984405516, + "step": 43770 + }, + { + "epoch": 0.1879566901075878, + "grad_norm": 3.1812849044799805, + "learning_rate": 8.155489250881747e-05, + "loss": 0.259110426902771, + "step": 43780 + }, + { + "epoch": 0.18799962219760782, + "grad_norm": 2.050807476043701, + "learning_rate": 8.155058078869985e-05, + "loss": 0.2605229139328003, + "step": 43790 + }, + { + "epoch": 0.18804255428762784, + "grad_norm": 1.9166364669799805, + "learning_rate": 8.154626906858223e-05, + "loss": 0.3418402910232544, + "step": 43800 + }, + { + "epoch": 0.18808548637764783, + "grad_norm": 0.7631783485412598, + "learning_rate": 8.15419573484646e-05, + "loss": 0.2409060001373291, + "step": 43810 + }, + { + "epoch": 0.18812841846766784, + "grad_norm": 2.0551998615264893, + "learning_rate": 8.153764562834698e-05, + "loss": 0.31589469909667967, + "step": 43820 + }, + { + "epoch": 0.18817135055768786, + "grad_norm": 0.06294999271631241, + "learning_rate": 8.153333390822935e-05, + "loss": 0.4397084712982178, + "step": 43830 + }, + { + "epoch": 0.18821428264770784, + "grad_norm": 3.0229833126068115, + "learning_rate": 8.152902218811172e-05, + "loss": 0.24915146827697754, + "step": 43840 + }, + { + "epoch": 0.18825721473772786, + "grad_norm": 0.04572201520204544, + "learning_rate": 8.15247104679941e-05, + "loss": 0.2244415283203125, + "step": 43850 + }, + { + "epoch": 0.18830014682774787, + "grad_norm": 2.7412846088409424, + "learning_rate": 8.152039874787648e-05, + "loss": 0.2302554130554199, + "step": 43860 + }, + { + "epoch": 0.1883430789177679, + "grad_norm": 1.3648289442062378, + "learning_rate": 8.151608702775886e-05, + "loss": 0.2644734144210815, + "step": 43870 + }, + { + "epoch": 0.18838601100778787, + "grad_norm": 0.25226426124572754, + "learning_rate": 8.151177530764123e-05, + "loss": 0.11064702272415161, + "step": 43880 + }, + { + "epoch": 0.1884289430978079, + "grad_norm": 0.028316723182797432, + "learning_rate": 8.150746358752361e-05, + "loss": 0.3318798542022705, + "step": 43890 + }, + { + "epoch": 0.1884718751878279, + "grad_norm": 4.9640631675720215, + "learning_rate": 8.150315186740599e-05, + "loss": 0.2857161521911621, + "step": 43900 + }, + { + "epoch": 0.1885148072778479, + "grad_norm": 0.480720579624176, + "learning_rate": 8.149884014728835e-05, + "loss": 0.1920159101486206, + "step": 43910 + }, + { + "epoch": 0.1885577393678679, + "grad_norm": 0.23835653066635132, + "learning_rate": 8.149452842717073e-05, + "loss": 0.3155388355255127, + "step": 43920 + }, + { + "epoch": 0.18860067145788792, + "grad_norm": 0.02639954164624214, + "learning_rate": 8.149021670705311e-05, + "loss": 0.30919642448425294, + "step": 43930 + }, + { + "epoch": 0.1886436035479079, + "grad_norm": 0.004375265445560217, + "learning_rate": 8.148590498693548e-05, + "loss": 0.245635724067688, + "step": 43940 + }, + { + "epoch": 0.18868653563792792, + "grad_norm": 0.0014972257195040584, + "learning_rate": 8.148159326681786e-05, + "loss": 0.3194050073623657, + "step": 43950 + }, + { + "epoch": 0.18872946772794794, + "grad_norm": 0.3180157244205475, + "learning_rate": 8.147728154670024e-05, + "loss": 0.05688638091087341, + "step": 43960 + }, + { + "epoch": 0.18877239981796795, + "grad_norm": 1.7093822956085205, + "learning_rate": 8.147296982658263e-05, + "loss": 0.3212114334106445, + "step": 43970 + }, + { + "epoch": 0.18881533190798794, + "grad_norm": 1.0756118297576904, + "learning_rate": 8.146865810646501e-05, + "loss": 0.27136006355285647, + "step": 43980 + }, + { + "epoch": 0.18885826399800795, + "grad_norm": 0.05142730847001076, + "learning_rate": 8.146434638634738e-05, + "loss": 0.2458888292312622, + "step": 43990 + }, + { + "epoch": 0.18890119608802797, + "grad_norm": 2.8642890453338623, + "learning_rate": 8.146003466622975e-05, + "loss": 0.2910621643066406, + "step": 44000 + }, + { + "epoch": 0.18890119608802797, + "eval_loss": 0.4605604410171509, + "eval_runtime": 27.4334, + "eval_samples_per_second": 3.645, + "eval_steps_per_second": 3.645, + "step": 44000 + }, + { + "epoch": 0.18894412817804795, + "grad_norm": 1.807096004486084, + "learning_rate": 8.145572294611213e-05, + "loss": 0.3342024564743042, + "step": 44010 + }, + { + "epoch": 0.18898706026806797, + "grad_norm": 0.6657182574272156, + "learning_rate": 8.14514112259945e-05, + "loss": 0.05959969758987427, + "step": 44020 + }, + { + "epoch": 0.18902999235808798, + "grad_norm": 1.3843122720718384, + "learning_rate": 8.144709950587688e-05, + "loss": 0.10991348028182983, + "step": 44030 + }, + { + "epoch": 0.18907292444810797, + "grad_norm": 6.659622669219971, + "learning_rate": 8.144278778575926e-05, + "loss": 0.5495349407196045, + "step": 44040 + }, + { + "epoch": 0.18911585653812799, + "grad_norm": 3.2412028312683105, + "learning_rate": 8.143847606564164e-05, + "loss": 0.29310102462768556, + "step": 44050 + }, + { + "epoch": 0.189158788628148, + "grad_norm": 2.484363317489624, + "learning_rate": 8.143416434552401e-05, + "loss": 0.4231656551361084, + "step": 44060 + }, + { + "epoch": 0.18920172071816801, + "grad_norm": 0.0007273709634318948, + "learning_rate": 8.142985262540639e-05, + "loss": 0.0684486985206604, + "step": 44070 + }, + { + "epoch": 0.189244652808188, + "grad_norm": 5.1812920570373535, + "learning_rate": 8.142554090528875e-05, + "loss": 0.2998863935470581, + "step": 44080 + }, + { + "epoch": 0.18928758489820802, + "grad_norm": 0.029622757807374, + "learning_rate": 8.142122918517113e-05, + "loss": 0.18725346326828002, + "step": 44090 + }, + { + "epoch": 0.18933051698822803, + "grad_norm": 0.0006494583212770522, + "learning_rate": 8.141691746505351e-05, + "loss": 0.22647314071655272, + "step": 44100 + }, + { + "epoch": 0.18937344907824802, + "grad_norm": 1.4165912866592407, + "learning_rate": 8.141260574493589e-05, + "loss": 0.28343505859375, + "step": 44110 + }, + { + "epoch": 0.18941638116826803, + "grad_norm": 0.4761173725128174, + "learning_rate": 8.140829402481826e-05, + "loss": 0.11881612539291382, + "step": 44120 + }, + { + "epoch": 0.18945931325828805, + "grad_norm": 0.06735536456108093, + "learning_rate": 8.140398230470064e-05, + "loss": 0.3430104494094849, + "step": 44130 + }, + { + "epoch": 0.18950224534830803, + "grad_norm": 0.007762636058032513, + "learning_rate": 8.139967058458302e-05, + "loss": 0.5389622211456299, + "step": 44140 + }, + { + "epoch": 0.18954517743832805, + "grad_norm": 1.3800119161605835, + "learning_rate": 8.13953588644654e-05, + "loss": 0.24931068420410157, + "step": 44150 + }, + { + "epoch": 0.18958810952834806, + "grad_norm": 0.880368173122406, + "learning_rate": 8.139104714434776e-05, + "loss": 0.12496919631958008, + "step": 44160 + }, + { + "epoch": 0.18963104161836808, + "grad_norm": 0.0014428679132834077, + "learning_rate": 8.138673542423014e-05, + "loss": 0.07668641209602356, + "step": 44170 + }, + { + "epoch": 0.18967397370838807, + "grad_norm": 2.079153060913086, + "learning_rate": 8.138242370411251e-05, + "loss": 0.2590550422668457, + "step": 44180 + }, + { + "epoch": 0.18971690579840808, + "grad_norm": 0.15682633221149445, + "learning_rate": 8.13781119839949e-05, + "loss": 0.05421972870826721, + "step": 44190 + }, + { + "epoch": 0.1897598378884281, + "grad_norm": 0.07640580087900162, + "learning_rate": 8.137380026387728e-05, + "loss": 0.19816200733184813, + "step": 44200 + }, + { + "epoch": 0.18980276997844808, + "grad_norm": 0.005920249968767166, + "learning_rate": 8.136948854375966e-05, + "loss": 0.34413843154907225, + "step": 44210 + }, + { + "epoch": 0.1898457020684681, + "grad_norm": 2.7683632373809814, + "learning_rate": 8.136517682364204e-05, + "loss": 0.4210531234741211, + "step": 44220 + }, + { + "epoch": 0.1898886341584881, + "grad_norm": 1.1772396564483643, + "learning_rate": 8.136086510352441e-05, + "loss": 0.2988885879516602, + "step": 44230 + }, + { + "epoch": 0.1899315662485081, + "grad_norm": 0.8943934440612793, + "learning_rate": 8.135655338340678e-05, + "loss": 0.2540787696838379, + "step": 44240 + }, + { + "epoch": 0.1899744983385281, + "grad_norm": 0.004835850093513727, + "learning_rate": 8.135224166328916e-05, + "loss": 0.08346214890480042, + "step": 44250 + }, + { + "epoch": 0.19001743042854813, + "grad_norm": 4.239825248718262, + "learning_rate": 8.134792994317153e-05, + "loss": 0.22071146965026855, + "step": 44260 + }, + { + "epoch": 0.19006036251856812, + "grad_norm": 0.07698159664869308, + "learning_rate": 8.134361822305391e-05, + "loss": 0.1325700879096985, + "step": 44270 + }, + { + "epoch": 0.19010329460858813, + "grad_norm": 2.4468345642089844, + "learning_rate": 8.133930650293629e-05, + "loss": 0.3593152046203613, + "step": 44280 + }, + { + "epoch": 0.19014622669860815, + "grad_norm": 41.485435485839844, + "learning_rate": 8.133499478281866e-05, + "loss": 0.08537226915359497, + "step": 44290 + }, + { + "epoch": 0.19018915878862816, + "grad_norm": 2.0043179988861084, + "learning_rate": 8.133068306270104e-05, + "loss": 0.135001802444458, + "step": 44300 + }, + { + "epoch": 0.19023209087864815, + "grad_norm": 0.0291176475584507, + "learning_rate": 8.132637134258342e-05, + "loss": 0.2878988742828369, + "step": 44310 + }, + { + "epoch": 0.19027502296866816, + "grad_norm": 0.2772849202156067, + "learning_rate": 8.132205962246578e-05, + "loss": 0.20665276050567627, + "step": 44320 + }, + { + "epoch": 0.19031795505868818, + "grad_norm": 0.02503611147403717, + "learning_rate": 8.131774790234816e-05, + "loss": 0.29029097557067873, + "step": 44330 + }, + { + "epoch": 0.19036088714870816, + "grad_norm": 1.3659212589263916, + "learning_rate": 8.131343618223054e-05, + "loss": 0.21326007843017578, + "step": 44340 + }, + { + "epoch": 0.19040381923872818, + "grad_norm": 2.154261589050293, + "learning_rate": 8.130912446211292e-05, + "loss": 0.29660470485687257, + "step": 44350 + }, + { + "epoch": 0.1904467513287482, + "grad_norm": 0.5898604989051819, + "learning_rate": 8.130481274199529e-05, + "loss": 0.2806436777114868, + "step": 44360 + }, + { + "epoch": 0.19048968341876818, + "grad_norm": 0.006019935477524996, + "learning_rate": 8.130050102187767e-05, + "loss": 0.21853513717651368, + "step": 44370 + }, + { + "epoch": 0.1905326155087882, + "grad_norm": 0.09301898628473282, + "learning_rate": 8.129618930176005e-05, + "loss": 0.44185843467712405, + "step": 44380 + }, + { + "epoch": 0.1905755475988082, + "grad_norm": 0.16995981335639954, + "learning_rate": 8.129187758164242e-05, + "loss": 0.22304224967956543, + "step": 44390 + }, + { + "epoch": 0.19061847968882822, + "grad_norm": 0.0012088268995285034, + "learning_rate": 8.12875658615248e-05, + "loss": 0.23868961334228517, + "step": 44400 + }, + { + "epoch": 0.1906614117788482, + "grad_norm": 0.14039476215839386, + "learning_rate": 8.128325414140718e-05, + "loss": 0.2872094869613647, + "step": 44410 + }, + { + "epoch": 0.19070434386886823, + "grad_norm": 0.004024143796414137, + "learning_rate": 8.127894242128956e-05, + "loss": 0.19596340656280517, + "step": 44420 + }, + { + "epoch": 0.19074727595888824, + "grad_norm": 0.02372003346681595, + "learning_rate": 8.127463070117193e-05, + "loss": 0.295436954498291, + "step": 44430 + }, + { + "epoch": 0.19079020804890823, + "grad_norm": 0.023033304139971733, + "learning_rate": 8.127031898105431e-05, + "loss": 0.34102492332458495, + "step": 44440 + }, + { + "epoch": 0.19083314013892824, + "grad_norm": 3.642709493637085, + "learning_rate": 8.126600726093669e-05, + "loss": 0.3784060478210449, + "step": 44450 + }, + { + "epoch": 0.19087607222894826, + "grad_norm": 0.009738151915371418, + "learning_rate": 8.126169554081907e-05, + "loss": 0.33837051391601564, + "step": 44460 + }, + { + "epoch": 0.19091900431896824, + "grad_norm": 0.08202206343412399, + "learning_rate": 8.125738382070144e-05, + "loss": 0.23539764881134034, + "step": 44470 + }, + { + "epoch": 0.19096193640898826, + "grad_norm": 0.3848929703235626, + "learning_rate": 8.125307210058382e-05, + "loss": 0.2307135581970215, + "step": 44480 + }, + { + "epoch": 0.19100486849900827, + "grad_norm": 0.019833676517009735, + "learning_rate": 8.124876038046618e-05, + "loss": 0.22377758026123046, + "step": 44490 + }, + { + "epoch": 0.1910478005890283, + "grad_norm": 0.027888990938663483, + "learning_rate": 8.124444866034856e-05, + "loss": 0.2692965030670166, + "step": 44500 + }, + { + "epoch": 0.19109073267904828, + "grad_norm": 0.05748312547802925, + "learning_rate": 8.124013694023094e-05, + "loss": 0.31379878520965576, + "step": 44510 + }, + { + "epoch": 0.1911336647690683, + "grad_norm": 0.06689203530550003, + "learning_rate": 8.123582522011332e-05, + "loss": 0.32233970165252684, + "step": 44520 + }, + { + "epoch": 0.1911765968590883, + "grad_norm": 0.008352093398571014, + "learning_rate": 8.12315134999957e-05, + "loss": 0.1896106481552124, + "step": 44530 + }, + { + "epoch": 0.1912195289491083, + "grad_norm": 0.14758487045764923, + "learning_rate": 8.122720177987807e-05, + "loss": 0.1729714035987854, + "step": 44540 + }, + { + "epoch": 0.1912624610391283, + "grad_norm": 0.03349534422159195, + "learning_rate": 8.122289005976045e-05, + "loss": 0.16641179323196412, + "step": 44550 + }, + { + "epoch": 0.19130539312914832, + "grad_norm": 0.08998719602823257, + "learning_rate": 8.121857833964283e-05, + "loss": 0.23800258636474608, + "step": 44560 + }, + { + "epoch": 0.1913483252191683, + "grad_norm": 2.7044553756713867, + "learning_rate": 8.121426661952519e-05, + "loss": 0.1590886116027832, + "step": 44570 + }, + { + "epoch": 0.19139125730918832, + "grad_norm": 1.6016440391540527, + "learning_rate": 8.120995489940757e-05, + "loss": 0.36989946365356446, + "step": 44580 + }, + { + "epoch": 0.19143418939920834, + "grad_norm": 3.2121012210845947, + "learning_rate": 8.120564317928994e-05, + "loss": 0.31107995510101316, + "step": 44590 + }, + { + "epoch": 0.19147712148922835, + "grad_norm": 0.006490649655461311, + "learning_rate": 8.120133145917232e-05, + "loss": 0.21261978149414062, + "step": 44600 + }, + { + "epoch": 0.19152005357924834, + "grad_norm": 0.019323797896504402, + "learning_rate": 8.11970197390547e-05, + "loss": 0.12796977758407593, + "step": 44610 + }, + { + "epoch": 0.19156298566926835, + "grad_norm": 2.3980066776275635, + "learning_rate": 8.119270801893708e-05, + "loss": 0.29838697910308837, + "step": 44620 + }, + { + "epoch": 0.19160591775928837, + "grad_norm": 10.988465309143066, + "learning_rate": 8.118839629881945e-05, + "loss": 0.4121725082397461, + "step": 44630 + }, + { + "epoch": 0.19164884984930836, + "grad_norm": 0.031216247007250786, + "learning_rate": 8.118408457870183e-05, + "loss": 0.1725583553314209, + "step": 44640 + }, + { + "epoch": 0.19169178193932837, + "grad_norm": 0.5122570395469666, + "learning_rate": 8.117977285858421e-05, + "loss": 0.26382031440734866, + "step": 44650 + }, + { + "epoch": 0.19173471402934839, + "grad_norm": 0.006867983378469944, + "learning_rate": 8.117546113846659e-05, + "loss": 0.03699140548706055, + "step": 44660 + }, + { + "epoch": 0.19177764611936837, + "grad_norm": 1.920220971107483, + "learning_rate": 8.117114941834896e-05, + "loss": 0.29547007083892823, + "step": 44670 + }, + { + "epoch": 0.1918205782093884, + "grad_norm": 0.6627082228660583, + "learning_rate": 8.116683769823134e-05, + "loss": 0.21683313846588134, + "step": 44680 + }, + { + "epoch": 0.1918635102994084, + "grad_norm": 1.7581837177276611, + "learning_rate": 8.116252597811372e-05, + "loss": 0.11833486557006836, + "step": 44690 + }, + { + "epoch": 0.1919064423894284, + "grad_norm": 1.6450203657150269, + "learning_rate": 8.11582142579961e-05, + "loss": 0.35963356494903564, + "step": 44700 + }, + { + "epoch": 0.1919493744794484, + "grad_norm": 2.345304489135742, + "learning_rate": 8.115390253787847e-05, + "loss": 0.28931460380554197, + "step": 44710 + }, + { + "epoch": 0.19199230656946842, + "grad_norm": 0.010913246311247349, + "learning_rate": 8.114959081776085e-05, + "loss": 0.22629737854003906, + "step": 44720 + }, + { + "epoch": 0.19203523865948843, + "grad_norm": 0.5741439461708069, + "learning_rate": 8.114527909764323e-05, + "loss": 0.25016350746154786, + "step": 44730 + }, + { + "epoch": 0.19207817074950842, + "grad_norm": 0.7402640581130981, + "learning_rate": 8.114096737752559e-05, + "loss": 0.32834055423736574, + "step": 44740 + }, + { + "epoch": 0.19212110283952843, + "grad_norm": 0.04131797328591347, + "learning_rate": 8.113665565740797e-05, + "loss": 0.3026629686355591, + "step": 44750 + }, + { + "epoch": 0.19216403492954845, + "grad_norm": 1.0897802114486694, + "learning_rate": 8.113234393729035e-05, + "loss": 0.18321956396102906, + "step": 44760 + }, + { + "epoch": 0.19220696701956844, + "grad_norm": 0.0162035059183836, + "learning_rate": 8.112803221717272e-05, + "loss": 0.22167975902557374, + "step": 44770 + }, + { + "epoch": 0.19224989910958845, + "grad_norm": 0.018093420192599297, + "learning_rate": 8.11237204970551e-05, + "loss": 0.04558416903018951, + "step": 44780 + }, + { + "epoch": 0.19229283119960847, + "grad_norm": 0.211838498711586, + "learning_rate": 8.111940877693748e-05, + "loss": 0.33308026790618894, + "step": 44790 + }, + { + "epoch": 0.19233576328962845, + "grad_norm": 0.0031913614366203547, + "learning_rate": 8.111509705681985e-05, + "loss": 0.2766871929168701, + "step": 44800 + }, + { + "epoch": 0.19237869537964847, + "grad_norm": 0.028377799317240715, + "learning_rate": 8.111078533670223e-05, + "loss": 0.11854208707809448, + "step": 44810 + }, + { + "epoch": 0.19242162746966848, + "grad_norm": 0.04484991356730461, + "learning_rate": 8.11064736165846e-05, + "loss": 0.09982895851135254, + "step": 44820 + }, + { + "epoch": 0.1924645595596885, + "grad_norm": 0.08299509435892105, + "learning_rate": 8.110216189646697e-05, + "loss": 0.40526819229125977, + "step": 44830 + }, + { + "epoch": 0.19250749164970848, + "grad_norm": 0.25029635429382324, + "learning_rate": 8.109785017634935e-05, + "loss": 0.008327079564332962, + "step": 44840 + }, + { + "epoch": 0.1925504237397285, + "grad_norm": 0.3442751169204712, + "learning_rate": 8.109353845623173e-05, + "loss": 0.13076180219650269, + "step": 44850 + }, + { + "epoch": 0.1925933558297485, + "grad_norm": 0.012612867169082165, + "learning_rate": 8.10892267361141e-05, + "loss": 0.0522712230682373, + "step": 44860 + }, + { + "epoch": 0.1926362879197685, + "grad_norm": 0.09985598921775818, + "learning_rate": 8.108491501599648e-05, + "loss": 0.4725681781768799, + "step": 44870 + }, + { + "epoch": 0.19267922000978852, + "grad_norm": 0.15437287092208862, + "learning_rate": 8.108060329587886e-05, + "loss": 0.2096705436706543, + "step": 44880 + }, + { + "epoch": 0.19272215209980853, + "grad_norm": 3.897529363632202, + "learning_rate": 8.107629157576124e-05, + "loss": 0.24176254272460937, + "step": 44890 + }, + { + "epoch": 0.19276508418982852, + "grad_norm": 1.8788121938705444, + "learning_rate": 8.107197985564361e-05, + "loss": 0.13124208450317382, + "step": 44900 + }, + { + "epoch": 0.19280801627984853, + "grad_norm": 3.367654800415039, + "learning_rate": 8.106766813552599e-05, + "loss": 0.15661016702651978, + "step": 44910 + }, + { + "epoch": 0.19285094836986855, + "grad_norm": 1.950370192527771, + "learning_rate": 8.106335641540837e-05, + "loss": 0.23335545063018798, + "step": 44920 + }, + { + "epoch": 0.19289388045988856, + "grad_norm": 0.7737196087837219, + "learning_rate": 8.105904469529075e-05, + "loss": 0.1777315616607666, + "step": 44930 + }, + { + "epoch": 0.19293681254990855, + "grad_norm": 6.4171462059021, + "learning_rate": 8.105473297517312e-05, + "loss": 0.4596564769744873, + "step": 44940 + }, + { + "epoch": 0.19297974463992856, + "grad_norm": 0.08766023814678192, + "learning_rate": 8.10504212550555e-05, + "loss": 0.1520202040672302, + "step": 44950 + }, + { + "epoch": 0.19302267672994858, + "grad_norm": 0.14796678721904755, + "learning_rate": 8.104610953493788e-05, + "loss": 0.41483144760131835, + "step": 44960 + }, + { + "epoch": 0.19306560881996856, + "grad_norm": 1.5162951946258545, + "learning_rate": 8.104179781482026e-05, + "loss": 0.22687315940856934, + "step": 44970 + }, + { + "epoch": 0.19310854090998858, + "grad_norm": 0.051669228821992874, + "learning_rate": 8.103748609470262e-05, + "loss": 0.42571425437927246, + "step": 44980 + }, + { + "epoch": 0.1931514730000086, + "grad_norm": 1.4661331176757812, + "learning_rate": 8.1033174374585e-05, + "loss": 0.1966134190559387, + "step": 44990 + }, + { + "epoch": 0.19319440509002858, + "grad_norm": 3.3682007789611816, + "learning_rate": 8.102886265446737e-05, + "loss": 0.19375011920928956, + "step": 45000 + }, + { + "epoch": 0.19319440509002858, + "eval_loss": 0.4712151288986206, + "eval_runtime": 27.4644, + "eval_samples_per_second": 3.641, + "eval_steps_per_second": 3.641, + "step": 45000 + }, + { + "epoch": 0.1932373371800486, + "grad_norm": 1.5242284536361694, + "learning_rate": 8.102455093434975e-05, + "loss": 0.28296632766723634, + "step": 45010 + }, + { + "epoch": 0.1932802692700686, + "grad_norm": 0.13998153805732727, + "learning_rate": 8.102023921423213e-05, + "loss": 0.2556852102279663, + "step": 45020 + }, + { + "epoch": 0.19332320136008863, + "grad_norm": 1.6157907247543335, + "learning_rate": 8.10159274941145e-05, + "loss": 0.13697457313537598, + "step": 45030 + }, + { + "epoch": 0.1933661334501086, + "grad_norm": 0.015710826963186264, + "learning_rate": 8.101161577399688e-05, + "loss": 0.18795580863952638, + "step": 45040 + }, + { + "epoch": 0.19340906554012863, + "grad_norm": 0.883563756942749, + "learning_rate": 8.100730405387926e-05, + "loss": 0.17424606084823607, + "step": 45050 + }, + { + "epoch": 0.19345199763014864, + "grad_norm": 0.014735180884599686, + "learning_rate": 8.100299233376163e-05, + "loss": 0.192516553401947, + "step": 45060 + }, + { + "epoch": 0.19349492972016863, + "grad_norm": 0.012432921677827835, + "learning_rate": 8.0998680613644e-05, + "loss": 0.29526875019073484, + "step": 45070 + }, + { + "epoch": 0.19353786181018864, + "grad_norm": 0.280377596616745, + "learning_rate": 8.099436889352638e-05, + "loss": 0.3330434799194336, + "step": 45080 + }, + { + "epoch": 0.19358079390020866, + "grad_norm": 0.03499651327729225, + "learning_rate": 8.099005717340876e-05, + "loss": 0.12696655988693237, + "step": 45090 + }, + { + "epoch": 0.19362372599022865, + "grad_norm": 2.5255191326141357, + "learning_rate": 8.098574545329113e-05, + "loss": 0.12479696273803711, + "step": 45100 + }, + { + "epoch": 0.19366665808024866, + "grad_norm": 0.135679230093956, + "learning_rate": 8.098143373317351e-05, + "loss": 0.19221022129058837, + "step": 45110 + }, + { + "epoch": 0.19370959017026868, + "grad_norm": 0.0067241620272397995, + "learning_rate": 8.097712201305589e-05, + "loss": 0.1390742063522339, + "step": 45120 + }, + { + "epoch": 0.19375252226028866, + "grad_norm": 1.7752548456192017, + "learning_rate": 8.097281029293827e-05, + "loss": 0.2600010633468628, + "step": 45130 + }, + { + "epoch": 0.19379545435030868, + "grad_norm": 0.21976950764656067, + "learning_rate": 8.096849857282064e-05, + "loss": 0.19792779684066772, + "step": 45140 + }, + { + "epoch": 0.1938383864403287, + "grad_norm": 0.033322304487228394, + "learning_rate": 8.096418685270302e-05, + "loss": 0.12441951036453247, + "step": 45150 + }, + { + "epoch": 0.1938813185303487, + "grad_norm": 1.1145471334457397, + "learning_rate": 8.09598751325854e-05, + "loss": 0.23317229747772217, + "step": 45160 + }, + { + "epoch": 0.1939242506203687, + "grad_norm": 1.6260700225830078, + "learning_rate": 8.095556341246778e-05, + "loss": 0.39760532379150393, + "step": 45170 + }, + { + "epoch": 0.1939671827103887, + "grad_norm": 1.8564512729644775, + "learning_rate": 8.095125169235015e-05, + "loss": 0.36122798919677734, + "step": 45180 + }, + { + "epoch": 0.19401011480040872, + "grad_norm": 3.138972043991089, + "learning_rate": 8.094693997223253e-05, + "loss": 0.2895623207092285, + "step": 45190 + }, + { + "epoch": 0.1940530468904287, + "grad_norm": 0.09472132474184036, + "learning_rate": 8.094262825211491e-05, + "loss": 0.33398478031158446, + "step": 45200 + }, + { + "epoch": 0.19409597898044872, + "grad_norm": 0.3423907160758972, + "learning_rate": 8.093831653199729e-05, + "loss": 0.182434344291687, + "step": 45210 + }, + { + "epoch": 0.19413891107046874, + "grad_norm": 2.8738393783569336, + "learning_rate": 8.093400481187966e-05, + "loss": 0.2890277624130249, + "step": 45220 + }, + { + "epoch": 0.19418184316048873, + "grad_norm": 29.463497161865234, + "learning_rate": 8.092969309176203e-05, + "loss": 0.3977688789367676, + "step": 45230 + }, + { + "epoch": 0.19422477525050874, + "grad_norm": 0.23220296204090118, + "learning_rate": 8.09253813716444e-05, + "loss": 0.360453462600708, + "step": 45240 + }, + { + "epoch": 0.19426770734052876, + "grad_norm": 0.10836378484964371, + "learning_rate": 8.092106965152678e-05, + "loss": 0.23855133056640626, + "step": 45250 + }, + { + "epoch": 0.19431063943054877, + "grad_norm": 0.307105153799057, + "learning_rate": 8.091675793140916e-05, + "loss": 0.42743630409240724, + "step": 45260 + }, + { + "epoch": 0.19435357152056876, + "grad_norm": 4.20743989944458, + "learning_rate": 8.091244621129154e-05, + "loss": 0.24763765335083007, + "step": 45270 + }, + { + "epoch": 0.19439650361058877, + "grad_norm": 0.007932635955512524, + "learning_rate": 8.090813449117391e-05, + "loss": 0.19730513095855712, + "step": 45280 + }, + { + "epoch": 0.1944394357006088, + "grad_norm": 9.092166900634766, + "learning_rate": 8.090382277105629e-05, + "loss": 0.3102408409118652, + "step": 45290 + }, + { + "epoch": 0.19448236779062877, + "grad_norm": 2.336500406265259, + "learning_rate": 8.089951105093867e-05, + "loss": 0.15682946443557738, + "step": 45300 + }, + { + "epoch": 0.1945252998806488, + "grad_norm": 0.035069532692432404, + "learning_rate": 8.089519933082103e-05, + "loss": 0.13510195016860962, + "step": 45310 + }, + { + "epoch": 0.1945682319706688, + "grad_norm": 0.09467840194702148, + "learning_rate": 8.089088761070341e-05, + "loss": 0.23605630397796631, + "step": 45320 + }, + { + "epoch": 0.1946111640606888, + "grad_norm": 0.30349770188331604, + "learning_rate": 8.088657589058579e-05, + "loss": 0.3186908006668091, + "step": 45330 + }, + { + "epoch": 0.1946540961507088, + "grad_norm": 0.1543532758951187, + "learning_rate": 8.088226417046816e-05, + "loss": 0.10231984853744507, + "step": 45340 + }, + { + "epoch": 0.19469702824072882, + "grad_norm": 0.08216534554958344, + "learning_rate": 8.087795245035054e-05, + "loss": 0.16797515153884887, + "step": 45350 + }, + { + "epoch": 0.19473996033074883, + "grad_norm": 0.15961258113384247, + "learning_rate": 8.087364073023292e-05, + "loss": 0.21318306922912597, + "step": 45360 + }, + { + "epoch": 0.19478289242076882, + "grad_norm": 0.15939001739025116, + "learning_rate": 8.08693290101153e-05, + "loss": 0.2917843580245972, + "step": 45370 + }, + { + "epoch": 0.19482582451078884, + "grad_norm": 1.0433694124221802, + "learning_rate": 8.086501728999769e-05, + "loss": 0.5787501811981202, + "step": 45380 + }, + { + "epoch": 0.19486875660080885, + "grad_norm": 3.5344173908233643, + "learning_rate": 8.086070556988005e-05, + "loss": 0.1859116554260254, + "step": 45390 + }, + { + "epoch": 0.19491168869082884, + "grad_norm": 1.2646311521530151, + "learning_rate": 8.085639384976243e-05, + "loss": 0.3138707399368286, + "step": 45400 + }, + { + "epoch": 0.19495462078084885, + "grad_norm": 1.0844224691390991, + "learning_rate": 8.08520821296448e-05, + "loss": 0.09525437951087952, + "step": 45410 + }, + { + "epoch": 0.19499755287086887, + "grad_norm": 2.304156541824341, + "learning_rate": 8.084777040952718e-05, + "loss": 0.2916342973709106, + "step": 45420 + }, + { + "epoch": 0.19504048496088885, + "grad_norm": 3.35841703414917, + "learning_rate": 8.084345868940956e-05, + "loss": 0.34789600372314455, + "step": 45430 + }, + { + "epoch": 0.19508341705090887, + "grad_norm": 0.004667060449719429, + "learning_rate": 8.083914696929194e-05, + "loss": 0.32075483798980714, + "step": 45440 + }, + { + "epoch": 0.19512634914092888, + "grad_norm": 25.836605072021484, + "learning_rate": 8.083483524917431e-05, + "loss": 0.288403058052063, + "step": 45450 + }, + { + "epoch": 0.1951692812309489, + "grad_norm": 1.2741717100143433, + "learning_rate": 8.083052352905669e-05, + "loss": 0.42087607383728026, + "step": 45460 + }, + { + "epoch": 0.19521221332096889, + "grad_norm": 3.004124879837036, + "learning_rate": 8.082621180893907e-05, + "loss": 0.12746796607971192, + "step": 45470 + }, + { + "epoch": 0.1952551454109889, + "grad_norm": 2.3572230339050293, + "learning_rate": 8.082190008882143e-05, + "loss": 0.3464216947555542, + "step": 45480 + }, + { + "epoch": 0.19529807750100892, + "grad_norm": 2.225754976272583, + "learning_rate": 8.081758836870381e-05, + "loss": 0.23277955055236815, + "step": 45490 + }, + { + "epoch": 0.1953410095910289, + "grad_norm": 0.18235382437705994, + "learning_rate": 8.081327664858619e-05, + "loss": 0.3772527456283569, + "step": 45500 + }, + { + "epoch": 0.19538394168104892, + "grad_norm": 0.11060876399278641, + "learning_rate": 8.080896492846856e-05, + "loss": 0.15486321449279786, + "step": 45510 + }, + { + "epoch": 0.19542687377106893, + "grad_norm": 4.628691673278809, + "learning_rate": 8.080465320835094e-05, + "loss": 0.436336088180542, + "step": 45520 + }, + { + "epoch": 0.19546980586108892, + "grad_norm": 4.060401439666748, + "learning_rate": 8.080034148823332e-05, + "loss": 0.27948029041290284, + "step": 45530 + }, + { + "epoch": 0.19551273795110893, + "grad_norm": 0.030168868601322174, + "learning_rate": 8.07960297681157e-05, + "loss": 0.12020124197006225, + "step": 45540 + }, + { + "epoch": 0.19555567004112895, + "grad_norm": 0.1102621778845787, + "learning_rate": 8.079171804799807e-05, + "loss": 0.2364802360534668, + "step": 45550 + }, + { + "epoch": 0.19559860213114894, + "grad_norm": 1.1610252857208252, + "learning_rate": 8.078740632788044e-05, + "loss": 0.41650800704956054, + "step": 45560 + }, + { + "epoch": 0.19564153422116895, + "grad_norm": 0.2328631728887558, + "learning_rate": 8.078309460776282e-05, + "loss": 0.2034066677093506, + "step": 45570 + }, + { + "epoch": 0.19568446631118896, + "grad_norm": 0.0528264120221138, + "learning_rate": 8.077878288764519e-05, + "loss": 0.22113289833068847, + "step": 45580 + }, + { + "epoch": 0.19572739840120898, + "grad_norm": 0.9496865272521973, + "learning_rate": 8.077447116752757e-05, + "loss": 0.15208523273468016, + "step": 45590 + }, + { + "epoch": 0.19577033049122897, + "grad_norm": 2.1166505813598633, + "learning_rate": 8.077015944740996e-05, + "loss": 0.36841602325439454, + "step": 45600 + }, + { + "epoch": 0.19581326258124898, + "grad_norm": 0.0011364739621058106, + "learning_rate": 8.076584772729234e-05, + "loss": 0.24271912574768068, + "step": 45610 + }, + { + "epoch": 0.195856194671269, + "grad_norm": 0.042179618030786514, + "learning_rate": 8.076153600717472e-05, + "loss": 0.22032694816589354, + "step": 45620 + }, + { + "epoch": 0.19589912676128898, + "grad_norm": 0.07593486458063126, + "learning_rate": 8.075722428705709e-05, + "loss": 0.1293318510055542, + "step": 45630 + }, + { + "epoch": 0.195942058851309, + "grad_norm": 0.23231108486652374, + "learning_rate": 8.075291256693946e-05, + "loss": 0.30386996269226074, + "step": 45640 + }, + { + "epoch": 0.195984990941329, + "grad_norm": 0.14182336628437042, + "learning_rate": 8.074860084682183e-05, + "loss": 0.2837024450302124, + "step": 45650 + }, + { + "epoch": 0.196027923031349, + "grad_norm": 1.3760734796524048, + "learning_rate": 8.074428912670421e-05, + "loss": 0.5788982391357422, + "step": 45660 + }, + { + "epoch": 0.19607085512136901, + "grad_norm": 0.14667318761348724, + "learning_rate": 8.073997740658659e-05, + "loss": 0.20872790813446046, + "step": 45670 + }, + { + "epoch": 0.19611378721138903, + "grad_norm": 0.699029803276062, + "learning_rate": 8.073566568646897e-05, + "loss": 0.3040825605392456, + "step": 45680 + }, + { + "epoch": 0.19615671930140904, + "grad_norm": 1.2123295068740845, + "learning_rate": 8.073135396635134e-05, + "loss": 0.33064205646514894, + "step": 45690 + }, + { + "epoch": 0.19619965139142903, + "grad_norm": 0.019776469096541405, + "learning_rate": 8.072704224623372e-05, + "loss": 0.35395164489746095, + "step": 45700 + }, + { + "epoch": 0.19624258348144905, + "grad_norm": 0.46024268865585327, + "learning_rate": 8.07227305261161e-05, + "loss": 0.18437256813049316, + "step": 45710 + }, + { + "epoch": 0.19628551557146906, + "grad_norm": 2.247520923614502, + "learning_rate": 8.071841880599846e-05, + "loss": 0.48839592933654785, + "step": 45720 + }, + { + "epoch": 0.19632844766148905, + "grad_norm": 0.6118911504745483, + "learning_rate": 8.071410708588084e-05, + "loss": 0.05164738297462464, + "step": 45730 + }, + { + "epoch": 0.19637137975150906, + "grad_norm": 0.27519097924232483, + "learning_rate": 8.070979536576322e-05, + "loss": 0.33614468574523926, + "step": 45740 + }, + { + "epoch": 0.19641431184152908, + "grad_norm": 2.999359130859375, + "learning_rate": 8.07054836456456e-05, + "loss": 0.27184247970581055, + "step": 45750 + }, + { + "epoch": 0.19645724393154906, + "grad_norm": 0.006510626524686813, + "learning_rate": 8.070117192552797e-05, + "loss": 0.35111246109008787, + "step": 45760 + }, + { + "epoch": 0.19650017602156908, + "grad_norm": 4.526912689208984, + "learning_rate": 8.069686020541035e-05, + "loss": 0.2468196392059326, + "step": 45770 + }, + { + "epoch": 0.1965431081115891, + "grad_norm": 0.006337666884064674, + "learning_rate": 8.069254848529273e-05, + "loss": 0.2162738561630249, + "step": 45780 + }, + { + "epoch": 0.1965860402016091, + "grad_norm": 0.05239209532737732, + "learning_rate": 8.06882367651751e-05, + "loss": 0.26372151374816893, + "step": 45790 + }, + { + "epoch": 0.1966289722916291, + "grad_norm": 0.0049435412511229515, + "learning_rate": 8.068392504505748e-05, + "loss": 0.22712130546569825, + "step": 45800 + }, + { + "epoch": 0.1966719043816491, + "grad_norm": 0.3726454973220825, + "learning_rate": 8.067961332493984e-05, + "loss": 0.41004314422607424, + "step": 45810 + }, + { + "epoch": 0.19671483647166912, + "grad_norm": 0.12642237544059753, + "learning_rate": 8.067530160482224e-05, + "loss": 0.282349967956543, + "step": 45820 + }, + { + "epoch": 0.1967577685616891, + "grad_norm": 0.10512173175811768, + "learning_rate": 8.067098988470461e-05, + "loss": 0.09284887313842774, + "step": 45830 + }, + { + "epoch": 0.19680070065170913, + "grad_norm": 15.117431640625, + "learning_rate": 8.066667816458699e-05, + "loss": 0.21897082328796386, + "step": 45840 + }, + { + "epoch": 0.19684363274172914, + "grad_norm": 3.6390509605407715, + "learning_rate": 8.066236644446937e-05, + "loss": 0.3061159610748291, + "step": 45850 + }, + { + "epoch": 0.19688656483174913, + "grad_norm": 0.017141474410891533, + "learning_rate": 8.065805472435174e-05, + "loss": 0.13499906063079833, + "step": 45860 + }, + { + "epoch": 0.19692949692176914, + "grad_norm": 0.03623701259493828, + "learning_rate": 8.065374300423412e-05, + "loss": 0.17264734506607055, + "step": 45870 + }, + { + "epoch": 0.19697242901178916, + "grad_norm": 0.1779487431049347, + "learning_rate": 8.06494312841165e-05, + "loss": 0.3331061601638794, + "step": 45880 + }, + { + "epoch": 0.19701536110180914, + "grad_norm": 0.05001157894730568, + "learning_rate": 8.064511956399886e-05, + "loss": 0.20098867416381835, + "step": 45890 + }, + { + "epoch": 0.19705829319182916, + "grad_norm": 0.002003852277994156, + "learning_rate": 8.064080784388124e-05, + "loss": 0.09526382088661194, + "step": 45900 + }, + { + "epoch": 0.19710122528184917, + "grad_norm": 3.046260118484497, + "learning_rate": 8.063649612376362e-05, + "loss": 0.34651336669921873, + "step": 45910 + }, + { + "epoch": 0.1971441573718692, + "grad_norm": 0.08373679220676422, + "learning_rate": 8.0632184403646e-05, + "loss": 0.12939629554748536, + "step": 45920 + }, + { + "epoch": 0.19718708946188918, + "grad_norm": 0.00509268743917346, + "learning_rate": 8.062787268352837e-05, + "loss": 0.2628002166748047, + "step": 45930 + }, + { + "epoch": 0.1972300215519092, + "grad_norm": 0.006324404384940863, + "learning_rate": 8.062356096341075e-05, + "loss": 0.12146024703979492, + "step": 45940 + }, + { + "epoch": 0.1972729536419292, + "grad_norm": 1.9545040130615234, + "learning_rate": 8.061924924329313e-05, + "loss": 0.31609747409820554, + "step": 45950 + }, + { + "epoch": 0.1973158857319492, + "grad_norm": 0.3493014872074127, + "learning_rate": 8.06149375231755e-05, + "loss": 0.3573979139328003, + "step": 45960 + }, + { + "epoch": 0.1973588178219692, + "grad_norm": 0.21159689128398895, + "learning_rate": 8.061062580305787e-05, + "loss": 0.39376125335693357, + "step": 45970 + }, + { + "epoch": 0.19740174991198922, + "grad_norm": 0.6115752458572388, + "learning_rate": 8.060631408294025e-05, + "loss": 0.32958173751831055, + "step": 45980 + }, + { + "epoch": 0.1974446820020092, + "grad_norm": 0.007062565069645643, + "learning_rate": 8.060200236282262e-05, + "loss": 0.40116105079650877, + "step": 45990 + }, + { + "epoch": 0.19748761409202922, + "grad_norm": 3.6431825160980225, + "learning_rate": 8.0597690642705e-05, + "loss": 0.1228832483291626, + "step": 46000 + }, + { + "epoch": 0.19748761409202922, + "eval_loss": 0.4566121995449066, + "eval_runtime": 27.523, + "eval_samples_per_second": 3.633, + "eval_steps_per_second": 3.633, + "step": 46000 + }, + { + "epoch": 0.19753054618204924, + "grad_norm": 1.6863194704055786, + "learning_rate": 8.059337892258738e-05, + "loss": 0.2996741533279419, + "step": 46010 + }, + { + "epoch": 0.19757347827206925, + "grad_norm": 0.15534216165542603, + "learning_rate": 8.058906720246976e-05, + "loss": 0.22710778713226318, + "step": 46020 + }, + { + "epoch": 0.19761641036208924, + "grad_norm": 1.8629953861236572, + "learning_rate": 8.058475548235213e-05, + "loss": 0.37529573440551756, + "step": 46030 + }, + { + "epoch": 0.19765934245210925, + "grad_norm": 8.502998352050781, + "learning_rate": 8.058044376223451e-05, + "loss": 0.30512685775756837, + "step": 46040 + }, + { + "epoch": 0.19770227454212927, + "grad_norm": 1.7396048307418823, + "learning_rate": 8.057613204211689e-05, + "loss": 0.2545022487640381, + "step": 46050 + }, + { + "epoch": 0.19774520663214926, + "grad_norm": 0.018455829471349716, + "learning_rate": 8.057182032199926e-05, + "loss": 0.2788164377212524, + "step": 46060 + }, + { + "epoch": 0.19778813872216927, + "grad_norm": 0.017023645341396332, + "learning_rate": 8.056750860188164e-05, + "loss": 0.11776541471481324, + "step": 46070 + }, + { + "epoch": 0.19783107081218929, + "grad_norm": 0.05966558679938316, + "learning_rate": 8.056319688176402e-05, + "loss": 0.23098490238189698, + "step": 46080 + }, + { + "epoch": 0.19787400290220927, + "grad_norm": 0.006063948851078749, + "learning_rate": 8.05588851616464e-05, + "loss": 0.500779104232788, + "step": 46090 + }, + { + "epoch": 0.1979169349922293, + "grad_norm": 0.02215203270316124, + "learning_rate": 8.055457344152877e-05, + "loss": 0.2442542552947998, + "step": 46100 + }, + { + "epoch": 0.1979598670822493, + "grad_norm": 0.5211237072944641, + "learning_rate": 8.055026172141115e-05, + "loss": 0.3820308446884155, + "step": 46110 + }, + { + "epoch": 0.19800279917226932, + "grad_norm": 0.09570766985416412, + "learning_rate": 8.054595000129353e-05, + "loss": 0.30534040927886963, + "step": 46120 + }, + { + "epoch": 0.1980457312622893, + "grad_norm": 2.1877946853637695, + "learning_rate": 8.054163828117589e-05, + "loss": 0.20296092033386232, + "step": 46130 + }, + { + "epoch": 0.19808866335230932, + "grad_norm": 0.19210763275623322, + "learning_rate": 8.053732656105827e-05, + "loss": 0.004410789161920547, + "step": 46140 + }, + { + "epoch": 0.19813159544232933, + "grad_norm": 9.662697792053223, + "learning_rate": 8.053301484094065e-05, + "loss": 0.28416242599487307, + "step": 46150 + }, + { + "epoch": 0.19817452753234932, + "grad_norm": 6.199631214141846, + "learning_rate": 8.052870312082302e-05, + "loss": 0.43329510688781736, + "step": 46160 + }, + { + "epoch": 0.19821745962236934, + "grad_norm": 2.9202873706817627, + "learning_rate": 8.05243914007054e-05, + "loss": 0.4505885124206543, + "step": 46170 + }, + { + "epoch": 0.19826039171238935, + "grad_norm": 1.2385050058364868, + "learning_rate": 8.052007968058778e-05, + "loss": 0.3089656114578247, + "step": 46180 + }, + { + "epoch": 0.19830332380240934, + "grad_norm": 0.002173739019781351, + "learning_rate": 8.051576796047016e-05, + "loss": 0.17922030687332152, + "step": 46190 + }, + { + "epoch": 0.19834625589242935, + "grad_norm": 3.4676084518432617, + "learning_rate": 8.051145624035253e-05, + "loss": 0.28135807514190675, + "step": 46200 + }, + { + "epoch": 0.19838918798244937, + "grad_norm": 1.0874375104904175, + "learning_rate": 8.050714452023491e-05, + "loss": 0.4869321346282959, + "step": 46210 + }, + { + "epoch": 0.19843212007246938, + "grad_norm": 0.16380393505096436, + "learning_rate": 8.050283280011727e-05, + "loss": 0.36575796604156496, + "step": 46220 + }, + { + "epoch": 0.19847505216248937, + "grad_norm": 0.09360513091087341, + "learning_rate": 8.049852107999965e-05, + "loss": 0.2708542585372925, + "step": 46230 + }, + { + "epoch": 0.19851798425250938, + "grad_norm": 0.0363796204328537, + "learning_rate": 8.049420935988203e-05, + "loss": 0.324708080291748, + "step": 46240 + }, + { + "epoch": 0.1985609163425294, + "grad_norm": 0.06994099169969559, + "learning_rate": 8.048989763976441e-05, + "loss": 0.25118064880371094, + "step": 46250 + }, + { + "epoch": 0.19860384843254938, + "grad_norm": 2.9435386657714844, + "learning_rate": 8.048558591964678e-05, + "loss": 0.17363402843475342, + "step": 46260 + }, + { + "epoch": 0.1986467805225694, + "grad_norm": 0.10383245348930359, + "learning_rate": 8.048127419952916e-05, + "loss": 0.0918418288230896, + "step": 46270 + }, + { + "epoch": 0.19868971261258941, + "grad_norm": 0.03149556368589401, + "learning_rate": 8.047696247941154e-05, + "loss": 0.2588630199432373, + "step": 46280 + }, + { + "epoch": 0.1987326447026094, + "grad_norm": 1.807033896446228, + "learning_rate": 8.047265075929392e-05, + "loss": 0.45576953887939453, + "step": 46290 + }, + { + "epoch": 0.19877557679262942, + "grad_norm": 1.1389695405960083, + "learning_rate": 8.04683390391763e-05, + "loss": 0.23019568920135497, + "step": 46300 + }, + { + "epoch": 0.19881850888264943, + "grad_norm": 0.5787996053695679, + "learning_rate": 8.046402731905867e-05, + "loss": 0.25288972854614256, + "step": 46310 + }, + { + "epoch": 0.19886144097266942, + "grad_norm": 0.0006389050977304578, + "learning_rate": 8.045971559894105e-05, + "loss": 0.048640355467796326, + "step": 46320 + }, + { + "epoch": 0.19890437306268943, + "grad_norm": 1.3906055688858032, + "learning_rate": 8.045540387882343e-05, + "loss": 0.17603120803833008, + "step": 46330 + }, + { + "epoch": 0.19894730515270945, + "grad_norm": 0.7192102670669556, + "learning_rate": 8.04510921587058e-05, + "loss": 0.06997541189193726, + "step": 46340 + }, + { + "epoch": 0.19899023724272946, + "grad_norm": 1.2777167558670044, + "learning_rate": 8.044678043858818e-05, + "loss": 0.1271460771560669, + "step": 46350 + }, + { + "epoch": 0.19903316933274945, + "grad_norm": 0.14615023136138916, + "learning_rate": 8.044246871847056e-05, + "loss": 0.1982070803642273, + "step": 46360 + }, + { + "epoch": 0.19907610142276946, + "grad_norm": 2.230384588241577, + "learning_rate": 8.043815699835294e-05, + "loss": 0.3583315372467041, + "step": 46370 + }, + { + "epoch": 0.19911903351278948, + "grad_norm": 7.727930545806885, + "learning_rate": 8.04338452782353e-05, + "loss": 0.5741009712219238, + "step": 46380 + }, + { + "epoch": 0.19916196560280947, + "grad_norm": 5.949665069580078, + "learning_rate": 8.042953355811768e-05, + "loss": 0.6069200038909912, + "step": 46390 + }, + { + "epoch": 0.19920489769282948, + "grad_norm": 1.577398419380188, + "learning_rate": 8.042522183800005e-05, + "loss": 0.20890703201293945, + "step": 46400 + }, + { + "epoch": 0.1992478297828495, + "grad_norm": 0.2532603442668915, + "learning_rate": 8.042091011788243e-05, + "loss": 0.22831459045410157, + "step": 46410 + }, + { + "epoch": 0.19929076187286948, + "grad_norm": 0.25338953733444214, + "learning_rate": 8.041659839776481e-05, + "loss": 0.279406476020813, + "step": 46420 + }, + { + "epoch": 0.1993336939628895, + "grad_norm": 0.10169660300016403, + "learning_rate": 8.041228667764719e-05, + "loss": 0.2838901996612549, + "step": 46430 + }, + { + "epoch": 0.1993766260529095, + "grad_norm": 1.0535625219345093, + "learning_rate": 8.040797495752956e-05, + "loss": 0.09383861422538757, + "step": 46440 + }, + { + "epoch": 0.19941955814292953, + "grad_norm": 0.002683205297216773, + "learning_rate": 8.040366323741194e-05, + "loss": 0.14773426055908204, + "step": 46450 + }, + { + "epoch": 0.1994624902329495, + "grad_norm": 9.37723159790039, + "learning_rate": 8.03993515172943e-05, + "loss": 0.3080390691757202, + "step": 46460 + }, + { + "epoch": 0.19950542232296953, + "grad_norm": 0.5413791537284851, + "learning_rate": 8.039503979717668e-05, + "loss": 0.3077335596084595, + "step": 46470 + }, + { + "epoch": 0.19954835441298954, + "grad_norm": 0.00279863178730011, + "learning_rate": 8.039072807705906e-05, + "loss": 0.21013128757476807, + "step": 46480 + }, + { + "epoch": 0.19959128650300953, + "grad_norm": 0.6538106203079224, + "learning_rate": 8.038641635694144e-05, + "loss": 0.23853034973144532, + "step": 46490 + }, + { + "epoch": 0.19963421859302954, + "grad_norm": 1.3677074909210205, + "learning_rate": 8.038210463682381e-05, + "loss": 0.34636821746826174, + "step": 46500 + }, + { + "epoch": 0.19967715068304956, + "grad_norm": 0.2341076135635376, + "learning_rate": 8.037779291670619e-05, + "loss": 0.2113889217376709, + "step": 46510 + }, + { + "epoch": 0.19972008277306955, + "grad_norm": 0.06166021525859833, + "learning_rate": 8.037348119658857e-05, + "loss": 0.14099152088165284, + "step": 46520 + }, + { + "epoch": 0.19976301486308956, + "grad_norm": 5.088956356048584, + "learning_rate": 8.036916947647095e-05, + "loss": 0.32044038772583006, + "step": 46530 + }, + { + "epoch": 0.19980594695310958, + "grad_norm": 0.05206717550754547, + "learning_rate": 8.036485775635332e-05, + "loss": 0.050091874599456784, + "step": 46540 + }, + { + "epoch": 0.1998488790431296, + "grad_norm": 0.015598599798977375, + "learning_rate": 8.03605460362357e-05, + "loss": 0.13630733489990235, + "step": 46550 + }, + { + "epoch": 0.19989181113314958, + "grad_norm": 1.6684378385543823, + "learning_rate": 8.035623431611808e-05, + "loss": 0.16195347309112548, + "step": 46560 + }, + { + "epoch": 0.1999347432231696, + "grad_norm": 0.1458691656589508, + "learning_rate": 8.035192259600045e-05, + "loss": 0.27073700428009034, + "step": 46570 + }, + { + "epoch": 0.1999776753131896, + "grad_norm": 0.6499917507171631, + "learning_rate": 8.034761087588283e-05, + "loss": 0.24764013290405273, + "step": 46580 + }, + { + "epoch": 0.2000206074032096, + "grad_norm": 0.0752706378698349, + "learning_rate": 8.034329915576521e-05, + "loss": 0.26877541542053224, + "step": 46590 + }, + { + "epoch": 0.2000635394932296, + "grad_norm": 2.6770122051239014, + "learning_rate": 8.033898743564759e-05, + "loss": 0.2864994049072266, + "step": 46600 + }, + { + "epoch": 0.20010647158324962, + "grad_norm": 0.03613373264670372, + "learning_rate": 8.033467571552996e-05, + "loss": 0.12052092552185059, + "step": 46610 + }, + { + "epoch": 0.2001494036732696, + "grad_norm": 0.0742354616522789, + "learning_rate": 8.033036399541234e-05, + "loss": 0.25440454483032227, + "step": 46620 + }, + { + "epoch": 0.20019233576328962, + "grad_norm": 0.05916782096028328, + "learning_rate": 8.03260522752947e-05, + "loss": 0.08873311877250671, + "step": 46630 + }, + { + "epoch": 0.20023526785330964, + "grad_norm": 0.015279405750334263, + "learning_rate": 8.032174055517708e-05, + "loss": 0.3313145637512207, + "step": 46640 + }, + { + "epoch": 0.20027819994332965, + "grad_norm": 0.27925458550453186, + "learning_rate": 8.031742883505946e-05, + "loss": 0.3915241718292236, + "step": 46650 + }, + { + "epoch": 0.20032113203334964, + "grad_norm": 0.05769550800323486, + "learning_rate": 8.031311711494184e-05, + "loss": 0.06632600426673889, + "step": 46660 + }, + { + "epoch": 0.20036406412336966, + "grad_norm": 0.20242290198802948, + "learning_rate": 8.030880539482421e-05, + "loss": 0.0038001593202352524, + "step": 46670 + }, + { + "epoch": 0.20040699621338967, + "grad_norm": 1.7053717374801636, + "learning_rate": 8.030449367470659e-05, + "loss": 0.12617350816726686, + "step": 46680 + }, + { + "epoch": 0.20044992830340966, + "grad_norm": 0.10039971768856049, + "learning_rate": 8.030018195458897e-05, + "loss": 0.39122159481048585, + "step": 46690 + }, + { + "epoch": 0.20049286039342967, + "grad_norm": 3.4095280170440674, + "learning_rate": 8.029587023447135e-05, + "loss": 0.14573875665664673, + "step": 46700 + }, + { + "epoch": 0.2005357924834497, + "grad_norm": 0.5203276872634888, + "learning_rate": 8.029155851435371e-05, + "loss": 0.37523181438446046, + "step": 46710 + }, + { + "epoch": 0.20057872457346967, + "grad_norm": 0.043886229395866394, + "learning_rate": 8.028724679423609e-05, + "loss": 0.27476327419281005, + "step": 46720 + }, + { + "epoch": 0.2006216566634897, + "grad_norm": 0.0021896434482187033, + "learning_rate": 8.028293507411847e-05, + "loss": 0.16201258897781373, + "step": 46730 + }, + { + "epoch": 0.2006645887535097, + "grad_norm": 0.07912927120923996, + "learning_rate": 8.027862335400084e-05, + "loss": 0.1216330885887146, + "step": 46740 + }, + { + "epoch": 0.2007075208435297, + "grad_norm": 3.0536763668060303, + "learning_rate": 8.027431163388322e-05, + "loss": 0.20984141826629638, + "step": 46750 + }, + { + "epoch": 0.2007504529335497, + "grad_norm": 0.001347496872767806, + "learning_rate": 8.02699999137656e-05, + "loss": 0.2748946905136108, + "step": 46760 + }, + { + "epoch": 0.20079338502356972, + "grad_norm": 1.7918879985809326, + "learning_rate": 8.026568819364797e-05, + "loss": 0.3410180568695068, + "step": 46770 + }, + { + "epoch": 0.20083631711358974, + "grad_norm": 2.6103463172912598, + "learning_rate": 8.026137647353035e-05, + "loss": 0.36960854530334475, + "step": 46780 + }, + { + "epoch": 0.20087924920360972, + "grad_norm": 0.000803425966296345, + "learning_rate": 8.025706475341273e-05, + "loss": 0.0754163384437561, + "step": 46790 + }, + { + "epoch": 0.20092218129362974, + "grad_norm": 2.1734330654144287, + "learning_rate": 8.02527530332951e-05, + "loss": 0.3364823579788208, + "step": 46800 + }, + { + "epoch": 0.20096511338364975, + "grad_norm": 0.031898133456707, + "learning_rate": 8.024844131317748e-05, + "loss": 0.15868821144104003, + "step": 46810 + }, + { + "epoch": 0.20100804547366974, + "grad_norm": 0.46334612369537354, + "learning_rate": 8.024412959305986e-05, + "loss": 0.1317250609397888, + "step": 46820 + }, + { + "epoch": 0.20105097756368975, + "grad_norm": 5.54944372177124, + "learning_rate": 8.023981787294224e-05, + "loss": 0.26093254089355467, + "step": 46830 + }, + { + "epoch": 0.20109390965370977, + "grad_norm": 0.004121364559978247, + "learning_rate": 8.023550615282462e-05, + "loss": 0.33663909435272216, + "step": 46840 + }, + { + "epoch": 0.20113684174372976, + "grad_norm": 1.8446154594421387, + "learning_rate": 8.0231194432707e-05, + "loss": 0.4318349361419678, + "step": 46850 + }, + { + "epoch": 0.20117977383374977, + "grad_norm": 0.3243059515953064, + "learning_rate": 8.022688271258937e-05, + "loss": 0.1755039095878601, + "step": 46860 + }, + { + "epoch": 0.20122270592376978, + "grad_norm": 0.0003927726356778294, + "learning_rate": 8.022257099247173e-05, + "loss": 0.15141257047653198, + "step": 46870 + }, + { + "epoch": 0.2012656380137898, + "grad_norm": 0.06570783257484436, + "learning_rate": 8.021825927235411e-05, + "loss": 0.04607608914375305, + "step": 46880 + }, + { + "epoch": 0.2013085701038098, + "grad_norm": 0.037901636213064194, + "learning_rate": 8.021394755223649e-05, + "loss": 0.18285064697265624, + "step": 46890 + }, + { + "epoch": 0.2013515021938298, + "grad_norm": 0.009038684889674187, + "learning_rate": 8.020963583211887e-05, + "loss": 0.3561609983444214, + "step": 46900 + }, + { + "epoch": 0.20139443428384982, + "grad_norm": 0.007474920246750116, + "learning_rate": 8.020532411200124e-05, + "loss": 0.16592724323272706, + "step": 46910 + }, + { + "epoch": 0.2014373663738698, + "grad_norm": 2.1837422847747803, + "learning_rate": 8.020101239188362e-05, + "loss": 0.23149852752685546, + "step": 46920 + }, + { + "epoch": 0.20148029846388982, + "grad_norm": 0.005031005013734102, + "learning_rate": 8.0196700671766e-05, + "loss": 0.35479423999786375, + "step": 46930 + }, + { + "epoch": 0.20152323055390983, + "grad_norm": 5.587767601013184, + "learning_rate": 8.019238895164838e-05, + "loss": 0.25224766731262205, + "step": 46940 + }, + { + "epoch": 0.20156616264392982, + "grad_norm": 0.10802139341831207, + "learning_rate": 8.018807723153075e-05, + "loss": 0.3127424240112305, + "step": 46950 + }, + { + "epoch": 0.20160909473394983, + "grad_norm": 2.2037267684936523, + "learning_rate": 8.018376551141312e-05, + "loss": 0.1420138955116272, + "step": 46960 + }, + { + "epoch": 0.20165202682396985, + "grad_norm": 3.9398605823516846, + "learning_rate": 8.01794537912955e-05, + "loss": 0.3298606872558594, + "step": 46970 + }, + { + "epoch": 0.20169495891398986, + "grad_norm": 1.5411865711212158, + "learning_rate": 8.017514207117787e-05, + "loss": 0.4772407054901123, + "step": 46980 + }, + { + "epoch": 0.20173789100400985, + "grad_norm": 4.572746753692627, + "learning_rate": 8.017083035106025e-05, + "loss": 0.16311756372451783, + "step": 46990 + }, + { + "epoch": 0.20178082309402987, + "grad_norm": 0.0878385528922081, + "learning_rate": 8.016651863094263e-05, + "loss": 0.032969492673873904, + "step": 47000 + }, + { + "epoch": 0.20178082309402987, + "eval_loss": 0.4502008557319641, + "eval_runtime": 27.5462, + "eval_samples_per_second": 3.63, + "eval_steps_per_second": 3.63, + "step": 47000 + }, + { + "epoch": 0.20182375518404988, + "grad_norm": 2.5271151065826416, + "learning_rate": 8.016220691082502e-05, + "loss": 0.08027942180633545, + "step": 47010 + }, + { + "epoch": 0.20186668727406987, + "grad_norm": 5.350552082061768, + "learning_rate": 8.01578951907074e-05, + "loss": 0.3245659112930298, + "step": 47020 + }, + { + "epoch": 0.20190961936408988, + "grad_norm": 0.009119064547121525, + "learning_rate": 8.015358347058977e-05, + "loss": 0.23723113536834717, + "step": 47030 + }, + { + "epoch": 0.2019525514541099, + "grad_norm": 0.005342130549252033, + "learning_rate": 8.014927175047214e-05, + "loss": 0.32947022914886476, + "step": 47040 + }, + { + "epoch": 0.20199548354412988, + "grad_norm": 2.6926896572113037, + "learning_rate": 8.014496003035451e-05, + "loss": 0.24790339469909667, + "step": 47050 + }, + { + "epoch": 0.2020384156341499, + "grad_norm": 2.7984976768493652, + "learning_rate": 8.014064831023689e-05, + "loss": 0.41055974960327146, + "step": 47060 + }, + { + "epoch": 0.2020813477241699, + "grad_norm": 1.719354271888733, + "learning_rate": 8.013633659011927e-05, + "loss": 0.3034384727478027, + "step": 47070 + }, + { + "epoch": 0.20212427981418993, + "grad_norm": 1.4050602912902832, + "learning_rate": 8.013202487000165e-05, + "loss": 0.4208504676818848, + "step": 47080 + }, + { + "epoch": 0.20216721190420991, + "grad_norm": 2.7856647968292236, + "learning_rate": 8.012771314988402e-05, + "loss": 0.21937999725341797, + "step": 47090 + }, + { + "epoch": 0.20221014399422993, + "grad_norm": 5.089998722076416, + "learning_rate": 8.01234014297664e-05, + "loss": 0.41013269424438475, + "step": 47100 + }, + { + "epoch": 0.20225307608424994, + "grad_norm": 1.1964845657348633, + "learning_rate": 8.011908970964878e-05, + "loss": 0.3722221374511719, + "step": 47110 + }, + { + "epoch": 0.20229600817426993, + "grad_norm": 0.4336620271205902, + "learning_rate": 8.011477798953114e-05, + "loss": 0.1501123547554016, + "step": 47120 + }, + { + "epoch": 0.20233894026428995, + "grad_norm": 3.1869595050811768, + "learning_rate": 8.011046626941352e-05, + "loss": 0.3358729839324951, + "step": 47130 + }, + { + "epoch": 0.20238187235430996, + "grad_norm": 0.2135515958070755, + "learning_rate": 8.01061545492959e-05, + "loss": 0.3324007987976074, + "step": 47140 + }, + { + "epoch": 0.20242480444432995, + "grad_norm": 2.570157289505005, + "learning_rate": 8.010184282917827e-05, + "loss": 0.25740299224853513, + "step": 47150 + }, + { + "epoch": 0.20246773653434996, + "grad_norm": 1.2199064493179321, + "learning_rate": 8.009753110906065e-05, + "loss": 0.33509321212768556, + "step": 47160 + }, + { + "epoch": 0.20251066862436998, + "grad_norm": 16.68321990966797, + "learning_rate": 8.009321938894303e-05, + "loss": 0.16514809131622316, + "step": 47170 + }, + { + "epoch": 0.20255360071438996, + "grad_norm": 0.36014726758003235, + "learning_rate": 8.00889076688254e-05, + "loss": 0.16338274478912354, + "step": 47180 + }, + { + "epoch": 0.20259653280440998, + "grad_norm": 0.06622673571109772, + "learning_rate": 8.008459594870778e-05, + "loss": 0.1958828091621399, + "step": 47190 + }, + { + "epoch": 0.20263946489443, + "grad_norm": 1.5034043788909912, + "learning_rate": 8.008028422859015e-05, + "loss": 0.07967668771743774, + "step": 47200 + }, + { + "epoch": 0.20268239698445, + "grad_norm": 0.004219403024762869, + "learning_rate": 8.007597250847252e-05, + "loss": 0.17253371477127075, + "step": 47210 + }, + { + "epoch": 0.20272532907447, + "grad_norm": 1.8605031967163086, + "learning_rate": 8.00716607883549e-05, + "loss": 0.30866689682006837, + "step": 47220 + }, + { + "epoch": 0.20276826116449, + "grad_norm": 0.007136452943086624, + "learning_rate": 8.006734906823729e-05, + "loss": 0.09472379088401794, + "step": 47230 + }, + { + "epoch": 0.20281119325451002, + "grad_norm": 0.31589406728744507, + "learning_rate": 8.006303734811967e-05, + "loss": 0.1566672444343567, + "step": 47240 + }, + { + "epoch": 0.20285412534453, + "grad_norm": 0.6907039284706116, + "learning_rate": 8.005872562800205e-05, + "loss": 0.24461641311645507, + "step": 47250 + }, + { + "epoch": 0.20289705743455003, + "grad_norm": 0.014545142650604248, + "learning_rate": 8.005441390788442e-05, + "loss": 0.19249277114868163, + "step": 47260 + }, + { + "epoch": 0.20293998952457004, + "grad_norm": 0.11242897063493729, + "learning_rate": 8.00501021877668e-05, + "loss": 0.19730584621429442, + "step": 47270 + }, + { + "epoch": 0.20298292161459003, + "grad_norm": 7.016503810882568, + "learning_rate": 8.004579046764918e-05, + "loss": 0.3570088863372803, + "step": 47280 + }, + { + "epoch": 0.20302585370461004, + "grad_norm": 0.21266454458236694, + "learning_rate": 8.004147874753154e-05, + "loss": 0.16338672637939453, + "step": 47290 + }, + { + "epoch": 0.20306878579463006, + "grad_norm": 0.05758526921272278, + "learning_rate": 8.003716702741392e-05, + "loss": 0.25838274955749513, + "step": 47300 + }, + { + "epoch": 0.20311171788465007, + "grad_norm": 0.19154588878154755, + "learning_rate": 8.00328553072963e-05, + "loss": 0.11227208375930786, + "step": 47310 + }, + { + "epoch": 0.20315464997467006, + "grad_norm": 0.09942365437746048, + "learning_rate": 8.002854358717867e-05, + "loss": 0.2624546527862549, + "step": 47320 + }, + { + "epoch": 0.20319758206469007, + "grad_norm": 1.0413005352020264, + "learning_rate": 8.002423186706105e-05, + "loss": 0.26270604133605957, + "step": 47330 + }, + { + "epoch": 0.2032405141547101, + "grad_norm": 6.982503414154053, + "learning_rate": 8.001992014694343e-05, + "loss": 0.4072850704193115, + "step": 47340 + }, + { + "epoch": 0.20328344624473008, + "grad_norm": 2.139129877090454, + "learning_rate": 8.00156084268258e-05, + "loss": 0.23219172954559325, + "step": 47350 + }, + { + "epoch": 0.2033263783347501, + "grad_norm": 2.4974660873413086, + "learning_rate": 8.001129670670818e-05, + "loss": 0.388437819480896, + "step": 47360 + }, + { + "epoch": 0.2033693104247701, + "grad_norm": 1.4110585451126099, + "learning_rate": 8.000698498659055e-05, + "loss": 0.3643111944198608, + "step": 47370 + }, + { + "epoch": 0.2034122425147901, + "grad_norm": 0.06494349241256714, + "learning_rate": 8.000267326647292e-05, + "loss": 0.434461784362793, + "step": 47380 + }, + { + "epoch": 0.2034551746048101, + "grad_norm": 2.335343599319458, + "learning_rate": 7.99983615463553e-05, + "loss": 0.15915360450744628, + "step": 47390 + }, + { + "epoch": 0.20349810669483012, + "grad_norm": 0.8555454611778259, + "learning_rate": 7.999404982623768e-05, + "loss": 0.278378963470459, + "step": 47400 + }, + { + "epoch": 0.20354103878485014, + "grad_norm": 0.016613325104117393, + "learning_rate": 7.998973810612006e-05, + "loss": 0.28450276851654055, + "step": 47410 + }, + { + "epoch": 0.20358397087487012, + "grad_norm": 2.753087043762207, + "learning_rate": 7.998542638600243e-05, + "loss": 0.4102034091949463, + "step": 47420 + }, + { + "epoch": 0.20362690296489014, + "grad_norm": 0.015861574560403824, + "learning_rate": 7.998111466588481e-05, + "loss": 0.20828280448913575, + "step": 47430 + }, + { + "epoch": 0.20366983505491015, + "grad_norm": 0.007401157170534134, + "learning_rate": 7.997680294576719e-05, + "loss": 0.22065355777740478, + "step": 47440 + }, + { + "epoch": 0.20371276714493014, + "grad_norm": 0.12466619908809662, + "learning_rate": 7.997249122564957e-05, + "loss": 0.24476051330566406, + "step": 47450 + }, + { + "epoch": 0.20375569923495016, + "grad_norm": 1.485001802444458, + "learning_rate": 7.996817950553194e-05, + "loss": 0.3162158727645874, + "step": 47460 + }, + { + "epoch": 0.20379863132497017, + "grad_norm": 0.3976515829563141, + "learning_rate": 7.996386778541432e-05, + "loss": 0.2629575490951538, + "step": 47470 + }, + { + "epoch": 0.20384156341499016, + "grad_norm": 0.6344526410102844, + "learning_rate": 7.99595560652967e-05, + "loss": 0.3330104112625122, + "step": 47480 + }, + { + "epoch": 0.20388449550501017, + "grad_norm": 0.02858610637485981, + "learning_rate": 7.995524434517908e-05, + "loss": 0.14648046493530273, + "step": 47490 + }, + { + "epoch": 0.2039274275950302, + "grad_norm": 0.2695685029029846, + "learning_rate": 7.995093262506145e-05, + "loss": 0.32228114604949953, + "step": 47500 + }, + { + "epoch": 0.2039703596850502, + "grad_norm": 0.3541853129863739, + "learning_rate": 7.994662090494383e-05, + "loss": 0.30073845386505127, + "step": 47510 + }, + { + "epoch": 0.2040132917750702, + "grad_norm": 1.344916820526123, + "learning_rate": 7.994230918482621e-05, + "loss": 0.3133988857269287, + "step": 47520 + }, + { + "epoch": 0.2040562238650902, + "grad_norm": 7.818868160247803, + "learning_rate": 7.993799746470857e-05, + "loss": 0.18457093238830566, + "step": 47530 + }, + { + "epoch": 0.20409915595511022, + "grad_norm": 0.05128054320812225, + "learning_rate": 7.993368574459095e-05, + "loss": 0.205673885345459, + "step": 47540 + }, + { + "epoch": 0.2041420880451302, + "grad_norm": 3.529289960861206, + "learning_rate": 7.992937402447333e-05, + "loss": 0.242510986328125, + "step": 47550 + }, + { + "epoch": 0.20418502013515022, + "grad_norm": 2.2482082843780518, + "learning_rate": 7.99250623043557e-05, + "loss": 0.2730981111526489, + "step": 47560 + }, + { + "epoch": 0.20422795222517023, + "grad_norm": 2.0920770168304443, + "learning_rate": 7.992075058423808e-05, + "loss": 0.19865950345993041, + "step": 47570 + }, + { + "epoch": 0.20427088431519022, + "grad_norm": 0.25885283946990967, + "learning_rate": 7.991643886412046e-05, + "loss": 0.2900604009628296, + "step": 47580 + }, + { + "epoch": 0.20431381640521024, + "grad_norm": 0.07762018591165543, + "learning_rate": 7.991212714400284e-05, + "loss": 0.08530986309051514, + "step": 47590 + }, + { + "epoch": 0.20435674849523025, + "grad_norm": 4.367431640625, + "learning_rate": 7.990781542388521e-05, + "loss": 0.2783456563949585, + "step": 47600 + }, + { + "epoch": 0.20439968058525024, + "grad_norm": 2.544442892074585, + "learning_rate": 7.990350370376758e-05, + "loss": 0.15141881704330445, + "step": 47610 + }, + { + "epoch": 0.20444261267527025, + "grad_norm": 1.1436129808425903, + "learning_rate": 7.989919198364995e-05, + "loss": 0.3077658176422119, + "step": 47620 + }, + { + "epoch": 0.20448554476529027, + "grad_norm": 0.8395715355873108, + "learning_rate": 7.989488026353233e-05, + "loss": 0.2867176294326782, + "step": 47630 + }, + { + "epoch": 0.20452847685531028, + "grad_norm": 3.702817440032959, + "learning_rate": 7.989056854341471e-05, + "loss": 0.5052554130554199, + "step": 47640 + }, + { + "epoch": 0.20457140894533027, + "grad_norm": 0.28353357315063477, + "learning_rate": 7.988625682329709e-05, + "loss": 0.12239044904708862, + "step": 47650 + }, + { + "epoch": 0.20461434103535028, + "grad_norm": 0.5833907723426819, + "learning_rate": 7.988194510317946e-05, + "loss": 0.322785758972168, + "step": 47660 + }, + { + "epoch": 0.2046572731253703, + "grad_norm": 0.173982173204422, + "learning_rate": 7.987763338306184e-05, + "loss": 0.19359149932861328, + "step": 47670 + }, + { + "epoch": 0.20470020521539029, + "grad_norm": 0.8384180665016174, + "learning_rate": 7.987332166294422e-05, + "loss": 0.11619726419448853, + "step": 47680 + }, + { + "epoch": 0.2047431373054103, + "grad_norm": 0.6231016516685486, + "learning_rate": 7.98690099428266e-05, + "loss": 0.1496443510055542, + "step": 47690 + }, + { + "epoch": 0.20478606939543031, + "grad_norm": 0.0853211060166359, + "learning_rate": 7.986469822270897e-05, + "loss": 0.3600142002105713, + "step": 47700 + }, + { + "epoch": 0.2048290014854503, + "grad_norm": 0.2062334269285202, + "learning_rate": 7.986038650259135e-05, + "loss": 0.23907241821289063, + "step": 47710 + }, + { + "epoch": 0.20487193357547032, + "grad_norm": 0.02482936903834343, + "learning_rate": 7.985607478247373e-05, + "loss": 0.029480090737342833, + "step": 47720 + }, + { + "epoch": 0.20491486566549033, + "grad_norm": 0.01601765677332878, + "learning_rate": 7.98517630623561e-05, + "loss": 0.19208526611328125, + "step": 47730 + }, + { + "epoch": 0.20495779775551035, + "grad_norm": 0.06261662393808365, + "learning_rate": 7.984745134223848e-05, + "loss": 0.24382977485656737, + "step": 47740 + }, + { + "epoch": 0.20500072984553033, + "grad_norm": 0.025355610996484756, + "learning_rate": 7.984313962212086e-05, + "loss": 0.1378118634223938, + "step": 47750 + }, + { + "epoch": 0.20504366193555035, + "grad_norm": 0.8861377239227295, + "learning_rate": 7.983882790200324e-05, + "loss": 0.26656255722045896, + "step": 47760 + }, + { + "epoch": 0.20508659402557036, + "grad_norm": 0.013460827060043812, + "learning_rate": 7.983451618188561e-05, + "loss": 0.3513129234313965, + "step": 47770 + }, + { + "epoch": 0.20512952611559035, + "grad_norm": 0.061860062181949615, + "learning_rate": 7.983020446176798e-05, + "loss": 0.23082308769226073, + "step": 47780 + }, + { + "epoch": 0.20517245820561036, + "grad_norm": 24.27768325805664, + "learning_rate": 7.982589274165036e-05, + "loss": 0.47048091888427734, + "step": 47790 + }, + { + "epoch": 0.20521539029563038, + "grad_norm": 4.150501251220703, + "learning_rate": 7.982158102153273e-05, + "loss": 0.25462424755096436, + "step": 47800 + }, + { + "epoch": 0.20525832238565037, + "grad_norm": 0.03856050595641136, + "learning_rate": 7.981726930141511e-05, + "loss": 0.44991393089294435, + "step": 47810 + }, + { + "epoch": 0.20530125447567038, + "grad_norm": 0.11038416624069214, + "learning_rate": 7.981295758129749e-05, + "loss": 0.16237845420837402, + "step": 47820 + }, + { + "epoch": 0.2053441865656904, + "grad_norm": 0.07443532347679138, + "learning_rate": 7.980864586117986e-05, + "loss": 0.15313451290130614, + "step": 47830 + }, + { + "epoch": 0.2053871186557104, + "grad_norm": 0.5222668647766113, + "learning_rate": 7.980433414106224e-05, + "loss": 0.15321272611618042, + "step": 47840 + }, + { + "epoch": 0.2054300507457304, + "grad_norm": 0.038755644112825394, + "learning_rate": 7.980002242094462e-05, + "loss": 0.29925990104675293, + "step": 47850 + }, + { + "epoch": 0.2054729828357504, + "grad_norm": 0.08253604918718338, + "learning_rate": 7.979571070082698e-05, + "loss": 0.13226243257522582, + "step": 47860 + }, + { + "epoch": 0.20551591492577043, + "grad_norm": 2.4042482376098633, + "learning_rate": 7.979139898070936e-05, + "loss": 0.22691683769226073, + "step": 47870 + }, + { + "epoch": 0.2055588470157904, + "grad_norm": 1.6290377378463745, + "learning_rate": 7.978708726059174e-05, + "loss": 0.15348259210586548, + "step": 47880 + }, + { + "epoch": 0.20560177910581043, + "grad_norm": 0.04041058570146561, + "learning_rate": 7.978277554047412e-05, + "loss": 0.32028086185455323, + "step": 47890 + }, + { + "epoch": 0.20564471119583044, + "grad_norm": 10.279193878173828, + "learning_rate": 7.977846382035649e-05, + "loss": 0.16360957622528077, + "step": 47900 + }, + { + "epoch": 0.20568764328585043, + "grad_norm": 0.9638367295265198, + "learning_rate": 7.977415210023887e-05, + "loss": 0.2235480308532715, + "step": 47910 + }, + { + "epoch": 0.20573057537587044, + "grad_norm": 3.813462495803833, + "learning_rate": 7.976984038012125e-05, + "loss": 0.24002406597137452, + "step": 47920 + }, + { + "epoch": 0.20577350746589046, + "grad_norm": 2.7070462703704834, + "learning_rate": 7.976552866000362e-05, + "loss": 0.12926928997039794, + "step": 47930 + }, + { + "epoch": 0.20581643955591047, + "grad_norm": 1.2344295978546143, + "learning_rate": 7.9761216939886e-05, + "loss": 0.3576169490814209, + "step": 47940 + }, + { + "epoch": 0.20585937164593046, + "grad_norm": 0.0029906737618148327, + "learning_rate": 7.975690521976838e-05, + "loss": 0.0424612283706665, + "step": 47950 + }, + { + "epoch": 0.20590230373595048, + "grad_norm": 0.02020765095949173, + "learning_rate": 7.975259349965076e-05, + "loss": 0.30192534923553466, + "step": 47960 + }, + { + "epoch": 0.2059452358259705, + "grad_norm": 0.011675640009343624, + "learning_rate": 7.974828177953313e-05, + "loss": 0.2707512617111206, + "step": 47970 + }, + { + "epoch": 0.20598816791599048, + "grad_norm": 1.3537681102752686, + "learning_rate": 7.974397005941551e-05, + "loss": 0.4337340831756592, + "step": 47980 + }, + { + "epoch": 0.2060311000060105, + "grad_norm": 0.001996510662138462, + "learning_rate": 7.973965833929789e-05, + "loss": 0.26091752052307127, + "step": 47990 + }, + { + "epoch": 0.2060740320960305, + "grad_norm": 0.003297002287581563, + "learning_rate": 7.973534661918027e-05, + "loss": 0.21362035274505614, + "step": 48000 + }, + { + "epoch": 0.2060740320960305, + "eval_loss": 0.46487903594970703, + "eval_runtime": 27.53, + "eval_samples_per_second": 3.632, + "eval_steps_per_second": 3.632, + "step": 48000 + }, + { + "epoch": 0.2061169641860505, + "grad_norm": 1.6414936780929565, + "learning_rate": 7.973103489906264e-05, + "loss": 0.28532023429870607, + "step": 48010 + }, + { + "epoch": 0.2061598962760705, + "grad_norm": 0.18141505122184753, + "learning_rate": 7.972672317894502e-05, + "loss": 0.07151886224746704, + "step": 48020 + }, + { + "epoch": 0.20620282836609052, + "grad_norm": 0.007624962832778692, + "learning_rate": 7.972241145882738e-05, + "loss": 0.2982606887817383, + "step": 48030 + }, + { + "epoch": 0.2062457604561105, + "grad_norm": 1.486085057258606, + "learning_rate": 7.971809973870976e-05, + "loss": 0.29765031337738035, + "step": 48040 + }, + { + "epoch": 0.20628869254613053, + "grad_norm": 0.03330766037106514, + "learning_rate": 7.971378801859214e-05, + "loss": 0.3565992832183838, + "step": 48050 + }, + { + "epoch": 0.20633162463615054, + "grad_norm": 0.002912812400609255, + "learning_rate": 7.970947629847452e-05, + "loss": 0.047986358404159546, + "step": 48060 + }, + { + "epoch": 0.20637455672617055, + "grad_norm": 0.053293656557798386, + "learning_rate": 7.97051645783569e-05, + "loss": 0.08486682176589966, + "step": 48070 + }, + { + "epoch": 0.20641748881619054, + "grad_norm": 0.6024067997932434, + "learning_rate": 7.970085285823927e-05, + "loss": 0.04551941752433777, + "step": 48080 + }, + { + "epoch": 0.20646042090621056, + "grad_norm": 1.5999566316604614, + "learning_rate": 7.969654113812165e-05, + "loss": 0.21173477172851562, + "step": 48090 + }, + { + "epoch": 0.20650335299623057, + "grad_norm": 5.380755424499512, + "learning_rate": 7.969222941800403e-05, + "loss": 0.21389317512512207, + "step": 48100 + }, + { + "epoch": 0.20654628508625056, + "grad_norm": 1.6914737224578857, + "learning_rate": 7.968791769788639e-05, + "loss": 0.1295459032058716, + "step": 48110 + }, + { + "epoch": 0.20658921717627057, + "grad_norm": 2.0489678382873535, + "learning_rate": 7.968360597776877e-05, + "loss": 0.3609046459197998, + "step": 48120 + }, + { + "epoch": 0.2066321492662906, + "grad_norm": 0.09325380623340607, + "learning_rate": 7.967929425765114e-05, + "loss": 0.31286661624908446, + "step": 48130 + }, + { + "epoch": 0.20667508135631057, + "grad_norm": 0.002050831215456128, + "learning_rate": 7.967498253753352e-05, + "loss": 0.08834596276283264, + "step": 48140 + }, + { + "epoch": 0.2067180134463306, + "grad_norm": 1.406714677810669, + "learning_rate": 7.96706708174159e-05, + "loss": 0.2861358165740967, + "step": 48150 + }, + { + "epoch": 0.2067609455363506, + "grad_norm": 1.750779151916504, + "learning_rate": 7.966635909729828e-05, + "loss": 0.2869063138961792, + "step": 48160 + }, + { + "epoch": 0.20680387762637062, + "grad_norm": 0.0026512339245527983, + "learning_rate": 7.966204737718065e-05, + "loss": 0.028093031048774718, + "step": 48170 + }, + { + "epoch": 0.2068468097163906, + "grad_norm": 0.6258159875869751, + "learning_rate": 7.965773565706303e-05, + "loss": 0.4752193450927734, + "step": 48180 + }, + { + "epoch": 0.20688974180641062, + "grad_norm": 0.0013247814495116472, + "learning_rate": 7.965342393694541e-05, + "loss": 0.3037768840789795, + "step": 48190 + }, + { + "epoch": 0.20693267389643064, + "grad_norm": 0.012789330445230007, + "learning_rate": 7.964911221682779e-05, + "loss": 0.21984691619873048, + "step": 48200 + }, + { + "epoch": 0.20697560598645062, + "grad_norm": 0.0047979154624044895, + "learning_rate": 7.964480049671016e-05, + "loss": 0.26316077709198, + "step": 48210 + }, + { + "epoch": 0.20701853807647064, + "grad_norm": 0.025677207857370377, + "learning_rate": 7.964048877659254e-05, + "loss": 0.15255630016326904, + "step": 48220 + }, + { + "epoch": 0.20706147016649065, + "grad_norm": 2.580648183822632, + "learning_rate": 7.963617705647492e-05, + "loss": 0.45435514450073244, + "step": 48230 + }, + { + "epoch": 0.20710440225651064, + "grad_norm": 0.012255949899554253, + "learning_rate": 7.96318653363573e-05, + "loss": 0.20427842140197755, + "step": 48240 + }, + { + "epoch": 0.20714733434653065, + "grad_norm": 0.17026881873607635, + "learning_rate": 7.962755361623967e-05, + "loss": 0.07440086603164672, + "step": 48250 + }, + { + "epoch": 0.20719026643655067, + "grad_norm": 2.2256710529327393, + "learning_rate": 7.962324189612205e-05, + "loss": 0.1965106248855591, + "step": 48260 + }, + { + "epoch": 0.20723319852657068, + "grad_norm": 0.021333398297429085, + "learning_rate": 7.961893017600441e-05, + "loss": 0.06148759126663208, + "step": 48270 + }, + { + "epoch": 0.20727613061659067, + "grad_norm": 0.03702492266893387, + "learning_rate": 7.961461845588679e-05, + "loss": 0.19015427827835082, + "step": 48280 + }, + { + "epoch": 0.20731906270661069, + "grad_norm": 0.015393697656691074, + "learning_rate": 7.961030673576917e-05, + "loss": 0.2020171880722046, + "step": 48290 + }, + { + "epoch": 0.2073619947966307, + "grad_norm": 0.0388808436691761, + "learning_rate": 7.960599501565155e-05, + "loss": 0.025816604495048523, + "step": 48300 + }, + { + "epoch": 0.2074049268866507, + "grad_norm": 0.2894894778728485, + "learning_rate": 7.960168329553392e-05, + "loss": 0.17407466173171998, + "step": 48310 + }, + { + "epoch": 0.2074478589766707, + "grad_norm": 5.096240043640137, + "learning_rate": 7.95973715754163e-05, + "loss": 0.3877495050430298, + "step": 48320 + }, + { + "epoch": 0.20749079106669072, + "grad_norm": 0.0010817173169925809, + "learning_rate": 7.959305985529868e-05, + "loss": 0.11308754682540893, + "step": 48330 + }, + { + "epoch": 0.2075337231567107, + "grad_norm": 1.192091703414917, + "learning_rate": 7.958874813518105e-05, + "loss": 0.6353956699371338, + "step": 48340 + }, + { + "epoch": 0.20757665524673072, + "grad_norm": 1.444043517112732, + "learning_rate": 7.958443641506343e-05, + "loss": 0.10261656045913696, + "step": 48350 + }, + { + "epoch": 0.20761958733675073, + "grad_norm": 0.001375035266391933, + "learning_rate": 7.95801246949458e-05, + "loss": 0.21776161193847657, + "step": 48360 + }, + { + "epoch": 0.20766251942677075, + "grad_norm": 0.9765628576278687, + "learning_rate": 7.957581297482817e-05, + "loss": 0.3838587522506714, + "step": 48370 + }, + { + "epoch": 0.20770545151679073, + "grad_norm": 2.84651780128479, + "learning_rate": 7.957150125471055e-05, + "loss": 0.287298583984375, + "step": 48380 + }, + { + "epoch": 0.20774838360681075, + "grad_norm": 0.068606436252594, + "learning_rate": 7.956718953459293e-05, + "loss": 0.11763962507247924, + "step": 48390 + }, + { + "epoch": 0.20779131569683076, + "grad_norm": 1.5584431886672974, + "learning_rate": 7.95628778144753e-05, + "loss": 0.03175153732299805, + "step": 48400 + }, + { + "epoch": 0.20783424778685075, + "grad_norm": 0.10847458988428116, + "learning_rate": 7.955856609435768e-05, + "loss": 0.12701599597930907, + "step": 48410 + }, + { + "epoch": 0.20787717987687077, + "grad_norm": 0.516201913356781, + "learning_rate": 7.955425437424007e-05, + "loss": 0.2435328722000122, + "step": 48420 + }, + { + "epoch": 0.20792011196689078, + "grad_norm": 0.09531274437904358, + "learning_rate": 7.954994265412245e-05, + "loss": 0.17576621770858764, + "step": 48430 + }, + { + "epoch": 0.20796304405691077, + "grad_norm": 3.824319839477539, + "learning_rate": 7.954563093400481e-05, + "loss": 0.31635844707489014, + "step": 48440 + }, + { + "epoch": 0.20800597614693078, + "grad_norm": 9.462091445922852, + "learning_rate": 7.954131921388719e-05, + "loss": 0.1305789351463318, + "step": 48450 + }, + { + "epoch": 0.2080489082369508, + "grad_norm": 1.0754421949386597, + "learning_rate": 7.953700749376957e-05, + "loss": 0.359479284286499, + "step": 48460 + }, + { + "epoch": 0.20809184032697078, + "grad_norm": 1.957236647605896, + "learning_rate": 7.953269577365195e-05, + "loss": 0.3150218963623047, + "step": 48470 + }, + { + "epoch": 0.2081347724169908, + "grad_norm": 0.002105705440044403, + "learning_rate": 7.952838405353432e-05, + "loss": 0.1954740524291992, + "step": 48480 + }, + { + "epoch": 0.2081777045070108, + "grad_norm": 12.278348922729492, + "learning_rate": 7.95240723334167e-05, + "loss": 0.28711376190185545, + "step": 48490 + }, + { + "epoch": 0.20822063659703083, + "grad_norm": 2.2454352378845215, + "learning_rate": 7.951976061329908e-05, + "loss": 0.2067957878112793, + "step": 48500 + }, + { + "epoch": 0.20826356868705082, + "grad_norm": 5.584324836730957, + "learning_rate": 7.951544889318146e-05, + "loss": 0.3211017847061157, + "step": 48510 + }, + { + "epoch": 0.20830650077707083, + "grad_norm": 0.7228730916976929, + "learning_rate": 7.951113717306382e-05, + "loss": 0.3107947826385498, + "step": 48520 + }, + { + "epoch": 0.20834943286709084, + "grad_norm": 0.015868451446294785, + "learning_rate": 7.95068254529462e-05, + "loss": 0.2286367654800415, + "step": 48530 + }, + { + "epoch": 0.20839236495711083, + "grad_norm": 1.4947551488876343, + "learning_rate": 7.950251373282857e-05, + "loss": 0.25593032836914065, + "step": 48540 + }, + { + "epoch": 0.20843529704713085, + "grad_norm": 0.027004824951291084, + "learning_rate": 7.949820201271095e-05, + "loss": 0.03589800000190735, + "step": 48550 + }, + { + "epoch": 0.20847822913715086, + "grad_norm": 1.1982914209365845, + "learning_rate": 7.949389029259333e-05, + "loss": 0.24310851097106934, + "step": 48560 + }, + { + "epoch": 0.20852116122717085, + "grad_norm": 7.410946846008301, + "learning_rate": 7.94895785724757e-05, + "loss": 0.16178617477416993, + "step": 48570 + }, + { + "epoch": 0.20856409331719086, + "grad_norm": 0.0017027149442583323, + "learning_rate": 7.948526685235808e-05, + "loss": 0.36240453720092775, + "step": 48580 + }, + { + "epoch": 0.20860702540721088, + "grad_norm": 0.210503488779068, + "learning_rate": 7.948095513224046e-05, + "loss": 0.2611191749572754, + "step": 48590 + }, + { + "epoch": 0.2086499574972309, + "grad_norm": 0.11781711131334305, + "learning_rate": 7.947664341212283e-05, + "loss": 0.19964562654495238, + "step": 48600 + }, + { + "epoch": 0.20869288958725088, + "grad_norm": 0.006439481396228075, + "learning_rate": 7.94723316920052e-05, + "loss": 0.17323254346847533, + "step": 48610 + }, + { + "epoch": 0.2087358216772709, + "grad_norm": 1.1420397758483887, + "learning_rate": 7.946801997188758e-05, + "loss": 0.23409998416900635, + "step": 48620 + }, + { + "epoch": 0.2087787537672909, + "grad_norm": 0.12256089597940445, + "learning_rate": 7.946370825176996e-05, + "loss": 0.17933051586151122, + "step": 48630 + }, + { + "epoch": 0.2088216858573109, + "grad_norm": 0.063877634704113, + "learning_rate": 7.945939653165235e-05, + "loss": 0.181551992893219, + "step": 48640 + }, + { + "epoch": 0.2088646179473309, + "grad_norm": 0.5018807649612427, + "learning_rate": 7.945508481153473e-05, + "loss": 0.1766904592514038, + "step": 48650 + }, + { + "epoch": 0.20890755003735093, + "grad_norm": 0.2746643126010895, + "learning_rate": 7.94507730914171e-05, + "loss": 0.11217392683029175, + "step": 48660 + }, + { + "epoch": 0.2089504821273709, + "grad_norm": 7.02359676361084, + "learning_rate": 7.944646137129948e-05, + "loss": 0.3691625833511353, + "step": 48670 + }, + { + "epoch": 0.20899341421739093, + "grad_norm": 0.07516085356473923, + "learning_rate": 7.944214965118184e-05, + "loss": 0.263259220123291, + "step": 48680 + }, + { + "epoch": 0.20903634630741094, + "grad_norm": 0.17022983729839325, + "learning_rate": 7.943783793106422e-05, + "loss": 0.2957159996032715, + "step": 48690 + }, + { + "epoch": 0.20907927839743096, + "grad_norm": 0.14971184730529785, + "learning_rate": 7.94335262109466e-05, + "loss": 0.3658233880996704, + "step": 48700 + }, + { + "epoch": 0.20912221048745094, + "grad_norm": 0.29934176802635193, + "learning_rate": 7.942921449082898e-05, + "loss": 0.03634549081325531, + "step": 48710 + }, + { + "epoch": 0.20916514257747096, + "grad_norm": 0.04933731257915497, + "learning_rate": 7.942490277071135e-05, + "loss": 0.1716094732284546, + "step": 48720 + }, + { + "epoch": 0.20920807466749097, + "grad_norm": 1.1368640661239624, + "learning_rate": 7.942059105059373e-05, + "loss": 0.12027144432067871, + "step": 48730 + }, + { + "epoch": 0.20925100675751096, + "grad_norm": 0.014941312372684479, + "learning_rate": 7.941627933047611e-05, + "loss": 0.13583672046661377, + "step": 48740 + }, + { + "epoch": 0.20929393884753097, + "grad_norm": 0.12930569052696228, + "learning_rate": 7.941196761035849e-05, + "loss": 0.1166748046875, + "step": 48750 + }, + { + "epoch": 0.209336870937551, + "grad_norm": 0.1529303342103958, + "learning_rate": 7.940765589024086e-05, + "loss": 0.17531336545944215, + "step": 48760 + }, + { + "epoch": 0.20937980302757098, + "grad_norm": 2.075014114379883, + "learning_rate": 7.940334417012323e-05, + "loss": 0.45829410552978517, + "step": 48770 + }, + { + "epoch": 0.209422735117591, + "grad_norm": 1.3795547485351562, + "learning_rate": 7.93990324500056e-05, + "loss": 0.16225976943969728, + "step": 48780 + }, + { + "epoch": 0.209465667207611, + "grad_norm": 5.576574325561523, + "learning_rate": 7.939472072988798e-05, + "loss": 0.16702204942703247, + "step": 48790 + }, + { + "epoch": 0.20950859929763102, + "grad_norm": 1.667356014251709, + "learning_rate": 7.939040900977036e-05, + "loss": 0.5205566883087158, + "step": 48800 + }, + { + "epoch": 0.209551531387651, + "grad_norm": 0.20947889983654022, + "learning_rate": 7.938609728965274e-05, + "loss": 0.03103659749031067, + "step": 48810 + }, + { + "epoch": 0.20959446347767102, + "grad_norm": 0.021028542891144753, + "learning_rate": 7.938178556953511e-05, + "loss": 0.12524155378341675, + "step": 48820 + }, + { + "epoch": 0.20963739556769104, + "grad_norm": 19.887067794799805, + "learning_rate": 7.937747384941749e-05, + "loss": 0.4347548007965088, + "step": 48830 + }, + { + "epoch": 0.20968032765771102, + "grad_norm": 0.003681926289573312, + "learning_rate": 7.937316212929987e-05, + "loss": 0.09043388366699219, + "step": 48840 + }, + { + "epoch": 0.20972325974773104, + "grad_norm": 0.1534494310617447, + "learning_rate": 7.936885040918223e-05, + "loss": 0.2118082046508789, + "step": 48850 + }, + { + "epoch": 0.20976619183775105, + "grad_norm": 4.88729190826416, + "learning_rate": 7.936453868906462e-05, + "loss": 0.4548500061035156, + "step": 48860 + }, + { + "epoch": 0.20980912392777104, + "grad_norm": 2.5274174213409424, + "learning_rate": 7.9360226968947e-05, + "loss": 0.1766461491584778, + "step": 48870 + }, + { + "epoch": 0.20985205601779106, + "grad_norm": 1.0588138103485107, + "learning_rate": 7.935591524882938e-05, + "loss": 0.06500946283340454, + "step": 48880 + }, + { + "epoch": 0.20989498810781107, + "grad_norm": 0.6156249642372131, + "learning_rate": 7.935160352871175e-05, + "loss": 0.32105815410614014, + "step": 48890 + }, + { + "epoch": 0.20993792019783106, + "grad_norm": 0.002214090432971716, + "learning_rate": 7.934729180859413e-05, + "loss": 0.05551689863204956, + "step": 48900 + }, + { + "epoch": 0.20998085228785107, + "grad_norm": 1.122273564338684, + "learning_rate": 7.934298008847651e-05, + "loss": 0.18426462411880493, + "step": 48910 + }, + { + "epoch": 0.2100237843778711, + "grad_norm": 0.9584456086158752, + "learning_rate": 7.933866836835889e-05, + "loss": 0.3217077970504761, + "step": 48920 + }, + { + "epoch": 0.2100667164678911, + "grad_norm": 15.653519630432129, + "learning_rate": 7.933435664824125e-05, + "loss": 0.3382516860961914, + "step": 48930 + }, + { + "epoch": 0.2101096485579111, + "grad_norm": 0.024164466187357903, + "learning_rate": 7.933004492812363e-05, + "loss": 0.2831050157546997, + "step": 48940 + }, + { + "epoch": 0.2101525806479311, + "grad_norm": 0.6815698742866516, + "learning_rate": 7.9325733208006e-05, + "loss": 0.17481757402420045, + "step": 48950 + }, + { + "epoch": 0.21019551273795112, + "grad_norm": 0.1216021403670311, + "learning_rate": 7.932142148788838e-05, + "loss": 0.2150895595550537, + "step": 48960 + }, + { + "epoch": 0.2102384448279711, + "grad_norm": 0.11215940117835999, + "learning_rate": 7.931710976777076e-05, + "loss": 0.005117279291152954, + "step": 48970 + }, + { + "epoch": 0.21028137691799112, + "grad_norm": 0.006101830396801233, + "learning_rate": 7.931279804765314e-05, + "loss": 0.04583222866058349, + "step": 48980 + }, + { + "epoch": 0.21032430900801113, + "grad_norm": 0.2531506419181824, + "learning_rate": 7.930848632753551e-05, + "loss": 0.08759585618972779, + "step": 48990 + }, + { + "epoch": 0.21036724109803112, + "grad_norm": 0.005882403813302517, + "learning_rate": 7.930417460741789e-05, + "loss": 0.2158358573913574, + "step": 49000 + }, + { + "epoch": 0.21036724109803112, + "eval_loss": 0.44679224491119385, + "eval_runtime": 27.4542, + "eval_samples_per_second": 3.642, + "eval_steps_per_second": 3.642, + "step": 49000 + }, + { + "epoch": 0.21041017318805114, + "grad_norm": 0.05826570838689804, + "learning_rate": 7.929986288730026e-05, + "loss": 0.2593266248703003, + "step": 49010 + }, + { + "epoch": 0.21045310527807115, + "grad_norm": 2.3180344104766846, + "learning_rate": 7.929555116718263e-05, + "loss": 0.487042760848999, + "step": 49020 + }, + { + "epoch": 0.21049603736809117, + "grad_norm": 0.07739551365375519, + "learning_rate": 7.929123944706501e-05, + "loss": 0.13294532299041747, + "step": 49030 + }, + { + "epoch": 0.21053896945811115, + "grad_norm": 0.03116386942565441, + "learning_rate": 7.928692772694739e-05, + "loss": 0.24828364849090576, + "step": 49040 + }, + { + "epoch": 0.21058190154813117, + "grad_norm": 0.9739252924919128, + "learning_rate": 7.928261600682976e-05, + "loss": 0.3101269960403442, + "step": 49050 + }, + { + "epoch": 0.21062483363815118, + "grad_norm": 1.394478678703308, + "learning_rate": 7.927830428671214e-05, + "loss": 0.12690937519073486, + "step": 49060 + }, + { + "epoch": 0.21066776572817117, + "grad_norm": 0.045092012733221054, + "learning_rate": 7.927399256659452e-05, + "loss": 0.1577238082885742, + "step": 49070 + }, + { + "epoch": 0.21071069781819118, + "grad_norm": 0.9610133767127991, + "learning_rate": 7.92696808464769e-05, + "loss": 0.26647014617919923, + "step": 49080 + }, + { + "epoch": 0.2107536299082112, + "grad_norm": 0.04043450951576233, + "learning_rate": 7.926536912635927e-05, + "loss": 0.20672433376312255, + "step": 49090 + }, + { + "epoch": 0.21079656199823119, + "grad_norm": 0.27440470457077026, + "learning_rate": 7.926105740624165e-05, + "loss": 0.19790778160095215, + "step": 49100 + }, + { + "epoch": 0.2108394940882512, + "grad_norm": 0.011567025445401669, + "learning_rate": 7.925674568612403e-05, + "loss": 0.2572205066680908, + "step": 49110 + }, + { + "epoch": 0.21088242617827122, + "grad_norm": 0.020354948937892914, + "learning_rate": 7.92524339660064e-05, + "loss": 0.15365511178970337, + "step": 49120 + }, + { + "epoch": 0.21092535826829123, + "grad_norm": 0.011306763626635075, + "learning_rate": 7.924812224588878e-05, + "loss": 0.20448198318481445, + "step": 49130 + }, + { + "epoch": 0.21096829035831122, + "grad_norm": 0.03681041672825813, + "learning_rate": 7.924381052577116e-05, + "loss": 0.1980149745941162, + "step": 49140 + }, + { + "epoch": 0.21101122244833123, + "grad_norm": 1.6105586290359497, + "learning_rate": 7.923949880565354e-05, + "loss": 0.38074944019317625, + "step": 49150 + }, + { + "epoch": 0.21105415453835125, + "grad_norm": 0.0796269029378891, + "learning_rate": 7.923518708553592e-05, + "loss": 0.19975688457489013, + "step": 49160 + }, + { + "epoch": 0.21109708662837123, + "grad_norm": 0.9233969449996948, + "learning_rate": 7.923087536541829e-05, + "loss": 0.4761190414428711, + "step": 49170 + }, + { + "epoch": 0.21114001871839125, + "grad_norm": 0.034418150782585144, + "learning_rate": 7.922656364530066e-05, + "loss": 0.02614889442920685, + "step": 49180 + }, + { + "epoch": 0.21118295080841126, + "grad_norm": 2.6076526641845703, + "learning_rate": 7.922225192518303e-05, + "loss": 0.10714485645294189, + "step": 49190 + }, + { + "epoch": 0.21122588289843125, + "grad_norm": 5.970362186431885, + "learning_rate": 7.921794020506541e-05, + "loss": 0.3683354616165161, + "step": 49200 + }, + { + "epoch": 0.21126881498845126, + "grad_norm": 0.006423947401344776, + "learning_rate": 7.921362848494779e-05, + "loss": 0.23473081588745118, + "step": 49210 + }, + { + "epoch": 0.21131174707847128, + "grad_norm": 1.2695651054382324, + "learning_rate": 7.920931676483017e-05, + "loss": 0.12597305774688722, + "step": 49220 + }, + { + "epoch": 0.2113546791684913, + "grad_norm": 6.1923909187316895, + "learning_rate": 7.920500504471254e-05, + "loss": 0.20782461166381835, + "step": 49230 + }, + { + "epoch": 0.21139761125851128, + "grad_norm": 0.12185300886631012, + "learning_rate": 7.920069332459492e-05, + "loss": 0.36885390281677244, + "step": 49240 + }, + { + "epoch": 0.2114405433485313, + "grad_norm": 1.584035873413086, + "learning_rate": 7.91963816044773e-05, + "loss": 0.19908725023269652, + "step": 49250 + }, + { + "epoch": 0.2114834754385513, + "grad_norm": 5.6183271408081055, + "learning_rate": 7.919206988435966e-05, + "loss": 0.22750062942504884, + "step": 49260 + }, + { + "epoch": 0.2115264075285713, + "grad_norm": 0.7646486759185791, + "learning_rate": 7.918775816424204e-05, + "loss": 0.15708142518997192, + "step": 49270 + }, + { + "epoch": 0.2115693396185913, + "grad_norm": 4.9651360511779785, + "learning_rate": 7.918344644412442e-05, + "loss": 0.33751084804534914, + "step": 49280 + }, + { + "epoch": 0.21161227170861133, + "grad_norm": 0.23684753477573395, + "learning_rate": 7.91791347240068e-05, + "loss": 0.18519192934036255, + "step": 49290 + }, + { + "epoch": 0.21165520379863131, + "grad_norm": 0.07096804678440094, + "learning_rate": 7.917482300388917e-05, + "loss": 0.47649459838867186, + "step": 49300 + }, + { + "epoch": 0.21169813588865133, + "grad_norm": 0.22593224048614502, + "learning_rate": 7.917051128377155e-05, + "loss": 0.06051828265190125, + "step": 49310 + }, + { + "epoch": 0.21174106797867134, + "grad_norm": 0.11356212198734283, + "learning_rate": 7.916619956365393e-05, + "loss": 0.3678733348846436, + "step": 49320 + }, + { + "epoch": 0.21178400006869133, + "grad_norm": 0.17641082406044006, + "learning_rate": 7.91618878435363e-05, + "loss": 0.14328333139419555, + "step": 49330 + }, + { + "epoch": 0.21182693215871135, + "grad_norm": 0.1647193282842636, + "learning_rate": 7.915757612341868e-05, + "loss": 0.200999116897583, + "step": 49340 + }, + { + "epoch": 0.21186986424873136, + "grad_norm": 0.04526711627840996, + "learning_rate": 7.915326440330106e-05, + "loss": 0.23270890712738038, + "step": 49350 + }, + { + "epoch": 0.21191279633875137, + "grad_norm": 1.2517226934432983, + "learning_rate": 7.914895268318344e-05, + "loss": 0.3122552394866943, + "step": 49360 + }, + { + "epoch": 0.21195572842877136, + "grad_norm": 5.0854363441467285, + "learning_rate": 7.914464096306581e-05, + "loss": 0.2330098867416382, + "step": 49370 + }, + { + "epoch": 0.21199866051879138, + "grad_norm": 1.9728223085403442, + "learning_rate": 7.914032924294819e-05, + "loss": 0.17663989067077637, + "step": 49380 + }, + { + "epoch": 0.2120415926088114, + "grad_norm": 7.4587082862854, + "learning_rate": 7.913601752283057e-05, + "loss": 0.34292399883270264, + "step": 49390 + }, + { + "epoch": 0.21208452469883138, + "grad_norm": 1.4386489391326904, + "learning_rate": 7.913170580271294e-05, + "loss": 0.21951770782470703, + "step": 49400 + }, + { + "epoch": 0.2121274567888514, + "grad_norm": 0.9777666926383972, + "learning_rate": 7.912739408259532e-05, + "loss": 0.3396186828613281, + "step": 49410 + }, + { + "epoch": 0.2121703888788714, + "grad_norm": 0.07708708941936493, + "learning_rate": 7.912308236247769e-05, + "loss": 0.16404324769973755, + "step": 49420 + }, + { + "epoch": 0.2122133209688914, + "grad_norm": 0.13779999315738678, + "learning_rate": 7.911877064236006e-05, + "loss": 0.08597908020019532, + "step": 49430 + }, + { + "epoch": 0.2122562530589114, + "grad_norm": 0.14609119296073914, + "learning_rate": 7.911445892224244e-05, + "loss": 0.08698570728302002, + "step": 49440 + }, + { + "epoch": 0.21229918514893142, + "grad_norm": 1.8430030345916748, + "learning_rate": 7.911014720212482e-05, + "loss": 0.3098950147628784, + "step": 49450 + }, + { + "epoch": 0.21234211723895144, + "grad_norm": 0.020075861364603043, + "learning_rate": 7.91058354820072e-05, + "loss": 0.3637603044509888, + "step": 49460 + }, + { + "epoch": 0.21238504932897143, + "grad_norm": 2.2435684204101562, + "learning_rate": 7.910152376188957e-05, + "loss": 0.2776512861251831, + "step": 49470 + }, + { + "epoch": 0.21242798141899144, + "grad_norm": 0.01798534207046032, + "learning_rate": 7.909721204177195e-05, + "loss": 0.06801215410232545, + "step": 49480 + }, + { + "epoch": 0.21247091350901146, + "grad_norm": 1.8715423345565796, + "learning_rate": 7.909290032165433e-05, + "loss": 0.19331105947494506, + "step": 49490 + }, + { + "epoch": 0.21251384559903144, + "grad_norm": 2.531074047088623, + "learning_rate": 7.90885886015367e-05, + "loss": 0.36147854328155515, + "step": 49500 + }, + { + "epoch": 0.21255677768905146, + "grad_norm": 0.11733974516391754, + "learning_rate": 7.908427688141907e-05, + "loss": 0.07555552721023559, + "step": 49510 + }, + { + "epoch": 0.21259970977907147, + "grad_norm": 1.9904005527496338, + "learning_rate": 7.907996516130145e-05, + "loss": 0.2713768720626831, + "step": 49520 + }, + { + "epoch": 0.21264264186909146, + "grad_norm": 2.1456587314605713, + "learning_rate": 7.907565344118382e-05, + "loss": 0.11105477809906006, + "step": 49530 + }, + { + "epoch": 0.21268557395911147, + "grad_norm": 0.11958852410316467, + "learning_rate": 7.90713417210662e-05, + "loss": 0.07827035188674927, + "step": 49540 + }, + { + "epoch": 0.2127285060491315, + "grad_norm": 1.6413260698318481, + "learning_rate": 7.906703000094858e-05, + "loss": 0.33407251834869384, + "step": 49550 + }, + { + "epoch": 0.2127714381391515, + "grad_norm": 0.09317167103290558, + "learning_rate": 7.906271828083096e-05, + "loss": 0.3209323167800903, + "step": 49560 + }, + { + "epoch": 0.2128143702291715, + "grad_norm": 0.28604403138160706, + "learning_rate": 7.905840656071333e-05, + "loss": 0.34248862266540525, + "step": 49570 + }, + { + "epoch": 0.2128573023191915, + "grad_norm": 0.20116518437862396, + "learning_rate": 7.905409484059571e-05, + "loss": 0.2403996467590332, + "step": 49580 + }, + { + "epoch": 0.21290023440921152, + "grad_norm": 0.008949404582381248, + "learning_rate": 7.904978312047809e-05, + "loss": 0.21593551635742186, + "step": 49590 + }, + { + "epoch": 0.2129431664992315, + "grad_norm": 0.48243486881256104, + "learning_rate": 7.904547140036046e-05, + "loss": 0.26777894496917726, + "step": 49600 + }, + { + "epoch": 0.21298609858925152, + "grad_norm": 0.39803624153137207, + "learning_rate": 7.904115968024284e-05, + "loss": 0.3349623680114746, + "step": 49610 + }, + { + "epoch": 0.21302903067927154, + "grad_norm": 0.3646968901157379, + "learning_rate": 7.903684796012522e-05, + "loss": 0.14400217533111573, + "step": 49620 + }, + { + "epoch": 0.21307196276929152, + "grad_norm": 0.3641029894351959, + "learning_rate": 7.90325362400076e-05, + "loss": 0.1390596866607666, + "step": 49630 + }, + { + "epoch": 0.21311489485931154, + "grad_norm": 0.07330887019634247, + "learning_rate": 7.902822451988997e-05, + "loss": 0.21187987327575683, + "step": 49640 + }, + { + "epoch": 0.21315782694933155, + "grad_norm": 0.018047798424959183, + "learning_rate": 7.902391279977235e-05, + "loss": 0.07641729712486267, + "step": 49650 + }, + { + "epoch": 0.21320075903935157, + "grad_norm": 4.7064056396484375, + "learning_rate": 7.901960107965473e-05, + "loss": 0.15568535327911376, + "step": 49660 + }, + { + "epoch": 0.21324369112937155, + "grad_norm": 1.504948616027832, + "learning_rate": 7.901528935953709e-05, + "loss": 0.39885008335113525, + "step": 49670 + }, + { + "epoch": 0.21328662321939157, + "grad_norm": 1.1078081130981445, + "learning_rate": 7.901097763941947e-05, + "loss": 0.2893169164657593, + "step": 49680 + }, + { + "epoch": 0.21332955530941158, + "grad_norm": 0.1008782684803009, + "learning_rate": 7.900666591930185e-05, + "loss": 0.15592665672302247, + "step": 49690 + }, + { + "epoch": 0.21337248739943157, + "grad_norm": 0.4503695070743561, + "learning_rate": 7.900235419918422e-05, + "loss": 0.3227583646774292, + "step": 49700 + }, + { + "epoch": 0.21341541948945159, + "grad_norm": 0.0678926333785057, + "learning_rate": 7.89980424790666e-05, + "loss": 0.21938278675079345, + "step": 49710 + }, + { + "epoch": 0.2134583515794716, + "grad_norm": 8.68582534790039, + "learning_rate": 7.899373075894898e-05, + "loss": 0.27250258922576903, + "step": 49720 + }, + { + "epoch": 0.2135012836694916, + "grad_norm": 3.055732011795044, + "learning_rate": 7.898941903883136e-05, + "loss": 0.07037267684936524, + "step": 49730 + }, + { + "epoch": 0.2135442157595116, + "grad_norm": 1.5734680891036987, + "learning_rate": 7.898510731871373e-05, + "loss": 0.19788484573364257, + "step": 49740 + }, + { + "epoch": 0.21358714784953162, + "grad_norm": 1.193137288093567, + "learning_rate": 7.89807955985961e-05, + "loss": 0.4758903026580811, + "step": 49750 + }, + { + "epoch": 0.2136300799395516, + "grad_norm": 4.80873441696167, + "learning_rate": 7.897648387847847e-05, + "loss": 0.40531315803527834, + "step": 49760 + }, + { + "epoch": 0.21367301202957162, + "grad_norm": 0.38741931319236755, + "learning_rate": 7.897217215836085e-05, + "loss": 0.035053136944770816, + "step": 49770 + }, + { + "epoch": 0.21371594411959163, + "grad_norm": 0.19839559495449066, + "learning_rate": 7.896786043824323e-05, + "loss": 0.2304708480834961, + "step": 49780 + }, + { + "epoch": 0.21375887620961165, + "grad_norm": 2.424915313720703, + "learning_rate": 7.896354871812561e-05, + "loss": 0.21277079582214356, + "step": 49790 + }, + { + "epoch": 0.21380180829963163, + "grad_norm": 1.4982346296310425, + "learning_rate": 7.895923699800798e-05, + "loss": 0.14394724369049072, + "step": 49800 + }, + { + "epoch": 0.21384474038965165, + "grad_norm": 0.07861235737800598, + "learning_rate": 7.895492527789036e-05, + "loss": 0.301897668838501, + "step": 49810 + }, + { + "epoch": 0.21388767247967166, + "grad_norm": 0.12516728043556213, + "learning_rate": 7.895061355777275e-05, + "loss": 0.2530207633972168, + "step": 49820 + }, + { + "epoch": 0.21393060456969165, + "grad_norm": 0.12502363324165344, + "learning_rate": 7.894630183765513e-05, + "loss": 0.29697988033294676, + "step": 49830 + }, + { + "epoch": 0.21397353665971167, + "grad_norm": 2.6676998138427734, + "learning_rate": 7.89419901175375e-05, + "loss": 0.22099952697753905, + "step": 49840 + }, + { + "epoch": 0.21401646874973168, + "grad_norm": 0.7491877675056458, + "learning_rate": 7.893767839741987e-05, + "loss": 0.24905600547790527, + "step": 49850 + }, + { + "epoch": 0.21405940083975167, + "grad_norm": 2.600252866744995, + "learning_rate": 7.893336667730225e-05, + "loss": 0.2663122177124023, + "step": 49860 + }, + { + "epoch": 0.21410233292977168, + "grad_norm": 0.0847255140542984, + "learning_rate": 7.892905495718463e-05, + "loss": 0.19019591808319092, + "step": 49870 + }, + { + "epoch": 0.2141452650197917, + "grad_norm": 0.29786983132362366, + "learning_rate": 7.8924743237067e-05, + "loss": 0.3501892566680908, + "step": 49880 + }, + { + "epoch": 0.2141881971098117, + "grad_norm": 0.028485197573900223, + "learning_rate": 7.892043151694938e-05, + "loss": 0.20272502899169922, + "step": 49890 + }, + { + "epoch": 0.2142311291998317, + "grad_norm": 0.3347903788089752, + "learning_rate": 7.891611979683176e-05, + "loss": 0.23856263160705565, + "step": 49900 + }, + { + "epoch": 0.2142740612898517, + "grad_norm": 0.2699025273323059, + "learning_rate": 7.891180807671413e-05, + "loss": 0.44274020195007324, + "step": 49910 + }, + { + "epoch": 0.21431699337987173, + "grad_norm": 2.3436944484710693, + "learning_rate": 7.89074963565965e-05, + "loss": 0.19832544326782225, + "step": 49920 + }, + { + "epoch": 0.21435992546989172, + "grad_norm": 0.07889475673437119, + "learning_rate": 7.890318463647888e-05, + "loss": 0.261244535446167, + "step": 49930 + }, + { + "epoch": 0.21440285755991173, + "grad_norm": 0.9712432026863098, + "learning_rate": 7.889887291636125e-05, + "loss": 0.12264488935470581, + "step": 49940 + }, + { + "epoch": 0.21444578964993175, + "grad_norm": 0.015600843355059624, + "learning_rate": 7.889456119624363e-05, + "loss": 0.12793511152267456, + "step": 49950 + }, + { + "epoch": 0.21448872173995173, + "grad_norm": 1.866066813468933, + "learning_rate": 7.889024947612601e-05, + "loss": 0.14210785627365113, + "step": 49960 + }, + { + "epoch": 0.21453165382997175, + "grad_norm": 0.024746781215071678, + "learning_rate": 7.888593775600839e-05, + "loss": 0.0965128481388092, + "step": 49970 + }, + { + "epoch": 0.21457458591999176, + "grad_norm": 5.134984970092773, + "learning_rate": 7.888162603589076e-05, + "loss": 0.2880469560623169, + "step": 49980 + }, + { + "epoch": 0.21461751801001178, + "grad_norm": 0.09031741321086884, + "learning_rate": 7.887731431577314e-05, + "loss": 0.24112091064453126, + "step": 49990 + }, + { + "epoch": 0.21466045010003176, + "grad_norm": 2.713540554046631, + "learning_rate": 7.88730025956555e-05, + "loss": 0.20801658630371095, + "step": 50000 + }, + { + "epoch": 0.21466045010003176, + "eval_loss": 0.4581822454929352, + "eval_runtime": 27.4412, + "eval_samples_per_second": 3.644, + "eval_steps_per_second": 3.644, + "step": 50000 + }, + { + "epoch": 0.21470338219005178, + "grad_norm": 0.019549906253814697, + "learning_rate": 7.886869087553788e-05, + "loss": 0.38978161811828616, + "step": 50010 + }, + { + "epoch": 0.2147463142800718, + "grad_norm": 0.023972727358341217, + "learning_rate": 7.886437915542026e-05, + "loss": 0.1592766284942627, + "step": 50020 + }, + { + "epoch": 0.21478924637009178, + "grad_norm": 0.38454845547676086, + "learning_rate": 7.886006743530264e-05, + "loss": 0.14972034692764283, + "step": 50030 + }, + { + "epoch": 0.2148321784601118, + "grad_norm": 0.04076255112886429, + "learning_rate": 7.885575571518503e-05, + "loss": 0.1856519103050232, + "step": 50040 + }, + { + "epoch": 0.2148751105501318, + "grad_norm": 1.6896454095840454, + "learning_rate": 7.88514439950674e-05, + "loss": 0.20486097335815429, + "step": 50050 + }, + { + "epoch": 0.2149180426401518, + "grad_norm": 2.6618430614471436, + "learning_rate": 7.884713227494978e-05, + "loss": 0.13558430671691896, + "step": 50060 + }, + { + "epoch": 0.2149609747301718, + "grad_norm": 0.027252428233623505, + "learning_rate": 7.884282055483216e-05, + "loss": 0.1477035403251648, + "step": 50070 + }, + { + "epoch": 0.21500390682019183, + "grad_norm": 3.012190580368042, + "learning_rate": 7.883850883471452e-05, + "loss": 0.3262253046035767, + "step": 50080 + }, + { + "epoch": 0.21504683891021184, + "grad_norm": 3.021601438522339, + "learning_rate": 7.88341971145969e-05, + "loss": 0.20172638893127443, + "step": 50090 + }, + { + "epoch": 0.21508977100023183, + "grad_norm": 0.03279409557580948, + "learning_rate": 7.882988539447928e-05, + "loss": 0.16631543636322021, + "step": 50100 + }, + { + "epoch": 0.21513270309025184, + "grad_norm": 2.786000967025757, + "learning_rate": 7.882557367436165e-05, + "loss": 0.27362570762634275, + "step": 50110 + }, + { + "epoch": 0.21517563518027186, + "grad_norm": 1.3738958835601807, + "learning_rate": 7.882126195424403e-05, + "loss": 0.3563390254974365, + "step": 50120 + }, + { + "epoch": 0.21521856727029184, + "grad_norm": 0.10527623444795609, + "learning_rate": 7.881695023412641e-05, + "loss": 0.29485454559326174, + "step": 50130 + }, + { + "epoch": 0.21526149936031186, + "grad_norm": 0.037768494337797165, + "learning_rate": 7.881263851400879e-05, + "loss": 0.317557168006897, + "step": 50140 + }, + { + "epoch": 0.21530443145033187, + "grad_norm": 0.04559774324297905, + "learning_rate": 7.880832679389116e-05, + "loss": 0.1404987096786499, + "step": 50150 + }, + { + "epoch": 0.21534736354035186, + "grad_norm": 2.323525905609131, + "learning_rate": 7.880401507377354e-05, + "loss": 0.19473350048065186, + "step": 50160 + }, + { + "epoch": 0.21539029563037188, + "grad_norm": 0.11371485143899918, + "learning_rate": 7.87997033536559e-05, + "loss": 0.32835702896118163, + "step": 50170 + }, + { + "epoch": 0.2154332277203919, + "grad_norm": 0.05827485769987106, + "learning_rate": 7.879539163353828e-05, + "loss": 0.2388458251953125, + "step": 50180 + }, + { + "epoch": 0.21547615981041188, + "grad_norm": 0.2014988660812378, + "learning_rate": 7.879107991342066e-05, + "loss": 0.16453293561935425, + "step": 50190 + }, + { + "epoch": 0.2155190919004319, + "grad_norm": 0.03847496211528778, + "learning_rate": 7.878676819330304e-05, + "loss": 0.22010433673858643, + "step": 50200 + }, + { + "epoch": 0.2155620239904519, + "grad_norm": 0.2672198414802551, + "learning_rate": 7.878245647318541e-05, + "loss": 0.24380803108215332, + "step": 50210 + }, + { + "epoch": 0.21560495608047192, + "grad_norm": 32.0848388671875, + "learning_rate": 7.877814475306779e-05, + "loss": 0.12520097494125365, + "step": 50220 + }, + { + "epoch": 0.2156478881704919, + "grad_norm": 0.7219252586364746, + "learning_rate": 7.877383303295017e-05, + "loss": 0.3051914691925049, + "step": 50230 + }, + { + "epoch": 0.21569082026051192, + "grad_norm": 0.18801699578762054, + "learning_rate": 7.876952131283255e-05, + "loss": 0.35453855991363525, + "step": 50240 + }, + { + "epoch": 0.21573375235053194, + "grad_norm": 5.704795837402344, + "learning_rate": 7.876520959271491e-05, + "loss": 0.4324212074279785, + "step": 50250 + }, + { + "epoch": 0.21577668444055192, + "grad_norm": 9.323872566223145, + "learning_rate": 7.87608978725973e-05, + "loss": 0.3148200273513794, + "step": 50260 + }, + { + "epoch": 0.21581961653057194, + "grad_norm": 0.027610288932919502, + "learning_rate": 7.875658615247968e-05, + "loss": 0.32387876510620117, + "step": 50270 + }, + { + "epoch": 0.21586254862059195, + "grad_norm": 0.2569706439971924, + "learning_rate": 7.875227443236206e-05, + "loss": 0.2268768310546875, + "step": 50280 + }, + { + "epoch": 0.21590548071061194, + "grad_norm": 2.0304300785064697, + "learning_rate": 7.874796271224443e-05, + "loss": 0.16933313608169556, + "step": 50290 + }, + { + "epoch": 0.21594841280063196, + "grad_norm": 5.008077621459961, + "learning_rate": 7.874365099212681e-05, + "loss": 0.20965242385864258, + "step": 50300 + }, + { + "epoch": 0.21599134489065197, + "grad_norm": 0.01447244081646204, + "learning_rate": 7.873933927200919e-05, + "loss": 0.16726946830749512, + "step": 50310 + }, + { + "epoch": 0.21603427698067199, + "grad_norm": 0.36545318365097046, + "learning_rate": 7.873502755189157e-05, + "loss": 0.10592725276947021, + "step": 50320 + }, + { + "epoch": 0.21607720907069197, + "grad_norm": 0.00590835977345705, + "learning_rate": 7.873071583177393e-05, + "loss": 0.4637115001678467, + "step": 50330 + }, + { + "epoch": 0.216120141160712, + "grad_norm": 0.16791151463985443, + "learning_rate": 7.87264041116563e-05, + "loss": 0.1303458571434021, + "step": 50340 + }, + { + "epoch": 0.216163073250732, + "grad_norm": 0.9499484300613403, + "learning_rate": 7.872209239153868e-05, + "loss": 0.19090571403503417, + "step": 50350 + }, + { + "epoch": 0.216206005340752, + "grad_norm": 0.0053385356441140175, + "learning_rate": 7.871778067142106e-05, + "loss": 0.13217315673828126, + "step": 50360 + }, + { + "epoch": 0.216248937430772, + "grad_norm": 1.6161103248596191, + "learning_rate": 7.871346895130344e-05, + "loss": 0.27801764011383057, + "step": 50370 + }, + { + "epoch": 0.21629186952079202, + "grad_norm": 1.6639463901519775, + "learning_rate": 7.870915723118582e-05, + "loss": 0.09341565370559693, + "step": 50380 + }, + { + "epoch": 0.216334801610812, + "grad_norm": 1.7329559326171875, + "learning_rate": 7.87048455110682e-05, + "loss": 0.24032018184661866, + "step": 50390 + }, + { + "epoch": 0.21637773370083202, + "grad_norm": 1.2680294513702393, + "learning_rate": 7.870053379095057e-05, + "loss": 0.3167436599731445, + "step": 50400 + }, + { + "epoch": 0.21642066579085203, + "grad_norm": 0.31326672434806824, + "learning_rate": 7.869622207083293e-05, + "loss": 0.10480024814605712, + "step": 50410 + }, + { + "epoch": 0.21646359788087205, + "grad_norm": 0.006115011405199766, + "learning_rate": 7.869191035071531e-05, + "loss": 0.12340999841690063, + "step": 50420 + }, + { + "epoch": 0.21650652997089204, + "grad_norm": 1.273048996925354, + "learning_rate": 7.868759863059769e-05, + "loss": 0.19264719486236573, + "step": 50430 + }, + { + "epoch": 0.21654946206091205, + "grad_norm": 0.18304884433746338, + "learning_rate": 7.868328691048007e-05, + "loss": 0.20949811935424806, + "step": 50440 + }, + { + "epoch": 0.21659239415093207, + "grad_norm": 1.0161921977996826, + "learning_rate": 7.867897519036244e-05, + "loss": 0.19382405281066895, + "step": 50450 + }, + { + "epoch": 0.21663532624095205, + "grad_norm": 0.22904996573925018, + "learning_rate": 7.867466347024482e-05, + "loss": 0.25605947971343995, + "step": 50460 + }, + { + "epoch": 0.21667825833097207, + "grad_norm": 0.9911039471626282, + "learning_rate": 7.86703517501272e-05, + "loss": 0.2965787410736084, + "step": 50470 + }, + { + "epoch": 0.21672119042099208, + "grad_norm": 4.8198137283325195, + "learning_rate": 7.866604003000958e-05, + "loss": 0.4654858589172363, + "step": 50480 + }, + { + "epoch": 0.21676412251101207, + "grad_norm": 0.00046917685540392995, + "learning_rate": 7.866172830989195e-05, + "loss": 0.06836865544319153, + "step": 50490 + }, + { + "epoch": 0.21680705460103208, + "grad_norm": 2.65632700920105, + "learning_rate": 7.865741658977433e-05, + "loss": 0.24790689945220948, + "step": 50500 + }, + { + "epoch": 0.2168499866910521, + "grad_norm": 0.031371332705020905, + "learning_rate": 7.865310486965671e-05, + "loss": 0.14242541790008545, + "step": 50510 + }, + { + "epoch": 0.2168929187810721, + "grad_norm": 0.8935719728469849, + "learning_rate": 7.864879314953909e-05, + "loss": 0.33764503002166746, + "step": 50520 + }, + { + "epoch": 0.2169358508710921, + "grad_norm": 0.015160330571234226, + "learning_rate": 7.864448142942146e-05, + "loss": 0.20800716876983644, + "step": 50530 + }, + { + "epoch": 0.21697878296111212, + "grad_norm": 0.004046137910336256, + "learning_rate": 7.864016970930384e-05, + "loss": 0.06134725213050842, + "step": 50540 + }, + { + "epoch": 0.21702171505113213, + "grad_norm": 0.5130903124809265, + "learning_rate": 7.863585798918622e-05, + "loss": 0.4054374694824219, + "step": 50550 + }, + { + "epoch": 0.21706464714115212, + "grad_norm": 1.5228362083435059, + "learning_rate": 7.86315462690686e-05, + "loss": 0.2966635227203369, + "step": 50560 + }, + { + "epoch": 0.21710757923117213, + "grad_norm": 3.034628391265869, + "learning_rate": 7.862723454895097e-05, + "loss": 0.4351348400115967, + "step": 50570 + }, + { + "epoch": 0.21715051132119215, + "grad_norm": 0.5908950567245483, + "learning_rate": 7.862292282883334e-05, + "loss": 0.1472018241882324, + "step": 50580 + }, + { + "epoch": 0.21719344341121213, + "grad_norm": 4.488155364990234, + "learning_rate": 7.861861110871571e-05, + "loss": 0.3412757635116577, + "step": 50590 + }, + { + "epoch": 0.21723637550123215, + "grad_norm": 0.028289830312132835, + "learning_rate": 7.861429938859809e-05, + "loss": 0.35039472579956055, + "step": 50600 + }, + { + "epoch": 0.21727930759125216, + "grad_norm": 0.017389526590704918, + "learning_rate": 7.860998766848047e-05, + "loss": 0.23729236125946046, + "step": 50610 + }, + { + "epoch": 0.21732223968127215, + "grad_norm": 0.01772734522819519, + "learning_rate": 7.860567594836285e-05, + "loss": 0.11716750860214234, + "step": 50620 + }, + { + "epoch": 0.21736517177129216, + "grad_norm": 0.2764197587966919, + "learning_rate": 7.860136422824522e-05, + "loss": 0.3149959325790405, + "step": 50630 + }, + { + "epoch": 0.21740810386131218, + "grad_norm": 0.022921325638890266, + "learning_rate": 7.85970525081276e-05, + "loss": 0.22837281227111816, + "step": 50640 + }, + { + "epoch": 0.2174510359513322, + "grad_norm": 1.474914312362671, + "learning_rate": 7.859274078800998e-05, + "loss": 0.2354658842086792, + "step": 50650 + }, + { + "epoch": 0.21749396804135218, + "grad_norm": 0.021082280203700066, + "learning_rate": 7.858842906789234e-05, + "loss": 0.0798882246017456, + "step": 50660 + }, + { + "epoch": 0.2175369001313722, + "grad_norm": 2.043031692504883, + "learning_rate": 7.858411734777472e-05, + "loss": 0.34849650859832765, + "step": 50670 + }, + { + "epoch": 0.2175798322213922, + "grad_norm": 4.649127006530762, + "learning_rate": 7.85798056276571e-05, + "loss": 0.27826337814331054, + "step": 50680 + }, + { + "epoch": 0.2176227643114122, + "grad_norm": 0.021960316225886345, + "learning_rate": 7.857549390753947e-05, + "loss": 0.5113425254821777, + "step": 50690 + }, + { + "epoch": 0.2176656964014322, + "grad_norm": 2.443013906478882, + "learning_rate": 7.857118218742185e-05, + "loss": 0.14948725700378418, + "step": 50700 + }, + { + "epoch": 0.21770862849145223, + "grad_norm": 0.029022017493844032, + "learning_rate": 7.856687046730423e-05, + "loss": 0.44149346351623536, + "step": 50710 + }, + { + "epoch": 0.21775156058147221, + "grad_norm": 0.3463537096977234, + "learning_rate": 7.85625587471866e-05, + "loss": 0.3035578727722168, + "step": 50720 + }, + { + "epoch": 0.21779449267149223, + "grad_norm": 0.6163672804832458, + "learning_rate": 7.855824702706898e-05, + "loss": 0.1137476921081543, + "step": 50730 + }, + { + "epoch": 0.21783742476151224, + "grad_norm": 0.005244885571300983, + "learning_rate": 7.855393530695136e-05, + "loss": 0.13025596141815185, + "step": 50740 + }, + { + "epoch": 0.21788035685153226, + "grad_norm": 0.7355568408966064, + "learning_rate": 7.854962358683374e-05, + "loss": 0.2603000164031982, + "step": 50750 + }, + { + "epoch": 0.21792328894155225, + "grad_norm": 1.664304494857788, + "learning_rate": 7.854531186671611e-05, + "loss": 0.19281330108642578, + "step": 50760 + }, + { + "epoch": 0.21796622103157226, + "grad_norm": 0.004859315697103739, + "learning_rate": 7.854100014659849e-05, + "loss": 0.2792229175567627, + "step": 50770 + }, + { + "epoch": 0.21800915312159228, + "grad_norm": 0.006048300769180059, + "learning_rate": 7.853668842648087e-05, + "loss": 0.20728557109832763, + "step": 50780 + }, + { + "epoch": 0.21805208521161226, + "grad_norm": 0.02996540069580078, + "learning_rate": 7.853237670636325e-05, + "loss": 0.14032127857208251, + "step": 50790 + }, + { + "epoch": 0.21809501730163228, + "grad_norm": 1.4114487171173096, + "learning_rate": 7.852806498624562e-05, + "loss": 0.2734131097793579, + "step": 50800 + }, + { + "epoch": 0.2181379493916523, + "grad_norm": 0.00662500225007534, + "learning_rate": 7.8523753266128e-05, + "loss": 0.11571886539459228, + "step": 50810 + }, + { + "epoch": 0.21818088148167228, + "grad_norm": 0.08047362416982651, + "learning_rate": 7.851944154601036e-05, + "loss": 0.07731739282608033, + "step": 50820 + }, + { + "epoch": 0.2182238135716923, + "grad_norm": 9.156729698181152, + "learning_rate": 7.851512982589274e-05, + "loss": 0.3950646162033081, + "step": 50830 + }, + { + "epoch": 0.2182667456617123, + "grad_norm": 0.11400093883275986, + "learning_rate": 7.851081810577512e-05, + "loss": 0.09669422507286071, + "step": 50840 + }, + { + "epoch": 0.21830967775173232, + "grad_norm": 0.0044946083799004555, + "learning_rate": 7.85065063856575e-05, + "loss": 0.22031335830688475, + "step": 50850 + }, + { + "epoch": 0.2183526098417523, + "grad_norm": 1.9205845594406128, + "learning_rate": 7.850219466553987e-05, + "loss": 0.47330074310302733, + "step": 50860 + }, + { + "epoch": 0.21839554193177232, + "grad_norm": 0.13260437548160553, + "learning_rate": 7.849788294542225e-05, + "loss": 0.09311110377311707, + "step": 50870 + }, + { + "epoch": 0.21843847402179234, + "grad_norm": 1.7268750667572021, + "learning_rate": 7.849357122530463e-05, + "loss": 0.22776448726654053, + "step": 50880 + }, + { + "epoch": 0.21848140611181233, + "grad_norm": 0.041452158242464066, + "learning_rate": 7.8489259505187e-05, + "loss": 0.2687647581100464, + "step": 50890 + }, + { + "epoch": 0.21852433820183234, + "grad_norm": 0.011913030408322811, + "learning_rate": 7.848494778506938e-05, + "loss": 0.14741060733795167, + "step": 50900 + }, + { + "epoch": 0.21856727029185236, + "grad_norm": 1.0277550220489502, + "learning_rate": 7.848063606495175e-05, + "loss": 0.1561746120452881, + "step": 50910 + }, + { + "epoch": 0.21861020238187234, + "grad_norm": 2.3949055671691895, + "learning_rate": 7.847632434483412e-05, + "loss": 0.2793707847595215, + "step": 50920 + }, + { + "epoch": 0.21865313447189236, + "grad_norm": 0.0005611925153061748, + "learning_rate": 7.84720126247165e-05, + "loss": 0.1184156894683838, + "step": 50930 + }, + { + "epoch": 0.21869606656191237, + "grad_norm": 0.06453302502632141, + "learning_rate": 7.846770090459888e-05, + "loss": 0.2204530954360962, + "step": 50940 + }, + { + "epoch": 0.2187389986519324, + "grad_norm": 7.096034049987793, + "learning_rate": 7.846338918448126e-05, + "loss": 0.25634169578552246, + "step": 50950 + }, + { + "epoch": 0.21878193074195237, + "grad_norm": 5.026423454284668, + "learning_rate": 7.845907746436363e-05, + "loss": 0.2579002857208252, + "step": 50960 + }, + { + "epoch": 0.2188248628319724, + "grad_norm": 0.1146092489361763, + "learning_rate": 7.845476574424601e-05, + "loss": 0.2994891405105591, + "step": 50970 + }, + { + "epoch": 0.2188677949219924, + "grad_norm": 4.33756685256958, + "learning_rate": 7.845045402412839e-05, + "loss": 0.2136240243911743, + "step": 50980 + }, + { + "epoch": 0.2189107270120124, + "grad_norm": 0.13741017878055573, + "learning_rate": 7.844614230401077e-05, + "loss": 0.3473642826080322, + "step": 50990 + }, + { + "epoch": 0.2189536591020324, + "grad_norm": 4.680849552154541, + "learning_rate": 7.844183058389314e-05, + "loss": 0.3754714012145996, + "step": 51000 + }, + { + "epoch": 0.2189536591020324, + "eval_loss": 0.4405979812145233, + "eval_runtime": 27.5677, + "eval_samples_per_second": 3.627, + "eval_steps_per_second": 3.627, + "step": 51000 + }, + { + "epoch": 0.21899659119205242, + "grad_norm": 0.8056715130805969, + "learning_rate": 7.843751886377552e-05, + "loss": 0.283284330368042, + "step": 51010 + }, + { + "epoch": 0.2190395232820724, + "grad_norm": 1.782196283340454, + "learning_rate": 7.84332071436579e-05, + "loss": 0.2841160297393799, + "step": 51020 + }, + { + "epoch": 0.21908245537209242, + "grad_norm": 0.09319104999303818, + "learning_rate": 7.842889542354028e-05, + "loss": 0.4059459686279297, + "step": 51030 + }, + { + "epoch": 0.21912538746211244, + "grad_norm": 0.29425910115242004, + "learning_rate": 7.842458370342265e-05, + "loss": 0.2115602970123291, + "step": 51040 + }, + { + "epoch": 0.21916831955213242, + "grad_norm": 4.665130615234375, + "learning_rate": 7.842027198330503e-05, + "loss": 0.0615256130695343, + "step": 51050 + }, + { + "epoch": 0.21921125164215244, + "grad_norm": 0.13821355998516083, + "learning_rate": 7.841596026318741e-05, + "loss": 0.1643990159034729, + "step": 51060 + }, + { + "epoch": 0.21925418373217245, + "grad_norm": 0.00384420994669199, + "learning_rate": 7.841164854306977e-05, + "loss": 0.08403640389442443, + "step": 51070 + }, + { + "epoch": 0.21929711582219247, + "grad_norm": 0.0846899077296257, + "learning_rate": 7.840733682295215e-05, + "loss": 0.34184112548828127, + "step": 51080 + }, + { + "epoch": 0.21934004791221245, + "grad_norm": 0.30739402770996094, + "learning_rate": 7.840302510283453e-05, + "loss": 0.17862628698348998, + "step": 51090 + }, + { + "epoch": 0.21938298000223247, + "grad_norm": 0.09101126343011856, + "learning_rate": 7.83987133827169e-05, + "loss": 0.3576796531677246, + "step": 51100 + }, + { + "epoch": 0.21942591209225248, + "grad_norm": 3.593675374984741, + "learning_rate": 7.839440166259928e-05, + "loss": 0.13683393001556396, + "step": 51110 + }, + { + "epoch": 0.21946884418227247, + "grad_norm": 0.01158614456653595, + "learning_rate": 7.839008994248166e-05, + "loss": 0.24310684204101562, + "step": 51120 + }, + { + "epoch": 0.21951177627229249, + "grad_norm": 0.03459831327199936, + "learning_rate": 7.838577822236404e-05, + "loss": 0.2010591983795166, + "step": 51130 + }, + { + "epoch": 0.2195547083623125, + "grad_norm": 0.8882039785385132, + "learning_rate": 7.838146650224641e-05, + "loss": 0.23802576065063477, + "step": 51140 + }, + { + "epoch": 0.2195976404523325, + "grad_norm": 0.25156083703041077, + "learning_rate": 7.837715478212878e-05, + "loss": 0.19319934844970704, + "step": 51150 + }, + { + "epoch": 0.2196405725423525, + "grad_norm": 0.1898663491010666, + "learning_rate": 7.837284306201115e-05, + "loss": 0.21830298900604247, + "step": 51160 + }, + { + "epoch": 0.21968350463237252, + "grad_norm": 0.0030881662387400866, + "learning_rate": 7.836853134189353e-05, + "loss": 0.272199010848999, + "step": 51170 + }, + { + "epoch": 0.21972643672239253, + "grad_norm": 0.13047702610492706, + "learning_rate": 7.836421962177591e-05, + "loss": 0.2839015483856201, + "step": 51180 + }, + { + "epoch": 0.21976936881241252, + "grad_norm": 0.03696379438042641, + "learning_rate": 7.835990790165829e-05, + "loss": 0.2720863103866577, + "step": 51190 + }, + { + "epoch": 0.21981230090243253, + "grad_norm": 0.05394358187913895, + "learning_rate": 7.835559618154066e-05, + "loss": 0.4942734718322754, + "step": 51200 + }, + { + "epoch": 0.21985523299245255, + "grad_norm": 0.6661820411682129, + "learning_rate": 7.835128446142304e-05, + "loss": 0.1750645637512207, + "step": 51210 + }, + { + "epoch": 0.21989816508247254, + "grad_norm": 0.453449010848999, + "learning_rate": 7.834697274130542e-05, + "loss": 0.17545816898345948, + "step": 51220 + }, + { + "epoch": 0.21994109717249255, + "grad_norm": 1.2218490839004517, + "learning_rate": 7.83426610211878e-05, + "loss": 0.2454913854598999, + "step": 51230 + }, + { + "epoch": 0.21998402926251256, + "grad_norm": 3.4764392375946045, + "learning_rate": 7.833834930107017e-05, + "loss": 0.43323559761047364, + "step": 51240 + }, + { + "epoch": 0.22002696135253255, + "grad_norm": 0.006227872334420681, + "learning_rate": 7.833403758095255e-05, + "loss": 0.22940664291381835, + "step": 51250 + }, + { + "epoch": 0.22006989344255257, + "grad_norm": 0.045368924736976624, + "learning_rate": 7.832972586083493e-05, + "loss": 0.4383875370025635, + "step": 51260 + }, + { + "epoch": 0.22011282553257258, + "grad_norm": 1.0571563243865967, + "learning_rate": 7.83254141407173e-05, + "loss": 0.374654483795166, + "step": 51270 + }, + { + "epoch": 0.2201557576225926, + "grad_norm": 0.04682515189051628, + "learning_rate": 7.832110242059968e-05, + "loss": 0.38846986293792723, + "step": 51280 + }, + { + "epoch": 0.22019868971261258, + "grad_norm": 4.054689407348633, + "learning_rate": 7.831679070048206e-05, + "loss": 0.4694014072418213, + "step": 51290 + }, + { + "epoch": 0.2202416218026326, + "grad_norm": 0.7244497537612915, + "learning_rate": 7.831247898036444e-05, + "loss": 0.17690551280975342, + "step": 51300 + }, + { + "epoch": 0.2202845538926526, + "grad_norm": 1.9599987268447876, + "learning_rate": 7.830816726024681e-05, + "loss": 0.28881473541259767, + "step": 51310 + }, + { + "epoch": 0.2203274859826726, + "grad_norm": 0.9945737719535828, + "learning_rate": 7.830385554012918e-05, + "loss": 0.2523778438568115, + "step": 51320 + }, + { + "epoch": 0.22037041807269261, + "grad_norm": 0.011761073023080826, + "learning_rate": 7.829954382001156e-05, + "loss": 0.08957783579826355, + "step": 51330 + }, + { + "epoch": 0.22041335016271263, + "grad_norm": 0.03757171332836151, + "learning_rate": 7.829523209989393e-05, + "loss": 0.2829038858413696, + "step": 51340 + }, + { + "epoch": 0.22045628225273262, + "grad_norm": 4.216167449951172, + "learning_rate": 7.829092037977631e-05, + "loss": 0.33401927947998045, + "step": 51350 + }, + { + "epoch": 0.22049921434275263, + "grad_norm": 0.00842796266078949, + "learning_rate": 7.828660865965869e-05, + "loss": 0.34130475521087644, + "step": 51360 + }, + { + "epoch": 0.22054214643277265, + "grad_norm": 1.3092318773269653, + "learning_rate": 7.828229693954106e-05, + "loss": 0.37724757194519043, + "step": 51370 + }, + { + "epoch": 0.22058507852279266, + "grad_norm": 6.285627365112305, + "learning_rate": 7.827798521942344e-05, + "loss": 0.22345025539398194, + "step": 51380 + }, + { + "epoch": 0.22062801061281265, + "grad_norm": 1.7613346576690674, + "learning_rate": 7.827367349930582e-05, + "loss": 0.3523125410079956, + "step": 51390 + }, + { + "epoch": 0.22067094270283266, + "grad_norm": 1.3106528520584106, + "learning_rate": 7.826936177918818e-05, + "loss": 0.2903813362121582, + "step": 51400 + }, + { + "epoch": 0.22071387479285268, + "grad_norm": 0.02184089459478855, + "learning_rate": 7.826505005907056e-05, + "loss": 0.142237389087677, + "step": 51410 + }, + { + "epoch": 0.22075680688287266, + "grad_norm": 0.05910355970263481, + "learning_rate": 7.826073833895294e-05, + "loss": 0.0735704779624939, + "step": 51420 + }, + { + "epoch": 0.22079973897289268, + "grad_norm": 1.5817798376083374, + "learning_rate": 7.825642661883531e-05, + "loss": 0.345813250541687, + "step": 51430 + }, + { + "epoch": 0.2208426710629127, + "grad_norm": 0.017727894708514214, + "learning_rate": 7.825211489871769e-05, + "loss": 0.30463998317718505, + "step": 51440 + }, + { + "epoch": 0.22088560315293268, + "grad_norm": 21.531631469726562, + "learning_rate": 7.824780317860008e-05, + "loss": 0.4145090103149414, + "step": 51450 + }, + { + "epoch": 0.2209285352429527, + "grad_norm": 0.38469067215919495, + "learning_rate": 7.824349145848246e-05, + "loss": 0.3056338310241699, + "step": 51460 + }, + { + "epoch": 0.2209714673329727, + "grad_norm": 3.433922052383423, + "learning_rate": 7.823917973836484e-05, + "loss": 0.1726033091545105, + "step": 51470 + }, + { + "epoch": 0.2210143994229927, + "grad_norm": 0.021967153996229172, + "learning_rate": 7.82348680182472e-05, + "loss": 0.3242464303970337, + "step": 51480 + }, + { + "epoch": 0.2210573315130127, + "grad_norm": 0.008535942994058132, + "learning_rate": 7.823055629812958e-05, + "loss": 0.11328502893447875, + "step": 51490 + }, + { + "epoch": 0.22110026360303273, + "grad_norm": 0.030140530318021774, + "learning_rate": 7.822624457801196e-05, + "loss": 0.21845624446868897, + "step": 51500 + }, + { + "epoch": 0.22114319569305274, + "grad_norm": 0.3550052046775818, + "learning_rate": 7.822193285789433e-05, + "loss": 0.23196914196014404, + "step": 51510 + }, + { + "epoch": 0.22118612778307273, + "grad_norm": 0.03214557096362114, + "learning_rate": 7.821762113777671e-05, + "loss": 0.3088571071624756, + "step": 51520 + }, + { + "epoch": 0.22122905987309274, + "grad_norm": 0.009252496063709259, + "learning_rate": 7.821330941765909e-05, + "loss": 0.2549436569213867, + "step": 51530 + }, + { + "epoch": 0.22127199196311276, + "grad_norm": 0.009182988665997982, + "learning_rate": 7.820899769754147e-05, + "loss": 0.13270962238311768, + "step": 51540 + }, + { + "epoch": 0.22131492405313274, + "grad_norm": 0.061013370752334595, + "learning_rate": 7.820468597742384e-05, + "loss": 0.4141830921173096, + "step": 51550 + }, + { + "epoch": 0.22135785614315276, + "grad_norm": 0.0408286526799202, + "learning_rate": 7.820037425730621e-05, + "loss": 0.232647705078125, + "step": 51560 + }, + { + "epoch": 0.22140078823317277, + "grad_norm": 0.02865544892847538, + "learning_rate": 7.819606253718858e-05, + "loss": 0.12392929792404175, + "step": 51570 + }, + { + "epoch": 0.22144372032319276, + "grad_norm": 0.14595501124858856, + "learning_rate": 7.819175081707096e-05, + "loss": 0.2198183059692383, + "step": 51580 + }, + { + "epoch": 0.22148665241321278, + "grad_norm": 0.2862647473812103, + "learning_rate": 7.818743909695334e-05, + "loss": 0.22051844596862794, + "step": 51590 + }, + { + "epoch": 0.2215295845032328, + "grad_norm": 1.7248533964157104, + "learning_rate": 7.818312737683572e-05, + "loss": 0.32520184516906736, + "step": 51600 + }, + { + "epoch": 0.2215725165932528, + "grad_norm": 0.4680907726287842, + "learning_rate": 7.81788156567181e-05, + "loss": 0.14819936752319335, + "step": 51610 + }, + { + "epoch": 0.2216154486832728, + "grad_norm": 0.028154416009783745, + "learning_rate": 7.817450393660047e-05, + "loss": 0.16450451612472533, + "step": 51620 + }, + { + "epoch": 0.2216583807732928, + "grad_norm": 5.077931880950928, + "learning_rate": 7.817019221648285e-05, + "loss": 0.36899194717407224, + "step": 51630 + }, + { + "epoch": 0.22170131286331282, + "grad_norm": 3.5552923679351807, + "learning_rate": 7.816588049636523e-05, + "loss": 0.4686445713043213, + "step": 51640 + }, + { + "epoch": 0.2217442449533328, + "grad_norm": 0.06549005210399628, + "learning_rate": 7.816156877624759e-05, + "loss": 0.16026378870010377, + "step": 51650 + }, + { + "epoch": 0.22178717704335282, + "grad_norm": 0.31749969720840454, + "learning_rate": 7.815725705612997e-05, + "loss": 0.13566253185272217, + "step": 51660 + }, + { + "epoch": 0.22183010913337284, + "grad_norm": 0.7159654498100281, + "learning_rate": 7.815294533601236e-05, + "loss": 0.2575033903121948, + "step": 51670 + }, + { + "epoch": 0.22187304122339283, + "grad_norm": 0.003270721761509776, + "learning_rate": 7.814863361589473e-05, + "loss": 0.21550092697143555, + "step": 51680 + }, + { + "epoch": 0.22191597331341284, + "grad_norm": 1.244299054145813, + "learning_rate": 7.814432189577711e-05, + "loss": 0.45678186416625977, + "step": 51690 + }, + { + "epoch": 0.22195890540343285, + "grad_norm": 0.0017754979198798537, + "learning_rate": 7.814001017565949e-05, + "loss": 0.21440205574035645, + "step": 51700 + }, + { + "epoch": 0.22200183749345287, + "grad_norm": 0.15123091638088226, + "learning_rate": 7.813569845554187e-05, + "loss": 0.32442948818206785, + "step": 51710 + }, + { + "epoch": 0.22204476958347286, + "grad_norm": 0.16990530490875244, + "learning_rate": 7.813138673542424e-05, + "loss": 0.3471397638320923, + "step": 51720 + }, + { + "epoch": 0.22208770167349287, + "grad_norm": 0.029147816821932793, + "learning_rate": 7.812707501530661e-05, + "loss": 0.1523631453514099, + "step": 51730 + }, + { + "epoch": 0.22213063376351289, + "grad_norm": 4.789177417755127, + "learning_rate": 7.812276329518899e-05, + "loss": 0.32821714878082275, + "step": 51740 + }, + { + "epoch": 0.22217356585353287, + "grad_norm": 0.01579742506146431, + "learning_rate": 7.811845157507136e-05, + "loss": 0.3338596820831299, + "step": 51750 + }, + { + "epoch": 0.2222164979435529, + "grad_norm": 0.010001111775636673, + "learning_rate": 7.811413985495374e-05, + "loss": 0.279772424697876, + "step": 51760 + }, + { + "epoch": 0.2222594300335729, + "grad_norm": 0.09585186839103699, + "learning_rate": 7.810982813483612e-05, + "loss": 0.322173547744751, + "step": 51770 + }, + { + "epoch": 0.2223023621235929, + "grad_norm": 0.8465486764907837, + "learning_rate": 7.81055164147185e-05, + "loss": 0.26959757804870604, + "step": 51780 + }, + { + "epoch": 0.2223452942136129, + "grad_norm": 1.0950313806533813, + "learning_rate": 7.810120469460087e-05, + "loss": 0.30342302322387693, + "step": 51790 + }, + { + "epoch": 0.22238822630363292, + "grad_norm": 2.7534327507019043, + "learning_rate": 7.809689297448325e-05, + "loss": 0.2312720537185669, + "step": 51800 + }, + { + "epoch": 0.22243115839365293, + "grad_norm": 4.048535346984863, + "learning_rate": 7.809258125436561e-05, + "loss": 0.3151477575302124, + "step": 51810 + }, + { + "epoch": 0.22247409048367292, + "grad_norm": 0.23564355075359344, + "learning_rate": 7.808826953424799e-05, + "loss": 0.2035139799118042, + "step": 51820 + }, + { + "epoch": 0.22251702257369294, + "grad_norm": 0.7216160893440247, + "learning_rate": 7.808395781413037e-05, + "loss": 0.30786886215209963, + "step": 51830 + }, + { + "epoch": 0.22255995466371295, + "grad_norm": 6.129444122314453, + "learning_rate": 7.807964609401275e-05, + "loss": 0.14896565675735474, + "step": 51840 + }, + { + "epoch": 0.22260288675373294, + "grad_norm": 0.6112780570983887, + "learning_rate": 7.807533437389512e-05, + "loss": 0.2353865385055542, + "step": 51850 + }, + { + "epoch": 0.22264581884375295, + "grad_norm": 0.012277986854314804, + "learning_rate": 7.80710226537775e-05, + "loss": 0.18124105930328369, + "step": 51860 + }, + { + "epoch": 0.22268875093377297, + "grad_norm": 0.8171926140785217, + "learning_rate": 7.806671093365988e-05, + "loss": 0.23719098567962646, + "step": 51870 + }, + { + "epoch": 0.22273168302379295, + "grad_norm": 0.03393758833408356, + "learning_rate": 7.806239921354225e-05, + "loss": 0.37589733600616454, + "step": 51880 + }, + { + "epoch": 0.22277461511381297, + "grad_norm": 0.15871426463127136, + "learning_rate": 7.805808749342463e-05, + "loss": 0.2771576404571533, + "step": 51890 + }, + { + "epoch": 0.22281754720383298, + "grad_norm": 0.05300932005047798, + "learning_rate": 7.805377577330701e-05, + "loss": 0.0938086450099945, + "step": 51900 + }, + { + "epoch": 0.22286047929385297, + "grad_norm": 1.3278213739395142, + "learning_rate": 7.804946405318939e-05, + "loss": 0.364089298248291, + "step": 51910 + }, + { + "epoch": 0.22290341138387298, + "grad_norm": 2.5431289672851562, + "learning_rate": 7.804515233307176e-05, + "loss": 0.20374348163604736, + "step": 51920 + }, + { + "epoch": 0.222946343473893, + "grad_norm": 0.009660803712904453, + "learning_rate": 7.804084061295414e-05, + "loss": 0.016839735209941864, + "step": 51930 + }, + { + "epoch": 0.22298927556391301, + "grad_norm": 30.04834747314453, + "learning_rate": 7.803652889283652e-05, + "loss": 0.23668146133422852, + "step": 51940 + }, + { + "epoch": 0.223032207653933, + "grad_norm": 0.14079128205776215, + "learning_rate": 7.80322171727189e-05, + "loss": 0.16221317052841186, + "step": 51950 + }, + { + "epoch": 0.22307513974395302, + "grad_norm": 0.22580532729625702, + "learning_rate": 7.802790545260127e-05, + "loss": 0.13811932802200316, + "step": 51960 + }, + { + "epoch": 0.22311807183397303, + "grad_norm": 0.21858012676239014, + "learning_rate": 7.802359373248364e-05, + "loss": 0.21348247528076172, + "step": 51970 + }, + { + "epoch": 0.22316100392399302, + "grad_norm": 1.3461596965789795, + "learning_rate": 7.801928201236601e-05, + "loss": 0.3414191246032715, + "step": 51980 + }, + { + "epoch": 0.22320393601401303, + "grad_norm": 0.048035021871328354, + "learning_rate": 7.801497029224839e-05, + "loss": 0.17132971286773682, + "step": 51990 + }, + { + "epoch": 0.22324686810403305, + "grad_norm": 1.9410839080810547, + "learning_rate": 7.801065857213077e-05, + "loss": 0.35149712562561036, + "step": 52000 + }, + { + "epoch": 0.22324686810403305, + "eval_loss": 0.4512121081352234, + "eval_runtime": 27.411, + "eval_samples_per_second": 3.648, + "eval_steps_per_second": 3.648, + "step": 52000 + }, + { + "epoch": 0.22328980019405303, + "grad_norm": 0.03847186267375946, + "learning_rate": 7.800634685201315e-05, + "loss": 0.1860820770263672, + "step": 52010 + }, + { + "epoch": 0.22333273228407305, + "grad_norm": 3.8256406784057617, + "learning_rate": 7.800203513189552e-05, + "loss": 0.4085991859436035, + "step": 52020 + }, + { + "epoch": 0.22337566437409306, + "grad_norm": 0.0686342716217041, + "learning_rate": 7.79977234117779e-05, + "loss": 0.1791319727897644, + "step": 52030 + }, + { + "epoch": 0.22341859646411308, + "grad_norm": 0.07664453983306885, + "learning_rate": 7.799341169166028e-05, + "loss": 0.1927349805831909, + "step": 52040 + }, + { + "epoch": 0.22346152855413307, + "grad_norm": 0.023954255506396294, + "learning_rate": 7.798909997154266e-05, + "loss": 0.07940815091133117, + "step": 52050 + }, + { + "epoch": 0.22350446064415308, + "grad_norm": 0.02951730787754059, + "learning_rate": 7.798478825142502e-05, + "loss": 0.2144526720046997, + "step": 52060 + }, + { + "epoch": 0.2235473927341731, + "grad_norm": 1.5430526733398438, + "learning_rate": 7.79804765313074e-05, + "loss": 0.24770851135253907, + "step": 52070 + }, + { + "epoch": 0.22359032482419308, + "grad_norm": 0.08437793701887131, + "learning_rate": 7.797616481118977e-05, + "loss": 0.24535858631134033, + "step": 52080 + }, + { + "epoch": 0.2236332569142131, + "grad_norm": 4.446104049682617, + "learning_rate": 7.797185309107215e-05, + "loss": 0.3557150840759277, + "step": 52090 + }, + { + "epoch": 0.2236761890042331, + "grad_norm": 0.9419364929199219, + "learning_rate": 7.796754137095453e-05, + "loss": 0.3013280391693115, + "step": 52100 + }, + { + "epoch": 0.2237191210942531, + "grad_norm": 0.004197876434773207, + "learning_rate": 7.79632296508369e-05, + "loss": 0.197326397895813, + "step": 52110 + }, + { + "epoch": 0.2237620531842731, + "grad_norm": 15.142932891845703, + "learning_rate": 7.795891793071928e-05, + "loss": 0.18400282859802247, + "step": 52120 + }, + { + "epoch": 0.22380498527429313, + "grad_norm": 0.013756810687482357, + "learning_rate": 7.795460621060166e-05, + "loss": 0.1299996018409729, + "step": 52130 + }, + { + "epoch": 0.22384791736431314, + "grad_norm": 10.383787155151367, + "learning_rate": 7.795029449048404e-05, + "loss": 0.28579258918762207, + "step": 52140 + }, + { + "epoch": 0.22389084945433313, + "grad_norm": 1.5638196468353271, + "learning_rate": 7.794598277036642e-05, + "loss": 0.30528721809387205, + "step": 52150 + }, + { + "epoch": 0.22393378154435314, + "grad_norm": 0.033577535301446915, + "learning_rate": 7.794167105024879e-05, + "loss": 0.10886262655258179, + "step": 52160 + }, + { + "epoch": 0.22397671363437316, + "grad_norm": 4.6401872634887695, + "learning_rate": 7.793735933013117e-05, + "loss": 0.33708269596099855, + "step": 52170 + }, + { + "epoch": 0.22401964572439315, + "grad_norm": 0.004224882926791906, + "learning_rate": 7.793304761001355e-05, + "loss": 0.029603365063667297, + "step": 52180 + }, + { + "epoch": 0.22406257781441316, + "grad_norm": 0.027500273659825325, + "learning_rate": 7.792873588989593e-05, + "loss": 0.12923910617828369, + "step": 52190 + }, + { + "epoch": 0.22410550990443318, + "grad_norm": 24.483905792236328, + "learning_rate": 7.79244241697783e-05, + "loss": 0.08223460912704468, + "step": 52200 + }, + { + "epoch": 0.22414844199445316, + "grad_norm": 0.11597905308008194, + "learning_rate": 7.792011244966068e-05, + "loss": 0.1549830913543701, + "step": 52210 + }, + { + "epoch": 0.22419137408447318, + "grad_norm": 1.219777226448059, + "learning_rate": 7.791580072954304e-05, + "loss": 0.2020240068435669, + "step": 52220 + }, + { + "epoch": 0.2242343061744932, + "grad_norm": 0.08328064531087875, + "learning_rate": 7.791148900942542e-05, + "loss": 0.017869633436203004, + "step": 52230 + }, + { + "epoch": 0.2242772382645132, + "grad_norm": 0.06575699150562286, + "learning_rate": 7.79071772893078e-05, + "loss": 0.3781913757324219, + "step": 52240 + }, + { + "epoch": 0.2243201703545332, + "grad_norm": 0.034926220774650574, + "learning_rate": 7.790286556919018e-05, + "loss": 0.2552025556564331, + "step": 52250 + }, + { + "epoch": 0.2243631024445532, + "grad_norm": 2.1973154544830322, + "learning_rate": 7.789855384907255e-05, + "loss": 0.3631817102432251, + "step": 52260 + }, + { + "epoch": 0.22440603453457322, + "grad_norm": 0.027149952948093414, + "learning_rate": 7.789424212895493e-05, + "loss": 0.1516263008117676, + "step": 52270 + }, + { + "epoch": 0.2244489666245932, + "grad_norm": 3.6887452602386475, + "learning_rate": 7.788993040883731e-05, + "loss": 0.18411051034927367, + "step": 52280 + }, + { + "epoch": 0.22449189871461322, + "grad_norm": 0.003940201830118895, + "learning_rate": 7.788561868871969e-05, + "loss": 0.1607654571533203, + "step": 52290 + }, + { + "epoch": 0.22453483080463324, + "grad_norm": 2.560877561569214, + "learning_rate": 7.788130696860205e-05, + "loss": 0.17027069330215455, + "step": 52300 + }, + { + "epoch": 0.22457776289465323, + "grad_norm": 1.8159563541412354, + "learning_rate": 7.787699524848443e-05, + "loss": 0.2474687099456787, + "step": 52310 + }, + { + "epoch": 0.22462069498467324, + "grad_norm": 0.7241863012313843, + "learning_rate": 7.78726835283668e-05, + "loss": 0.34167027473449707, + "step": 52320 + }, + { + "epoch": 0.22466362707469326, + "grad_norm": 0.028914542868733406, + "learning_rate": 7.786837180824918e-05, + "loss": 0.36235129833221436, + "step": 52330 + }, + { + "epoch": 0.22470655916471324, + "grad_norm": 0.052308402955532074, + "learning_rate": 7.786406008813156e-05, + "loss": 0.3940187215805054, + "step": 52340 + }, + { + "epoch": 0.22474949125473326, + "grad_norm": 0.37863701581954956, + "learning_rate": 7.785974836801394e-05, + "loss": 0.07285253405570984, + "step": 52350 + }, + { + "epoch": 0.22479242334475327, + "grad_norm": 0.019866731017827988, + "learning_rate": 7.785543664789631e-05, + "loss": 0.3209496021270752, + "step": 52360 + }, + { + "epoch": 0.2248353554347733, + "grad_norm": 23.818340301513672, + "learning_rate": 7.785112492777869e-05, + "loss": 0.2861109733581543, + "step": 52370 + }, + { + "epoch": 0.22487828752479327, + "grad_norm": 0.8396596908569336, + "learning_rate": 7.784681320766107e-05, + "loss": 0.23325965404510499, + "step": 52380 + }, + { + "epoch": 0.2249212196148133, + "grad_norm": 0.047412410378456116, + "learning_rate": 7.784250148754344e-05, + "loss": 0.5725963592529297, + "step": 52390 + }, + { + "epoch": 0.2249641517048333, + "grad_norm": 0.1795731782913208, + "learning_rate": 7.783818976742582e-05, + "loss": 0.20154619216918945, + "step": 52400 + }, + { + "epoch": 0.2250070837948533, + "grad_norm": 1.0902587175369263, + "learning_rate": 7.78338780473082e-05, + "loss": 0.1822667360305786, + "step": 52410 + }, + { + "epoch": 0.2250500158848733, + "grad_norm": 0.8377712368965149, + "learning_rate": 7.782956632719058e-05, + "loss": 0.1981184482574463, + "step": 52420 + }, + { + "epoch": 0.22509294797489332, + "grad_norm": 1.0425002574920654, + "learning_rate": 7.782525460707295e-05, + "loss": 0.2192686080932617, + "step": 52430 + }, + { + "epoch": 0.2251358800649133, + "grad_norm": 0.003001261968165636, + "learning_rate": 7.782094288695533e-05, + "loss": 0.3034151554107666, + "step": 52440 + }, + { + "epoch": 0.22517881215493332, + "grad_norm": 0.33139747381210327, + "learning_rate": 7.781663116683771e-05, + "loss": 0.10466808080673218, + "step": 52450 + }, + { + "epoch": 0.22522174424495334, + "grad_norm": 7.508664608001709, + "learning_rate": 7.781231944672009e-05, + "loss": 0.25111525058746337, + "step": 52460 + }, + { + "epoch": 0.22526467633497335, + "grad_norm": 0.8088374137878418, + "learning_rate": 7.780800772660245e-05, + "loss": 0.18727099895477295, + "step": 52470 + }, + { + "epoch": 0.22530760842499334, + "grad_norm": 0.007046096492558718, + "learning_rate": 7.780369600648483e-05, + "loss": 0.1947243928909302, + "step": 52480 + }, + { + "epoch": 0.22535054051501335, + "grad_norm": 0.04045693576335907, + "learning_rate": 7.77993842863672e-05, + "loss": 0.10588670969009399, + "step": 52490 + }, + { + "epoch": 0.22539347260503337, + "grad_norm": 0.5456598401069641, + "learning_rate": 7.779507256624958e-05, + "loss": 0.3418902397155762, + "step": 52500 + }, + { + "epoch": 0.22543640469505336, + "grad_norm": 0.7057582139968872, + "learning_rate": 7.779076084613196e-05, + "loss": 0.3276927947998047, + "step": 52510 + }, + { + "epoch": 0.22547933678507337, + "grad_norm": 0.7780113816261292, + "learning_rate": 7.778644912601434e-05, + "loss": 0.1883944869041443, + "step": 52520 + }, + { + "epoch": 0.22552226887509338, + "grad_norm": 1.5828453302383423, + "learning_rate": 7.778213740589671e-05, + "loss": 0.12598092555999757, + "step": 52530 + }, + { + "epoch": 0.22556520096511337, + "grad_norm": 0.01957198604941368, + "learning_rate": 7.777782568577909e-05, + "loss": 0.1770286202430725, + "step": 52540 + }, + { + "epoch": 0.2256081330551334, + "grad_norm": 1.2329034805297852, + "learning_rate": 7.777351396566146e-05, + "loss": 0.12175513505935669, + "step": 52550 + }, + { + "epoch": 0.2256510651451534, + "grad_norm": 0.43187209963798523, + "learning_rate": 7.776920224554383e-05, + "loss": 0.3777428865432739, + "step": 52560 + }, + { + "epoch": 0.22569399723517342, + "grad_norm": 2.0818798542022705, + "learning_rate": 7.776489052542621e-05, + "loss": 0.26906890869140626, + "step": 52570 + }, + { + "epoch": 0.2257369293251934, + "grad_norm": 0.02331465110182762, + "learning_rate": 7.776057880530859e-05, + "loss": 0.1667281985282898, + "step": 52580 + }, + { + "epoch": 0.22577986141521342, + "grad_norm": 0.013449318706989288, + "learning_rate": 7.775626708519096e-05, + "loss": 0.2873204708099365, + "step": 52590 + }, + { + "epoch": 0.22582279350523343, + "grad_norm": 1.743377447128296, + "learning_rate": 7.775195536507334e-05, + "loss": 0.197990345954895, + "step": 52600 + }, + { + "epoch": 0.22586572559525342, + "grad_norm": 0.013360187411308289, + "learning_rate": 7.774764364495572e-05, + "loss": 0.1657071590423584, + "step": 52610 + }, + { + "epoch": 0.22590865768527343, + "grad_norm": 0.7187745571136475, + "learning_rate": 7.77433319248381e-05, + "loss": 0.2067204475402832, + "step": 52620 + }, + { + "epoch": 0.22595158977529345, + "grad_norm": 1.9448988437652588, + "learning_rate": 7.773902020472047e-05, + "loss": 0.40744667053222655, + "step": 52630 + }, + { + "epoch": 0.22599452186531344, + "grad_norm": 11.273550987243652, + "learning_rate": 7.773470848460285e-05, + "loss": 0.05754717588424683, + "step": 52640 + }, + { + "epoch": 0.22603745395533345, + "grad_norm": 1.3800010681152344, + "learning_rate": 7.773039676448523e-05, + "loss": 0.33999783992767335, + "step": 52650 + }, + { + "epoch": 0.22608038604535347, + "grad_norm": 0.02123466692864895, + "learning_rate": 7.77260850443676e-05, + "loss": 0.2705632209777832, + "step": 52660 + }, + { + "epoch": 0.22612331813537348, + "grad_norm": 38.30419921875, + "learning_rate": 7.772177332424998e-05, + "loss": 0.14641456604003905, + "step": 52670 + }, + { + "epoch": 0.22616625022539347, + "grad_norm": 0.04562552645802498, + "learning_rate": 7.771746160413236e-05, + "loss": 0.31936705112457275, + "step": 52680 + }, + { + "epoch": 0.22620918231541348, + "grad_norm": 1.2478864192962646, + "learning_rate": 7.771314988401474e-05, + "loss": 0.2298142910003662, + "step": 52690 + }, + { + "epoch": 0.2262521144054335, + "grad_norm": 2.6947596073150635, + "learning_rate": 7.770883816389712e-05, + "loss": 0.337100887298584, + "step": 52700 + }, + { + "epoch": 0.22629504649545348, + "grad_norm": 0.1122974082827568, + "learning_rate": 7.770452644377949e-05, + "loss": 0.08905890583992004, + "step": 52710 + }, + { + "epoch": 0.2263379785854735, + "grad_norm": 0.01768355444073677, + "learning_rate": 7.770021472366186e-05, + "loss": 0.08977657556533813, + "step": 52720 + }, + { + "epoch": 0.2263809106754935, + "grad_norm": 7.6935954093933105, + "learning_rate": 7.769590300354423e-05, + "loss": 0.35249192714691163, + "step": 52730 + }, + { + "epoch": 0.2264238427655135, + "grad_norm": 0.0323822908103466, + "learning_rate": 7.769159128342661e-05, + "loss": 0.21579148769378662, + "step": 52740 + }, + { + "epoch": 0.22646677485553351, + "grad_norm": 0.024706387892365456, + "learning_rate": 7.768727956330899e-05, + "loss": 0.42194533348083496, + "step": 52750 + }, + { + "epoch": 0.22650970694555353, + "grad_norm": 1.9699130058288574, + "learning_rate": 7.768296784319137e-05, + "loss": 0.39843249320983887, + "step": 52760 + }, + { + "epoch": 0.22655263903557352, + "grad_norm": 0.020310785621404648, + "learning_rate": 7.767865612307374e-05, + "loss": 0.18961342573165893, + "step": 52770 + }, + { + "epoch": 0.22659557112559353, + "grad_norm": 3.1800918579101562, + "learning_rate": 7.767434440295612e-05, + "loss": 0.40066027641296387, + "step": 52780 + }, + { + "epoch": 0.22663850321561355, + "grad_norm": 0.09538385272026062, + "learning_rate": 7.76700326828385e-05, + "loss": 0.13351293802261352, + "step": 52790 + }, + { + "epoch": 0.22668143530563356, + "grad_norm": 4.516862869262695, + "learning_rate": 7.766572096272086e-05, + "loss": 0.29952595233917234, + "step": 52800 + }, + { + "epoch": 0.22672436739565355, + "grad_norm": 0.4217206537723541, + "learning_rate": 7.766140924260324e-05, + "loss": 0.28686773777008057, + "step": 52810 + }, + { + "epoch": 0.22676729948567356, + "grad_norm": 0.08405343443155289, + "learning_rate": 7.765709752248562e-05, + "loss": 0.09559805393218994, + "step": 52820 + }, + { + "epoch": 0.22681023157569358, + "grad_norm": 0.01651051454246044, + "learning_rate": 7.7652785802368e-05, + "loss": 0.25448591709136964, + "step": 52830 + }, + { + "epoch": 0.22685316366571356, + "grad_norm": 0.9871359467506409, + "learning_rate": 7.764847408225037e-05, + "loss": 0.2132502317428589, + "step": 52840 + }, + { + "epoch": 0.22689609575573358, + "grad_norm": 1.333776593208313, + "learning_rate": 7.764416236213275e-05, + "loss": 0.08467223048210144, + "step": 52850 + }, + { + "epoch": 0.2269390278457536, + "grad_norm": 0.1418377310037613, + "learning_rate": 7.763985064201514e-05, + "loss": 0.2629718780517578, + "step": 52860 + }, + { + "epoch": 0.22698195993577358, + "grad_norm": 0.13721498847007751, + "learning_rate": 7.763553892189752e-05, + "loss": 0.24046893119812013, + "step": 52870 + }, + { + "epoch": 0.2270248920257936, + "grad_norm": 0.06743840873241425, + "learning_rate": 7.763122720177988e-05, + "loss": 0.37756991386413574, + "step": 52880 + }, + { + "epoch": 0.2270678241158136, + "grad_norm": 3.4788317680358887, + "learning_rate": 7.762691548166226e-05, + "loss": 0.24660749435424806, + "step": 52890 + }, + { + "epoch": 0.22711075620583362, + "grad_norm": 3.4126675128936768, + "learning_rate": 7.762260376154464e-05, + "loss": 0.26963019371032715, + "step": 52900 + }, + { + "epoch": 0.2271536882958536, + "grad_norm": 2.278189182281494, + "learning_rate": 7.761829204142701e-05, + "loss": 0.41298890113830566, + "step": 52910 + }, + { + "epoch": 0.22719662038587363, + "grad_norm": 0.010370615869760513, + "learning_rate": 7.761398032130939e-05, + "loss": 0.15059603452682496, + "step": 52920 + }, + { + "epoch": 0.22723955247589364, + "grad_norm": 3.394544839859009, + "learning_rate": 7.760966860119177e-05, + "loss": 0.273115348815918, + "step": 52930 + }, + { + "epoch": 0.22728248456591363, + "grad_norm": 1.2083812952041626, + "learning_rate": 7.760535688107414e-05, + "loss": 0.16042221784591676, + "step": 52940 + }, + { + "epoch": 0.22732541665593364, + "grad_norm": 0.2176787108182907, + "learning_rate": 7.760104516095652e-05, + "loss": 0.08922134637832642, + "step": 52950 + }, + { + "epoch": 0.22736834874595366, + "grad_norm": 1.2652984857559204, + "learning_rate": 7.759673344083889e-05, + "loss": 0.34654817581176756, + "step": 52960 + }, + { + "epoch": 0.22741128083597364, + "grad_norm": 0.0635618269443512, + "learning_rate": 7.759242172072126e-05, + "loss": 0.3084728240966797, + "step": 52970 + }, + { + "epoch": 0.22745421292599366, + "grad_norm": 1.6466708183288574, + "learning_rate": 7.758811000060364e-05, + "loss": 0.03555763363838196, + "step": 52980 + }, + { + "epoch": 0.22749714501601367, + "grad_norm": 7.08722448348999, + "learning_rate": 7.758379828048602e-05, + "loss": 0.1829333186149597, + "step": 52990 + }, + { + "epoch": 0.2275400771060337, + "grad_norm": 9.112763404846191, + "learning_rate": 7.75794865603684e-05, + "loss": 0.5256698131561279, + "step": 53000 + }, + { + "epoch": 0.2275400771060337, + "eval_loss": 0.4434048533439636, + "eval_runtime": 27.4709, + "eval_samples_per_second": 3.64, + "eval_steps_per_second": 3.64, + "step": 53000 + }, + { + "epoch": 0.22758300919605368, + "grad_norm": 1.2349168062210083, + "learning_rate": 7.757517484025077e-05, + "loss": 0.35252578258514405, + "step": 53010 + }, + { + "epoch": 0.2276259412860737, + "grad_norm": 0.010905789211392403, + "learning_rate": 7.757086312013315e-05, + "loss": 0.21037817001342773, + "step": 53020 + }, + { + "epoch": 0.2276688733760937, + "grad_norm": 1.4286820888519287, + "learning_rate": 7.756655140001553e-05, + "loss": 0.546109676361084, + "step": 53030 + }, + { + "epoch": 0.2277118054661137, + "grad_norm": 0.023641018196940422, + "learning_rate": 7.756223967989789e-05, + "loss": 0.1473099946975708, + "step": 53040 + }, + { + "epoch": 0.2277547375561337, + "grad_norm": 0.14704179763793945, + "learning_rate": 7.755792795978027e-05, + "loss": 0.15295494794845582, + "step": 53050 + }, + { + "epoch": 0.22779766964615372, + "grad_norm": 0.004135909024626017, + "learning_rate": 7.755361623966265e-05, + "loss": 0.2492366313934326, + "step": 53060 + }, + { + "epoch": 0.2278406017361737, + "grad_norm": 0.040785808116197586, + "learning_rate": 7.754930451954502e-05, + "loss": 0.32564263343811034, + "step": 53070 + }, + { + "epoch": 0.22788353382619372, + "grad_norm": 2.620953321456909, + "learning_rate": 7.754499279942741e-05, + "loss": 0.3289715051651001, + "step": 53080 + }, + { + "epoch": 0.22792646591621374, + "grad_norm": 0.7488147616386414, + "learning_rate": 7.754068107930979e-05, + "loss": 0.4312422275543213, + "step": 53090 + }, + { + "epoch": 0.22796939800623375, + "grad_norm": 4.4017510414123535, + "learning_rate": 7.753636935919217e-05, + "loss": 0.33054094314575194, + "step": 53100 + }, + { + "epoch": 0.22801233009625374, + "grad_norm": 0.023379260674118996, + "learning_rate": 7.753205763907455e-05, + "loss": 0.21203324794769288, + "step": 53110 + }, + { + "epoch": 0.22805526218627376, + "grad_norm": 3.7151591777801514, + "learning_rate": 7.752774591895692e-05, + "loss": 0.1834011673927307, + "step": 53120 + }, + { + "epoch": 0.22809819427629377, + "grad_norm": 2.511679172515869, + "learning_rate": 7.752343419883929e-05, + "loss": 0.13200852870941163, + "step": 53130 + }, + { + "epoch": 0.22814112636631376, + "grad_norm": 3.0376436710357666, + "learning_rate": 7.751912247872166e-05, + "loss": 0.3159782886505127, + "step": 53140 + }, + { + "epoch": 0.22818405845633377, + "grad_norm": 1.4104591608047485, + "learning_rate": 7.751481075860404e-05, + "loss": 0.33030409812927247, + "step": 53150 + }, + { + "epoch": 0.2282269905463538, + "grad_norm": 0.09129254519939423, + "learning_rate": 7.751049903848642e-05, + "loss": 0.0683474063873291, + "step": 53160 + }, + { + "epoch": 0.22826992263637377, + "grad_norm": 0.025976408272981644, + "learning_rate": 7.75061873183688e-05, + "loss": 0.22922194004058838, + "step": 53170 + }, + { + "epoch": 0.2283128547263938, + "grad_norm": 0.007180104032158852, + "learning_rate": 7.750187559825117e-05, + "loss": 0.2635721445083618, + "step": 53180 + }, + { + "epoch": 0.2283557868164138, + "grad_norm": 1.0476505756378174, + "learning_rate": 7.749756387813355e-05, + "loss": 0.24734883308410643, + "step": 53190 + }, + { + "epoch": 0.2283987189064338, + "grad_norm": 0.4360477328300476, + "learning_rate": 7.749325215801593e-05, + "loss": 0.37479491233825685, + "step": 53200 + }, + { + "epoch": 0.2284416509964538, + "grad_norm": 0.3107700049877167, + "learning_rate": 7.748894043789829e-05, + "loss": 0.11275169849395753, + "step": 53210 + }, + { + "epoch": 0.22848458308647382, + "grad_norm": 0.023104490712285042, + "learning_rate": 7.748462871778067e-05, + "loss": 0.3658841848373413, + "step": 53220 + }, + { + "epoch": 0.22852751517649383, + "grad_norm": 1.2826489210128784, + "learning_rate": 7.748031699766305e-05, + "loss": 0.31947362422943115, + "step": 53230 + }, + { + "epoch": 0.22857044726651382, + "grad_norm": 0.03451520577073097, + "learning_rate": 7.747600527754542e-05, + "loss": 0.10847625732421876, + "step": 53240 + }, + { + "epoch": 0.22861337935653384, + "grad_norm": 5.027333736419678, + "learning_rate": 7.74716935574278e-05, + "loss": 0.328680682182312, + "step": 53250 + }, + { + "epoch": 0.22865631144655385, + "grad_norm": 0.01842356100678444, + "learning_rate": 7.746738183731018e-05, + "loss": 0.1819998264312744, + "step": 53260 + }, + { + "epoch": 0.22869924353657384, + "grad_norm": 0.000408778665587306, + "learning_rate": 7.746307011719256e-05, + "loss": 0.2438007116317749, + "step": 53270 + }, + { + "epoch": 0.22874217562659385, + "grad_norm": 0.10073164105415344, + "learning_rate": 7.745875839707493e-05, + "loss": 0.2796761989593506, + "step": 53280 + }, + { + "epoch": 0.22878510771661387, + "grad_norm": 0.0005582142039202154, + "learning_rate": 7.74544466769573e-05, + "loss": 0.12742369174957274, + "step": 53290 + }, + { + "epoch": 0.22882803980663385, + "grad_norm": 3.7237370014190674, + "learning_rate": 7.745013495683969e-05, + "loss": 0.3490560293197632, + "step": 53300 + }, + { + "epoch": 0.22887097189665387, + "grad_norm": 0.01251291949301958, + "learning_rate": 7.744582323672207e-05, + "loss": 0.13892886638641358, + "step": 53310 + }, + { + "epoch": 0.22891390398667388, + "grad_norm": 0.03795626387000084, + "learning_rate": 7.744151151660444e-05, + "loss": 0.22728416919708253, + "step": 53320 + }, + { + "epoch": 0.2289568360766939, + "grad_norm": 2.3000359535217285, + "learning_rate": 7.743719979648682e-05, + "loss": 0.3141765117645264, + "step": 53330 + }, + { + "epoch": 0.22899976816671389, + "grad_norm": 0.028027264401316643, + "learning_rate": 7.74328880763692e-05, + "loss": 0.12524042129516602, + "step": 53340 + }, + { + "epoch": 0.2290427002567339, + "grad_norm": 0.43016451597213745, + "learning_rate": 7.742857635625158e-05, + "loss": 0.3535402536392212, + "step": 53350 + }, + { + "epoch": 0.22908563234675391, + "grad_norm": 0.16803216934204102, + "learning_rate": 7.742426463613395e-05, + "loss": 0.14710566997528077, + "step": 53360 + }, + { + "epoch": 0.2291285644367739, + "grad_norm": 0.01865394227206707, + "learning_rate": 7.741995291601632e-05, + "loss": 0.19491530656814576, + "step": 53370 + }, + { + "epoch": 0.22917149652679392, + "grad_norm": 2.5932819843292236, + "learning_rate": 7.74156411958987e-05, + "loss": 0.2611079692840576, + "step": 53380 + }, + { + "epoch": 0.22921442861681393, + "grad_norm": 6.514557838439941, + "learning_rate": 7.741132947578107e-05, + "loss": 0.26472296714782717, + "step": 53390 + }, + { + "epoch": 0.22925736070683392, + "grad_norm": 0.017307903617620468, + "learning_rate": 7.740701775566345e-05, + "loss": 0.27357475757598876, + "step": 53400 + }, + { + "epoch": 0.22930029279685393, + "grad_norm": 0.8172531723976135, + "learning_rate": 7.740270603554583e-05, + "loss": 0.2582735300064087, + "step": 53410 + }, + { + "epoch": 0.22934322488687395, + "grad_norm": 0.009025703184306622, + "learning_rate": 7.73983943154282e-05, + "loss": 0.3182665824890137, + "step": 53420 + }, + { + "epoch": 0.22938615697689396, + "grad_norm": 0.002536490559577942, + "learning_rate": 7.739408259531058e-05, + "loss": 0.42736306190490725, + "step": 53430 + }, + { + "epoch": 0.22942908906691395, + "grad_norm": 0.06554874032735825, + "learning_rate": 7.738977087519296e-05, + "loss": 0.4487005710601807, + "step": 53440 + }, + { + "epoch": 0.22947202115693396, + "grad_norm": 1.08189058303833, + "learning_rate": 7.738545915507533e-05, + "loss": 0.3008427143096924, + "step": 53450 + }, + { + "epoch": 0.22951495324695398, + "grad_norm": 0.0046480633318424225, + "learning_rate": 7.73811474349577e-05, + "loss": 0.23597433567047119, + "step": 53460 + }, + { + "epoch": 0.22955788533697397, + "grad_norm": 0.08075997978448868, + "learning_rate": 7.737683571484008e-05, + "loss": 0.15743098258972169, + "step": 53470 + }, + { + "epoch": 0.22960081742699398, + "grad_norm": 0.0488070547580719, + "learning_rate": 7.737252399472245e-05, + "loss": 0.3878929138183594, + "step": 53480 + }, + { + "epoch": 0.229643749517014, + "grad_norm": 3.296687126159668, + "learning_rate": 7.736821227460483e-05, + "loss": 0.3832553863525391, + "step": 53490 + }, + { + "epoch": 0.22968668160703398, + "grad_norm": 0.013115516863763332, + "learning_rate": 7.736390055448721e-05, + "loss": 0.3670145750045776, + "step": 53500 + }, + { + "epoch": 0.229729613697054, + "grad_norm": 0.26563796401023865, + "learning_rate": 7.735958883436959e-05, + "loss": 0.20897796154022216, + "step": 53510 + }, + { + "epoch": 0.229772545787074, + "grad_norm": 0.6128681302070618, + "learning_rate": 7.735527711425196e-05, + "loss": 0.17986410856246948, + "step": 53520 + }, + { + "epoch": 0.22981547787709403, + "grad_norm": 4.410193920135498, + "learning_rate": 7.735096539413434e-05, + "loss": 0.282076096534729, + "step": 53530 + }, + { + "epoch": 0.229858409967114, + "grad_norm": 0.06367503851652145, + "learning_rate": 7.734665367401672e-05, + "loss": 0.29574127197265626, + "step": 53540 + }, + { + "epoch": 0.22990134205713403, + "grad_norm": 0.0185005571693182, + "learning_rate": 7.73423419538991e-05, + "loss": 0.34318268299102783, + "step": 53550 + }, + { + "epoch": 0.22994427414715404, + "grad_norm": 1.0150419473648071, + "learning_rate": 7.733803023378147e-05, + "loss": 0.301274037361145, + "step": 53560 + }, + { + "epoch": 0.22998720623717403, + "grad_norm": 0.0939207673072815, + "learning_rate": 7.733371851366385e-05, + "loss": 0.1255855679512024, + "step": 53570 + }, + { + "epoch": 0.23003013832719404, + "grad_norm": 0.126878023147583, + "learning_rate": 7.732940679354623e-05, + "loss": 0.11698832511901855, + "step": 53580 + }, + { + "epoch": 0.23007307041721406, + "grad_norm": 0.0007859831675887108, + "learning_rate": 7.73250950734286e-05, + "loss": 0.12006430625915528, + "step": 53590 + }, + { + "epoch": 0.23011600250723405, + "grad_norm": 1.9789891242980957, + "learning_rate": 7.732078335331098e-05, + "loss": 0.1413771390914917, + "step": 53600 + }, + { + "epoch": 0.23015893459725406, + "grad_norm": 0.010845298878848553, + "learning_rate": 7.731647163319336e-05, + "loss": 0.07959707379341126, + "step": 53610 + }, + { + "epoch": 0.23020186668727408, + "grad_norm": 0.02713046595454216, + "learning_rate": 7.731215991307572e-05, + "loss": 0.11618701219558716, + "step": 53620 + }, + { + "epoch": 0.23024479877729406, + "grad_norm": 1.4147077798843384, + "learning_rate": 7.73078481929581e-05, + "loss": 0.45875911712646483, + "step": 53630 + }, + { + "epoch": 0.23028773086731408, + "grad_norm": 0.2035217583179474, + "learning_rate": 7.730353647284048e-05, + "loss": 0.29228217601776124, + "step": 53640 + }, + { + "epoch": 0.2303306629573341, + "grad_norm": 2.1338682174682617, + "learning_rate": 7.729922475272285e-05, + "loss": 0.31885855197906493, + "step": 53650 + }, + { + "epoch": 0.2303735950473541, + "grad_norm": 0.08031083643436432, + "learning_rate": 7.729491303260523e-05, + "loss": 0.1363338828086853, + "step": 53660 + }, + { + "epoch": 0.2304165271373741, + "grad_norm": 0.00543932942673564, + "learning_rate": 7.729060131248761e-05, + "loss": 0.2697357177734375, + "step": 53670 + }, + { + "epoch": 0.2304594592273941, + "grad_norm": 0.0035474589094519615, + "learning_rate": 7.728628959236999e-05, + "loss": 0.12337535619735718, + "step": 53680 + }, + { + "epoch": 0.23050239131741412, + "grad_norm": 0.11553837358951569, + "learning_rate": 7.728197787225236e-05, + "loss": 0.40886611938476564, + "step": 53690 + }, + { + "epoch": 0.2305453234074341, + "grad_norm": 1.4451885223388672, + "learning_rate": 7.727766615213473e-05, + "loss": 0.3630185127258301, + "step": 53700 + }, + { + "epoch": 0.23058825549745413, + "grad_norm": 0.3057561218738556, + "learning_rate": 7.72733544320171e-05, + "loss": 0.12788234949111937, + "step": 53710 + }, + { + "epoch": 0.23063118758747414, + "grad_norm": 2.0138747692108154, + "learning_rate": 7.726904271189948e-05, + "loss": 0.2607387065887451, + "step": 53720 + }, + { + "epoch": 0.23067411967749413, + "grad_norm": 1.4901790618896484, + "learning_rate": 7.726473099178186e-05, + "loss": 0.5151829242706298, + "step": 53730 + }, + { + "epoch": 0.23071705176751414, + "grad_norm": 0.6877745389938354, + "learning_rate": 7.726041927166424e-05, + "loss": 0.44553771018981936, + "step": 53740 + }, + { + "epoch": 0.23075998385753416, + "grad_norm": 0.06983120739459991, + "learning_rate": 7.725610755154661e-05, + "loss": 0.2058812141418457, + "step": 53750 + }, + { + "epoch": 0.23080291594755417, + "grad_norm": 0.10931398719549179, + "learning_rate": 7.725179583142899e-05, + "loss": 0.11302660703659058, + "step": 53760 + }, + { + "epoch": 0.23084584803757416, + "grad_norm": 0.3568594753742218, + "learning_rate": 7.724748411131137e-05, + "loss": 0.09234002232551575, + "step": 53770 + }, + { + "epoch": 0.23088878012759417, + "grad_norm": 0.11842846125364304, + "learning_rate": 7.724317239119375e-05, + "loss": 0.1256554365158081, + "step": 53780 + }, + { + "epoch": 0.2309317122176142, + "grad_norm": 0.07234813272953033, + "learning_rate": 7.723886067107612e-05, + "loss": 0.049526515603065493, + "step": 53790 + }, + { + "epoch": 0.23097464430763417, + "grad_norm": 2.2963318824768066, + "learning_rate": 7.72345489509585e-05, + "loss": 0.20251133441925048, + "step": 53800 + }, + { + "epoch": 0.2310175763976542, + "grad_norm": 0.0010859980247914791, + "learning_rate": 7.723023723084088e-05, + "loss": 0.1640162944793701, + "step": 53810 + }, + { + "epoch": 0.2310605084876742, + "grad_norm": 0.024189893156290054, + "learning_rate": 7.722592551072326e-05, + "loss": 0.07351300120353699, + "step": 53820 + }, + { + "epoch": 0.2311034405776942, + "grad_norm": 0.46556511521339417, + "learning_rate": 7.722161379060563e-05, + "loss": 0.20398468971252443, + "step": 53830 + }, + { + "epoch": 0.2311463726677142, + "grad_norm": 4.37669563293457, + "learning_rate": 7.721730207048801e-05, + "loss": 0.23910939693450928, + "step": 53840 + }, + { + "epoch": 0.23118930475773422, + "grad_norm": 1.5534576177597046, + "learning_rate": 7.721299035037039e-05, + "loss": 0.21463370323181152, + "step": 53850 + }, + { + "epoch": 0.23123223684775424, + "grad_norm": 2.1490960121154785, + "learning_rate": 7.720867863025277e-05, + "loss": 0.3465791940689087, + "step": 53860 + }, + { + "epoch": 0.23127516893777422, + "grad_norm": 23.90878677368164, + "learning_rate": 7.720436691013513e-05, + "loss": 0.5125598430633544, + "step": 53870 + }, + { + "epoch": 0.23131810102779424, + "grad_norm": 0.027876272797584534, + "learning_rate": 7.72000551900175e-05, + "loss": 0.06181851625442505, + "step": 53880 + }, + { + "epoch": 0.23136103311781425, + "grad_norm": 4.011077880859375, + "learning_rate": 7.719574346989988e-05, + "loss": 0.18830808401107788, + "step": 53890 + }, + { + "epoch": 0.23140396520783424, + "grad_norm": 5.805874824523926, + "learning_rate": 7.719143174978226e-05, + "loss": 0.09969301223754883, + "step": 53900 + }, + { + "epoch": 0.23144689729785425, + "grad_norm": 0.0011275582946836948, + "learning_rate": 7.718712002966464e-05, + "loss": 0.13450464010238647, + "step": 53910 + }, + { + "epoch": 0.23148982938787427, + "grad_norm": 1.8485000133514404, + "learning_rate": 7.718280830954702e-05, + "loss": 0.25277459621429443, + "step": 53920 + }, + { + "epoch": 0.23153276147789426, + "grad_norm": 2.114288091659546, + "learning_rate": 7.717849658942939e-05, + "loss": 0.07729903459548951, + "step": 53930 + }, + { + "epoch": 0.23157569356791427, + "grad_norm": 0.2792128324508667, + "learning_rate": 7.717418486931177e-05, + "loss": 0.23469130992889403, + "step": 53940 + }, + { + "epoch": 0.23161862565793429, + "grad_norm": 2.9970715045928955, + "learning_rate": 7.716987314919413e-05, + "loss": 0.3448359727859497, + "step": 53950 + }, + { + "epoch": 0.2316615577479543, + "grad_norm": 2.2078139781951904, + "learning_rate": 7.716556142907651e-05, + "loss": 0.40146422386169434, + "step": 53960 + }, + { + "epoch": 0.2317044898379743, + "grad_norm": 1.174996256828308, + "learning_rate": 7.716124970895889e-05, + "loss": 0.24278640747070312, + "step": 53970 + }, + { + "epoch": 0.2317474219279943, + "grad_norm": 1.443159580230713, + "learning_rate": 7.715693798884127e-05, + "loss": 0.1192315697669983, + "step": 53980 + }, + { + "epoch": 0.23179035401801432, + "grad_norm": 0.5862539410591125, + "learning_rate": 7.715262626872364e-05, + "loss": 0.2521751642227173, + "step": 53990 + }, + { + "epoch": 0.2318332861080343, + "grad_norm": 0.0605054534971714, + "learning_rate": 7.714831454860602e-05, + "loss": 0.17406871318817138, + "step": 54000 + }, + { + "epoch": 0.2318332861080343, + "eval_loss": 0.4490349292755127, + "eval_runtime": 27.4617, + "eval_samples_per_second": 3.641, + "eval_steps_per_second": 3.641, + "step": 54000 + }, + { + "epoch": 0.23187621819805432, + "grad_norm": 1.6182562112808228, + "learning_rate": 7.71440028284884e-05, + "loss": 0.3006648778915405, + "step": 54010 + }, + { + "epoch": 0.23191915028807433, + "grad_norm": 0.0011926308507099748, + "learning_rate": 7.713969110837078e-05, + "loss": 0.2160343647003174, + "step": 54020 + }, + { + "epoch": 0.23196208237809432, + "grad_norm": 0.0024554634001106024, + "learning_rate": 7.713537938825315e-05, + "loss": 0.07509487271308898, + "step": 54030 + }, + { + "epoch": 0.23200501446811433, + "grad_norm": 0.013232232071459293, + "learning_rate": 7.713106766813553e-05, + "loss": 0.2723365068435669, + "step": 54040 + }, + { + "epoch": 0.23204794655813435, + "grad_norm": 0.032803673297166824, + "learning_rate": 7.712675594801791e-05, + "loss": 0.129007089138031, + "step": 54050 + }, + { + "epoch": 0.23209087864815434, + "grad_norm": 1.168349266052246, + "learning_rate": 7.712244422790029e-05, + "loss": 0.16661056280136108, + "step": 54060 + }, + { + "epoch": 0.23213381073817435, + "grad_norm": 5.672686576843262, + "learning_rate": 7.711813250778266e-05, + "loss": 0.17700178623199464, + "step": 54070 + }, + { + "epoch": 0.23217674282819437, + "grad_norm": 0.29278627038002014, + "learning_rate": 7.711382078766504e-05, + "loss": 0.23657283782958985, + "step": 54080 + }, + { + "epoch": 0.23221967491821438, + "grad_norm": 2.1834146976470947, + "learning_rate": 7.710950906754742e-05, + "loss": 0.39358129501342776, + "step": 54090 + }, + { + "epoch": 0.23226260700823437, + "grad_norm": 0.009153024293482304, + "learning_rate": 7.71051973474298e-05, + "loss": 0.002811253070831299, + "step": 54100 + }, + { + "epoch": 0.23230553909825438, + "grad_norm": 1.72090482711792, + "learning_rate": 7.710088562731216e-05, + "loss": 0.3168710470199585, + "step": 54110 + }, + { + "epoch": 0.2323484711882744, + "grad_norm": 3.161130666732788, + "learning_rate": 7.709657390719454e-05, + "loss": 0.26244750022888186, + "step": 54120 + }, + { + "epoch": 0.23239140327829438, + "grad_norm": 0.01198434829711914, + "learning_rate": 7.709226218707691e-05, + "loss": 0.2590029239654541, + "step": 54130 + }, + { + "epoch": 0.2324343353683144, + "grad_norm": 0.01950439251959324, + "learning_rate": 7.708795046695929e-05, + "loss": 0.2357182741165161, + "step": 54140 + }, + { + "epoch": 0.2324772674583344, + "grad_norm": 15.0639066696167, + "learning_rate": 7.708363874684167e-05, + "loss": 0.14678561687469482, + "step": 54150 + }, + { + "epoch": 0.2325201995483544, + "grad_norm": 0.0018443474546074867, + "learning_rate": 7.707932702672404e-05, + "loss": 0.22300183773040771, + "step": 54160 + }, + { + "epoch": 0.23256313163837442, + "grad_norm": 2.399331569671631, + "learning_rate": 7.707501530660642e-05, + "loss": 0.2935328483581543, + "step": 54170 + }, + { + "epoch": 0.23260606372839443, + "grad_norm": 0.046965453773736954, + "learning_rate": 7.70707035864888e-05, + "loss": 0.2203977346420288, + "step": 54180 + }, + { + "epoch": 0.23264899581841444, + "grad_norm": 0.029289549216628075, + "learning_rate": 7.706639186637118e-05, + "loss": 0.09207029342651367, + "step": 54190 + }, + { + "epoch": 0.23269192790843443, + "grad_norm": 0.40686535835266113, + "learning_rate": 7.706208014625354e-05, + "loss": 0.16840204000473022, + "step": 54200 + }, + { + "epoch": 0.23273485999845445, + "grad_norm": 46.78797912597656, + "learning_rate": 7.705776842613592e-05, + "loss": 0.3027779579162598, + "step": 54210 + }, + { + "epoch": 0.23277779208847446, + "grad_norm": 0.009091447107493877, + "learning_rate": 7.70534567060183e-05, + "loss": 0.19085679054260254, + "step": 54220 + }, + { + "epoch": 0.23282072417849445, + "grad_norm": 0.13714081048965454, + "learning_rate": 7.704914498590067e-05, + "loss": 0.24470322132110595, + "step": 54230 + }, + { + "epoch": 0.23286365626851446, + "grad_norm": 0.0014627090422436595, + "learning_rate": 7.704483326578305e-05, + "loss": 0.2286834239959717, + "step": 54240 + }, + { + "epoch": 0.23290658835853448, + "grad_norm": 0.0014132530195638537, + "learning_rate": 7.704052154566543e-05, + "loss": 0.1336017966270447, + "step": 54250 + }, + { + "epoch": 0.23294952044855446, + "grad_norm": 1.9230753183364868, + "learning_rate": 7.70362098255478e-05, + "loss": 0.2634952306747437, + "step": 54260 + }, + { + "epoch": 0.23299245253857448, + "grad_norm": 0.004964998923242092, + "learning_rate": 7.70318981054302e-05, + "loss": 0.2046299934387207, + "step": 54270 + }, + { + "epoch": 0.2330353846285945, + "grad_norm": 2.2121899127960205, + "learning_rate": 7.702758638531256e-05, + "loss": 0.2678264379501343, + "step": 54280 + }, + { + "epoch": 0.2330783167186145, + "grad_norm": 0.22293893992900848, + "learning_rate": 7.702327466519494e-05, + "loss": 0.1930667996406555, + "step": 54290 + }, + { + "epoch": 0.2331212488086345, + "grad_norm": 0.5882869958877563, + "learning_rate": 7.701896294507731e-05, + "loss": 0.2323695182800293, + "step": 54300 + }, + { + "epoch": 0.2331641808986545, + "grad_norm": 8.391152381896973, + "learning_rate": 7.701465122495969e-05, + "loss": 0.19190495014190673, + "step": 54310 + }, + { + "epoch": 0.23320711298867453, + "grad_norm": 2.274019956588745, + "learning_rate": 7.701033950484207e-05, + "loss": 0.3386978626251221, + "step": 54320 + }, + { + "epoch": 0.2332500450786945, + "grad_norm": 0.2136969417333603, + "learning_rate": 7.700602778472445e-05, + "loss": 0.08582958579063416, + "step": 54330 + }, + { + "epoch": 0.23329297716871453, + "grad_norm": 0.07544512301683426, + "learning_rate": 7.700171606460682e-05, + "loss": 0.2182629346847534, + "step": 54340 + }, + { + "epoch": 0.23333590925873454, + "grad_norm": 0.00047073987661860883, + "learning_rate": 7.69974043444892e-05, + "loss": 0.15393882989883423, + "step": 54350 + }, + { + "epoch": 0.23337884134875453, + "grad_norm": 0.22471746802330017, + "learning_rate": 7.699309262437156e-05, + "loss": 0.33650147914886475, + "step": 54360 + }, + { + "epoch": 0.23342177343877454, + "grad_norm": 0.00527210533618927, + "learning_rate": 7.698878090425394e-05, + "loss": 0.09793091416358948, + "step": 54370 + }, + { + "epoch": 0.23346470552879456, + "grad_norm": 0.011522647924721241, + "learning_rate": 7.698446918413632e-05, + "loss": 0.21272644996643067, + "step": 54380 + }, + { + "epoch": 0.23350763761881455, + "grad_norm": 1.7179675102233887, + "learning_rate": 7.69801574640187e-05, + "loss": 0.3838799238204956, + "step": 54390 + }, + { + "epoch": 0.23355056970883456, + "grad_norm": 1.6328366994857788, + "learning_rate": 7.697584574390107e-05, + "loss": 0.1944947361946106, + "step": 54400 + }, + { + "epoch": 0.23359350179885457, + "grad_norm": 0.007709296885877848, + "learning_rate": 7.697153402378345e-05, + "loss": 0.06385926604270935, + "step": 54410 + }, + { + "epoch": 0.2336364338888746, + "grad_norm": 1.1928542852401733, + "learning_rate": 7.696722230366583e-05, + "loss": 0.24402003288269042, + "step": 54420 + }, + { + "epoch": 0.23367936597889458, + "grad_norm": 7.334529876708984, + "learning_rate": 7.69629105835482e-05, + "loss": 0.31890459060668946, + "step": 54430 + }, + { + "epoch": 0.2337222980689146, + "grad_norm": 0.03360019251704216, + "learning_rate": 7.695859886343057e-05, + "loss": 0.11006944179534912, + "step": 54440 + }, + { + "epoch": 0.2337652301589346, + "grad_norm": 0.17549341917037964, + "learning_rate": 7.695428714331295e-05, + "loss": 0.4170397758483887, + "step": 54450 + }, + { + "epoch": 0.2338081622489546, + "grad_norm": 12.329401969909668, + "learning_rate": 7.694997542319532e-05, + "loss": 0.32720706462860105, + "step": 54460 + }, + { + "epoch": 0.2338510943389746, + "grad_norm": 1.2573758363723755, + "learning_rate": 7.69456637030777e-05, + "loss": 0.22783832550048827, + "step": 54470 + }, + { + "epoch": 0.23389402642899462, + "grad_norm": 0.006898627616465092, + "learning_rate": 7.694135198296008e-05, + "loss": 0.30946395397186277, + "step": 54480 + }, + { + "epoch": 0.2339369585190146, + "grad_norm": 0.024012990295886993, + "learning_rate": 7.693704026284247e-05, + "loss": 0.1696092128753662, + "step": 54490 + }, + { + "epoch": 0.23397989060903462, + "grad_norm": 1.0321308374404907, + "learning_rate": 7.693272854272485e-05, + "loss": 0.2145857572555542, + "step": 54500 + }, + { + "epoch": 0.23402282269905464, + "grad_norm": 2.9807746410369873, + "learning_rate": 7.692841682260722e-05, + "loss": 0.43792023658752444, + "step": 54510 + }, + { + "epoch": 0.23406575478907465, + "grad_norm": 8.073158264160156, + "learning_rate": 7.692410510248959e-05, + "loss": 0.19019222259521484, + "step": 54520 + }, + { + "epoch": 0.23410868687909464, + "grad_norm": 0.033755868673324585, + "learning_rate": 7.691979338237197e-05, + "loss": 0.14710177183151246, + "step": 54530 + }, + { + "epoch": 0.23415161896911466, + "grad_norm": 0.09791547805070877, + "learning_rate": 7.691548166225434e-05, + "loss": 0.31218585968017576, + "step": 54540 + }, + { + "epoch": 0.23419455105913467, + "grad_norm": 0.0015638612676411867, + "learning_rate": 7.691116994213672e-05, + "loss": 0.17328212261199952, + "step": 54550 + }, + { + "epoch": 0.23423748314915466, + "grad_norm": 0.015360639430582523, + "learning_rate": 7.69068582220191e-05, + "loss": 0.13364744186401367, + "step": 54560 + }, + { + "epoch": 0.23428041523917467, + "grad_norm": 0.06556349992752075, + "learning_rate": 7.690254650190148e-05, + "loss": 0.33662867546081543, + "step": 54570 + }, + { + "epoch": 0.2343233473291947, + "grad_norm": 0.049693044275045395, + "learning_rate": 7.689823478178385e-05, + "loss": 0.09475327134132386, + "step": 54580 + }, + { + "epoch": 0.23436627941921467, + "grad_norm": 0.03847317025065422, + "learning_rate": 7.689392306166623e-05, + "loss": 0.06982783675193786, + "step": 54590 + }, + { + "epoch": 0.2344092115092347, + "grad_norm": 0.09052468091249466, + "learning_rate": 7.688961134154861e-05, + "loss": 0.34469263553619384, + "step": 54600 + }, + { + "epoch": 0.2344521435992547, + "grad_norm": 2.4752421379089355, + "learning_rate": 7.688529962143097e-05, + "loss": 0.1300334930419922, + "step": 54610 + }, + { + "epoch": 0.23449507568927472, + "grad_norm": 0.7740895748138428, + "learning_rate": 7.688098790131335e-05, + "loss": 0.3067962646484375, + "step": 54620 + }, + { + "epoch": 0.2345380077792947, + "grad_norm": 0.32579708099365234, + "learning_rate": 7.687667618119573e-05, + "loss": 0.3612945795059204, + "step": 54630 + }, + { + "epoch": 0.23458093986931472, + "grad_norm": 0.7105182409286499, + "learning_rate": 7.68723644610781e-05, + "loss": 0.2577735662460327, + "step": 54640 + }, + { + "epoch": 0.23462387195933473, + "grad_norm": 0.016220765188336372, + "learning_rate": 7.686805274096048e-05, + "loss": 0.3978657007217407, + "step": 54650 + }, + { + "epoch": 0.23466680404935472, + "grad_norm": 0.26025742292404175, + "learning_rate": 7.686374102084286e-05, + "loss": 0.10141566991806031, + "step": 54660 + }, + { + "epoch": 0.23470973613937474, + "grad_norm": 1.8236098289489746, + "learning_rate": 7.685942930072524e-05, + "loss": 0.23605387210845946, + "step": 54670 + }, + { + "epoch": 0.23475266822939475, + "grad_norm": 0.005074288230389357, + "learning_rate": 7.685511758060761e-05, + "loss": 0.19536244869232178, + "step": 54680 + }, + { + "epoch": 0.23479560031941474, + "grad_norm": 0.013264862820506096, + "learning_rate": 7.685080586048998e-05, + "loss": 0.14476516246795654, + "step": 54690 + }, + { + "epoch": 0.23483853240943475, + "grad_norm": 1.0898460149765015, + "learning_rate": 7.684649414037235e-05, + "loss": 0.5413854598999024, + "step": 54700 + }, + { + "epoch": 0.23488146449945477, + "grad_norm": 2.291823625564575, + "learning_rate": 7.684218242025474e-05, + "loss": 0.2193145990371704, + "step": 54710 + }, + { + "epoch": 0.23492439658947478, + "grad_norm": 0.01983988657593727, + "learning_rate": 7.683787070013712e-05, + "loss": 0.0784152865409851, + "step": 54720 + }, + { + "epoch": 0.23496732867949477, + "grad_norm": 3.9424784183502197, + "learning_rate": 7.68335589800195e-05, + "loss": 0.2317206859588623, + "step": 54730 + }, + { + "epoch": 0.23501026076951478, + "grad_norm": 0.19252511858940125, + "learning_rate": 7.682924725990188e-05, + "loss": 0.2624601125717163, + "step": 54740 + }, + { + "epoch": 0.2350531928595348, + "grad_norm": 0.22246962785720825, + "learning_rate": 7.682493553978425e-05, + "loss": 0.3319044351577759, + "step": 54750 + }, + { + "epoch": 0.23509612494955479, + "grad_norm": 0.001225695596076548, + "learning_rate": 7.682062381966663e-05, + "loss": 0.118217933177948, + "step": 54760 + }, + { + "epoch": 0.2351390570395748, + "grad_norm": 7.75153923034668, + "learning_rate": 7.6816312099549e-05, + "loss": 0.218511700630188, + "step": 54770 + }, + { + "epoch": 0.23518198912959482, + "grad_norm": 3.493736743927002, + "learning_rate": 7.681200037943137e-05, + "loss": 0.4513263702392578, + "step": 54780 + }, + { + "epoch": 0.2352249212196148, + "grad_norm": 1.2474606037139893, + "learning_rate": 7.680768865931375e-05, + "loss": 0.3103821277618408, + "step": 54790 + }, + { + "epoch": 0.23526785330963482, + "grad_norm": 0.39426982402801514, + "learning_rate": 7.680337693919613e-05, + "loss": 0.04423748850822449, + "step": 54800 + }, + { + "epoch": 0.23531078539965483, + "grad_norm": 0.11555972695350647, + "learning_rate": 7.67990652190785e-05, + "loss": 0.12222168445587159, + "step": 54810 + }, + { + "epoch": 0.23535371748967482, + "grad_norm": 0.9844247698783875, + "learning_rate": 7.679475349896088e-05, + "loss": 0.2519564151763916, + "step": 54820 + }, + { + "epoch": 0.23539664957969483, + "grad_norm": 0.5366085171699524, + "learning_rate": 7.679044177884326e-05, + "loss": 0.29870994091033937, + "step": 54830 + }, + { + "epoch": 0.23543958166971485, + "grad_norm": 0.006447316147387028, + "learning_rate": 7.678613005872564e-05, + "loss": 0.2809410810470581, + "step": 54840 + }, + { + "epoch": 0.23548251375973486, + "grad_norm": 0.05402093753218651, + "learning_rate": 7.6781818338608e-05, + "loss": 0.24879958629608154, + "step": 54850 + }, + { + "epoch": 0.23552544584975485, + "grad_norm": 0.024269670248031616, + "learning_rate": 7.677750661849038e-05, + "loss": 0.16608129739761351, + "step": 54860 + }, + { + "epoch": 0.23556837793977486, + "grad_norm": 0.48942995071411133, + "learning_rate": 7.677319489837275e-05, + "loss": 0.12601350545883178, + "step": 54870 + }, + { + "epoch": 0.23561131002979488, + "grad_norm": 0.03332279622554779, + "learning_rate": 7.676888317825513e-05, + "loss": 0.1936497211456299, + "step": 54880 + }, + { + "epoch": 0.23565424211981487, + "grad_norm": 0.03991253674030304, + "learning_rate": 7.676457145813751e-05, + "loss": 0.3474759101867676, + "step": 54890 + }, + { + "epoch": 0.23569717420983488, + "grad_norm": 4.706600666046143, + "learning_rate": 7.676025973801989e-05, + "loss": 0.29954354763031005, + "step": 54900 + }, + { + "epoch": 0.2357401062998549, + "grad_norm": 1.776058316230774, + "learning_rate": 7.675594801790226e-05, + "loss": 0.20013415813446045, + "step": 54910 + }, + { + "epoch": 0.23578303838987488, + "grad_norm": 1.0966745615005493, + "learning_rate": 7.675163629778464e-05, + "loss": 0.454547119140625, + "step": 54920 + }, + { + "epoch": 0.2358259704798949, + "grad_norm": 0.009377327747642994, + "learning_rate": 7.674732457766702e-05, + "loss": 0.08590722680091858, + "step": 54930 + }, + { + "epoch": 0.2358689025699149, + "grad_norm": 0.002148397732526064, + "learning_rate": 7.67430128575494e-05, + "loss": 0.45615296363830565, + "step": 54940 + }, + { + "epoch": 0.23591183465993493, + "grad_norm": 4.385854721069336, + "learning_rate": 7.673870113743177e-05, + "loss": 0.20920178890228272, + "step": 54950 + }, + { + "epoch": 0.23595476674995491, + "grad_norm": 0.004134920425713062, + "learning_rate": 7.673438941731415e-05, + "loss": 0.21172521114349366, + "step": 54960 + }, + { + "epoch": 0.23599769883997493, + "grad_norm": 10.730145454406738, + "learning_rate": 7.673007769719653e-05, + "loss": 0.44742717742919924, + "step": 54970 + }, + { + "epoch": 0.23604063092999494, + "grad_norm": 1.315672755241394, + "learning_rate": 7.67257659770789e-05, + "loss": 0.2971693277359009, + "step": 54980 + }, + { + "epoch": 0.23608356302001493, + "grad_norm": 0.08187035471200943, + "learning_rate": 7.672145425696128e-05, + "loss": 0.1317402720451355, + "step": 54990 + }, + { + "epoch": 0.23612649511003495, + "grad_norm": 0.010946123860776424, + "learning_rate": 7.671714253684366e-05, + "loss": 0.2432734489440918, + "step": 55000 + }, + { + "epoch": 0.23612649511003495, + "eval_loss": 0.4557407796382904, + "eval_runtime": 27.4059, + "eval_samples_per_second": 3.649, + "eval_steps_per_second": 3.649, + "step": 55000 + }, + { + "epoch": 0.23616942720005496, + "grad_norm": 1.7199127674102783, + "learning_rate": 7.671283081672604e-05, + "loss": 0.33434352874755857, + "step": 55010 + }, + { + "epoch": 0.23621235929007495, + "grad_norm": 0.16656917333602905, + "learning_rate": 7.67085190966084e-05, + "loss": 0.23526394367218018, + "step": 55020 + }, + { + "epoch": 0.23625529138009496, + "grad_norm": 1.2178751230239868, + "learning_rate": 7.670420737649078e-05, + "loss": 0.3808588027954102, + "step": 55030 + }, + { + "epoch": 0.23629822347011498, + "grad_norm": 0.9661920666694641, + "learning_rate": 7.669989565637316e-05, + "loss": 0.19774439334869384, + "step": 55040 + }, + { + "epoch": 0.236341155560135, + "grad_norm": 7.985832214355469, + "learning_rate": 7.669558393625553e-05, + "loss": 0.3549797058105469, + "step": 55050 + }, + { + "epoch": 0.23638408765015498, + "grad_norm": 0.17887651920318604, + "learning_rate": 7.669127221613791e-05, + "loss": 0.06525392532348633, + "step": 55060 + }, + { + "epoch": 0.236427019740175, + "grad_norm": 0.2913016676902771, + "learning_rate": 7.668696049602029e-05, + "loss": 0.2070707082748413, + "step": 55070 + }, + { + "epoch": 0.236469951830195, + "grad_norm": 0.9555326700210571, + "learning_rate": 7.668264877590267e-05, + "loss": 0.09528316259384155, + "step": 55080 + }, + { + "epoch": 0.236512883920215, + "grad_norm": 2.1643640995025635, + "learning_rate": 7.667833705578504e-05, + "loss": 0.16205003261566162, + "step": 55090 + }, + { + "epoch": 0.236555816010235, + "grad_norm": 0.15171417593955994, + "learning_rate": 7.66740253356674e-05, + "loss": 0.2057335615158081, + "step": 55100 + }, + { + "epoch": 0.23659874810025502, + "grad_norm": 1.3308279514312744, + "learning_rate": 7.666971361554978e-05, + "loss": 0.24363973140716552, + "step": 55110 + }, + { + "epoch": 0.236641680190275, + "grad_norm": 2.389375925064087, + "learning_rate": 7.666540189543216e-05, + "loss": 0.4016840934753418, + "step": 55120 + }, + { + "epoch": 0.23668461228029503, + "grad_norm": 0.001500018173828721, + "learning_rate": 7.666109017531454e-05, + "loss": 0.13236029148101808, + "step": 55130 + }, + { + "epoch": 0.23672754437031504, + "grad_norm": 1.7535425424575806, + "learning_rate": 7.665677845519692e-05, + "loss": 0.2635908603668213, + "step": 55140 + }, + { + "epoch": 0.23677047646033506, + "grad_norm": 0.0043300143443048, + "learning_rate": 7.66524667350793e-05, + "loss": 0.3937635660171509, + "step": 55150 + }, + { + "epoch": 0.23681340855035504, + "grad_norm": 1.2087223529815674, + "learning_rate": 7.664815501496167e-05, + "loss": 0.19942669868469237, + "step": 55160 + }, + { + "epoch": 0.23685634064037506, + "grad_norm": 0.04313937947154045, + "learning_rate": 7.664384329484405e-05, + "loss": 0.37582294940948485, + "step": 55170 + }, + { + "epoch": 0.23689927273039507, + "grad_norm": 1.0092852115631104, + "learning_rate": 7.663953157472643e-05, + "loss": 0.1623198390007019, + "step": 55180 + }, + { + "epoch": 0.23694220482041506, + "grad_norm": 1.9008547067642212, + "learning_rate": 7.66352198546088e-05, + "loss": 0.25249829292297366, + "step": 55190 + }, + { + "epoch": 0.23698513691043507, + "grad_norm": 2.1248939037323, + "learning_rate": 7.663090813449118e-05, + "loss": 0.30942888259887696, + "step": 55200 + }, + { + "epoch": 0.2370280690004551, + "grad_norm": 0.10744863003492355, + "learning_rate": 7.662659641437356e-05, + "loss": 0.20057756900787355, + "step": 55210 + }, + { + "epoch": 0.23707100109047508, + "grad_norm": 0.06358012557029724, + "learning_rate": 7.662228469425593e-05, + "loss": 0.09414655566215516, + "step": 55220 + }, + { + "epoch": 0.2371139331804951, + "grad_norm": 1.4875483512878418, + "learning_rate": 7.661797297413831e-05, + "loss": 0.4498072624206543, + "step": 55230 + }, + { + "epoch": 0.2371568652705151, + "grad_norm": 0.09652744233608246, + "learning_rate": 7.661366125402069e-05, + "loss": 0.2348034620285034, + "step": 55240 + }, + { + "epoch": 0.2371997973605351, + "grad_norm": 0.00626254640519619, + "learning_rate": 7.660934953390307e-05, + "loss": 0.24348032474517822, + "step": 55250 + }, + { + "epoch": 0.2372427294505551, + "grad_norm": 0.47817111015319824, + "learning_rate": 7.660503781378544e-05, + "loss": 0.22016806602478028, + "step": 55260 + }, + { + "epoch": 0.23728566154057512, + "grad_norm": 2.5576000213623047, + "learning_rate": 7.660072609366781e-05, + "loss": 0.45274782180786133, + "step": 55270 + }, + { + "epoch": 0.23732859363059514, + "grad_norm": 0.03902919217944145, + "learning_rate": 7.659641437355019e-05, + "loss": 0.12822468280792237, + "step": 55280 + }, + { + "epoch": 0.23737152572061512, + "grad_norm": 3.338451623916626, + "learning_rate": 7.659210265343256e-05, + "loss": 0.10585402250289917, + "step": 55290 + }, + { + "epoch": 0.23741445781063514, + "grad_norm": 3.582664966583252, + "learning_rate": 7.658779093331494e-05, + "loss": 0.31997177600860593, + "step": 55300 + }, + { + "epoch": 0.23745738990065515, + "grad_norm": 0.2807246446609497, + "learning_rate": 7.658347921319732e-05, + "loss": 0.3692594289779663, + "step": 55310 + }, + { + "epoch": 0.23750032199067514, + "grad_norm": 1.0550538301467896, + "learning_rate": 7.65791674930797e-05, + "loss": 0.17689560651779174, + "step": 55320 + }, + { + "epoch": 0.23754325408069515, + "grad_norm": 0.022598102688789368, + "learning_rate": 7.657485577296207e-05, + "loss": 0.29569780826568604, + "step": 55330 + }, + { + "epoch": 0.23758618617071517, + "grad_norm": 1.544817566871643, + "learning_rate": 7.657054405284445e-05, + "loss": 0.21274950504302978, + "step": 55340 + }, + { + "epoch": 0.23762911826073516, + "grad_norm": 1.3617031574249268, + "learning_rate": 7.656623233272681e-05, + "loss": 0.4670865058898926, + "step": 55350 + }, + { + "epoch": 0.23767205035075517, + "grad_norm": 0.3003019690513611, + "learning_rate": 7.656192061260919e-05, + "loss": 0.48572516441345215, + "step": 55360 + }, + { + "epoch": 0.23771498244077519, + "grad_norm": 1.8010348081588745, + "learning_rate": 7.655760889249157e-05, + "loss": 0.27597956657409667, + "step": 55370 + }, + { + "epoch": 0.2377579145307952, + "grad_norm": 3.2601473331451416, + "learning_rate": 7.655329717237395e-05, + "loss": 0.3330512523651123, + "step": 55380 + }, + { + "epoch": 0.2378008466208152, + "grad_norm": 2.0501697063446045, + "learning_rate": 7.654898545225632e-05, + "loss": 0.37035629749298093, + "step": 55390 + }, + { + "epoch": 0.2378437787108352, + "grad_norm": 2.374547004699707, + "learning_rate": 7.65446737321387e-05, + "loss": 0.2998343944549561, + "step": 55400 + }, + { + "epoch": 0.23788671080085522, + "grad_norm": 0.07484851777553558, + "learning_rate": 7.654036201202108e-05, + "loss": 0.3185651063919067, + "step": 55410 + }, + { + "epoch": 0.2379296428908752, + "grad_norm": 0.2303788661956787, + "learning_rate": 7.653605029190345e-05, + "loss": 0.09342051148414612, + "step": 55420 + }, + { + "epoch": 0.23797257498089522, + "grad_norm": 0.06799449026584625, + "learning_rate": 7.653173857178583e-05, + "loss": 0.10728926658630371, + "step": 55430 + }, + { + "epoch": 0.23801550707091523, + "grad_norm": 1.861104130744934, + "learning_rate": 7.652742685166821e-05, + "loss": 0.2747479438781738, + "step": 55440 + }, + { + "epoch": 0.23805843916093522, + "grad_norm": 1.7472214698791504, + "learning_rate": 7.652311513155059e-05, + "loss": 0.23076362609863282, + "step": 55450 + }, + { + "epoch": 0.23810137125095523, + "grad_norm": 4.591130256652832, + "learning_rate": 7.651880341143296e-05, + "loss": 0.3691649198532104, + "step": 55460 + }, + { + "epoch": 0.23814430334097525, + "grad_norm": 0.005838882178068161, + "learning_rate": 7.651449169131534e-05, + "loss": 0.3147615909576416, + "step": 55470 + }, + { + "epoch": 0.23818723543099526, + "grad_norm": 0.44340893626213074, + "learning_rate": 7.651017997119772e-05, + "loss": 0.22155678272247314, + "step": 55480 + }, + { + "epoch": 0.23823016752101525, + "grad_norm": 1.6629033088684082, + "learning_rate": 7.65058682510801e-05, + "loss": 0.42931222915649414, + "step": 55490 + }, + { + "epoch": 0.23827309961103527, + "grad_norm": 0.005505688022822142, + "learning_rate": 7.650155653096247e-05, + "loss": 0.11621900796890258, + "step": 55500 + }, + { + "epoch": 0.23831603170105528, + "grad_norm": 2.3384525775909424, + "learning_rate": 7.649724481084484e-05, + "loss": 0.32160401344299316, + "step": 55510 + }, + { + "epoch": 0.23835896379107527, + "grad_norm": 0.007327724248170853, + "learning_rate": 7.649293309072721e-05, + "loss": 0.11470060348510742, + "step": 55520 + }, + { + "epoch": 0.23840189588109528, + "grad_norm": 1.3797262907028198, + "learning_rate": 7.648862137060959e-05, + "loss": 0.13075582981109618, + "step": 55530 + }, + { + "epoch": 0.2384448279711153, + "grad_norm": 0.02711120806634426, + "learning_rate": 7.648430965049197e-05, + "loss": 0.23798139095306398, + "step": 55540 + }, + { + "epoch": 0.23848776006113528, + "grad_norm": 1.339031457901001, + "learning_rate": 7.647999793037435e-05, + "loss": 0.3770133972167969, + "step": 55550 + }, + { + "epoch": 0.2385306921511553, + "grad_norm": 0.011666370555758476, + "learning_rate": 7.647568621025672e-05, + "loss": 0.17394804954528809, + "step": 55560 + }, + { + "epoch": 0.2385736242411753, + "grad_norm": 0.018904006108641624, + "learning_rate": 7.64713744901391e-05, + "loss": 0.19867415428161622, + "step": 55570 + }, + { + "epoch": 0.23861655633119533, + "grad_norm": 0.21838410198688507, + "learning_rate": 7.646706277002148e-05, + "loss": 0.1692600965499878, + "step": 55580 + }, + { + "epoch": 0.23865948842121532, + "grad_norm": 10.107283592224121, + "learning_rate": 7.646275104990384e-05, + "loss": 0.3083909273147583, + "step": 55590 + }, + { + "epoch": 0.23870242051123533, + "grad_norm": 3.883472204208374, + "learning_rate": 7.645843932978622e-05, + "loss": 0.42784271240234373, + "step": 55600 + }, + { + "epoch": 0.23874535260125535, + "grad_norm": 1.2638131380081177, + "learning_rate": 7.64541276096686e-05, + "loss": 0.45655550956726076, + "step": 55610 + }, + { + "epoch": 0.23878828469127533, + "grad_norm": 0.11055152118206024, + "learning_rate": 7.644981588955097e-05, + "loss": 0.08500316143035888, + "step": 55620 + }, + { + "epoch": 0.23883121678129535, + "grad_norm": 2.233280658721924, + "learning_rate": 7.644550416943335e-05, + "loss": 0.10072469711303711, + "step": 55630 + }, + { + "epoch": 0.23887414887131536, + "grad_norm": 0.38892972469329834, + "learning_rate": 7.644119244931573e-05, + "loss": 0.13284404277801515, + "step": 55640 + }, + { + "epoch": 0.23891708096133535, + "grad_norm": 0.12464083731174469, + "learning_rate": 7.64368807291981e-05, + "loss": 0.09709331393241882, + "step": 55650 + }, + { + "epoch": 0.23896001305135536, + "grad_norm": 0.0404987558722496, + "learning_rate": 7.643256900908048e-05, + "loss": 0.2879699945449829, + "step": 55660 + }, + { + "epoch": 0.23900294514137538, + "grad_norm": 1.0981417894363403, + "learning_rate": 7.642825728896286e-05, + "loss": 0.21002488136291503, + "step": 55670 + }, + { + "epoch": 0.23904587723139537, + "grad_norm": 0.09827089309692383, + "learning_rate": 7.642394556884524e-05, + "loss": 0.2971586942672729, + "step": 55680 + }, + { + "epoch": 0.23908880932141538, + "grad_norm": 0.030421625822782516, + "learning_rate": 7.641963384872762e-05, + "loss": 0.17070308923721314, + "step": 55690 + }, + { + "epoch": 0.2391317414114354, + "grad_norm": 2.331212043762207, + "learning_rate": 7.641532212860999e-05, + "loss": 0.2533195972442627, + "step": 55700 + }, + { + "epoch": 0.2391746735014554, + "grad_norm": 18.21701431274414, + "learning_rate": 7.641101040849237e-05, + "loss": 0.3127460479736328, + "step": 55710 + }, + { + "epoch": 0.2392176055914754, + "grad_norm": 2.263760566711426, + "learning_rate": 7.640669868837475e-05, + "loss": 0.2071403980255127, + "step": 55720 + }, + { + "epoch": 0.2392605376814954, + "grad_norm": 0.051928453147411346, + "learning_rate": 7.640238696825713e-05, + "loss": 0.1774152398109436, + "step": 55730 + }, + { + "epoch": 0.23930346977151543, + "grad_norm": 0.07828470319509506, + "learning_rate": 7.63980752481395e-05, + "loss": 0.04226884543895722, + "step": 55740 + }, + { + "epoch": 0.2393464018615354, + "grad_norm": 0.04483730345964432, + "learning_rate": 7.639376352802188e-05, + "loss": 0.24645552635192872, + "step": 55750 + }, + { + "epoch": 0.23938933395155543, + "grad_norm": 0.9185608625411987, + "learning_rate": 7.638945180790424e-05, + "loss": 0.3080772638320923, + "step": 55760 + }, + { + "epoch": 0.23943226604157544, + "grad_norm": 0.9898508787155151, + "learning_rate": 7.638514008778662e-05, + "loss": 0.25601682662963865, + "step": 55770 + }, + { + "epoch": 0.23947519813159543, + "grad_norm": 0.08774905651807785, + "learning_rate": 7.6380828367669e-05, + "loss": 0.4679861545562744, + "step": 55780 + }, + { + "epoch": 0.23951813022161544, + "grad_norm": 0.374380886554718, + "learning_rate": 7.637651664755138e-05, + "loss": 0.4750388145446777, + "step": 55790 + }, + { + "epoch": 0.23956106231163546, + "grad_norm": 1.6377544403076172, + "learning_rate": 7.637220492743375e-05, + "loss": 0.12196850776672363, + "step": 55800 + }, + { + "epoch": 0.23960399440165547, + "grad_norm": 0.02132430486381054, + "learning_rate": 7.636789320731613e-05, + "loss": 0.3349509000778198, + "step": 55810 + }, + { + "epoch": 0.23964692649167546, + "grad_norm": 0.2864157259464264, + "learning_rate": 7.636358148719851e-05, + "loss": 0.1535439133644104, + "step": 55820 + }, + { + "epoch": 0.23968985858169548, + "grad_norm": 0.0163569413125515, + "learning_rate": 7.635926976708089e-05, + "loss": 0.261598801612854, + "step": 55830 + }, + { + "epoch": 0.2397327906717155, + "grad_norm": 1.0393491983413696, + "learning_rate": 7.635495804696325e-05, + "loss": 0.3698171377182007, + "step": 55840 + }, + { + "epoch": 0.23977572276173548, + "grad_norm": 0.13531573116779327, + "learning_rate": 7.635064632684563e-05, + "loss": 0.23707644939422606, + "step": 55850 + }, + { + "epoch": 0.2398186548517555, + "grad_norm": 0.01517938356846571, + "learning_rate": 7.6346334606728e-05, + "loss": 0.30153985023498536, + "step": 55860 + }, + { + "epoch": 0.2398615869417755, + "grad_norm": 1.438055396080017, + "learning_rate": 7.634202288661038e-05, + "loss": 0.09777184724807739, + "step": 55870 + }, + { + "epoch": 0.2399045190317955, + "grad_norm": 0.011465308256447315, + "learning_rate": 7.633771116649276e-05, + "loss": 0.46697440147399905, + "step": 55880 + }, + { + "epoch": 0.2399474511218155, + "grad_norm": 0.008004733361303806, + "learning_rate": 7.633339944637514e-05, + "loss": 0.31014342308044435, + "step": 55890 + }, + { + "epoch": 0.23999038321183552, + "grad_norm": 0.11556950956583023, + "learning_rate": 7.632908772625753e-05, + "loss": 0.33110072612762453, + "step": 55900 + }, + { + "epoch": 0.24003331530185554, + "grad_norm": 0.05171780288219452, + "learning_rate": 7.63247760061399e-05, + "loss": 0.10481221675872802, + "step": 55910 + }, + { + "epoch": 0.24007624739187552, + "grad_norm": 17.309627532958984, + "learning_rate": 7.632046428602227e-05, + "loss": 0.21331896781921386, + "step": 55920 + }, + { + "epoch": 0.24011917948189554, + "grad_norm": 0.24218101799488068, + "learning_rate": 7.631615256590464e-05, + "loss": 0.2619100332260132, + "step": 55930 + }, + { + "epoch": 0.24016211157191555, + "grad_norm": 3.7106101512908936, + "learning_rate": 7.631184084578702e-05, + "loss": 0.42134804725646974, + "step": 55940 + }, + { + "epoch": 0.24020504366193554, + "grad_norm": 0.018480826169252396, + "learning_rate": 7.63075291256694e-05, + "loss": 0.2024019479751587, + "step": 55950 + }, + { + "epoch": 0.24024797575195556, + "grad_norm": 0.6436042785644531, + "learning_rate": 7.630321740555178e-05, + "loss": 0.3535203218460083, + "step": 55960 + }, + { + "epoch": 0.24029090784197557, + "grad_norm": 0.06518207490444183, + "learning_rate": 7.629890568543415e-05, + "loss": 0.14140788316726685, + "step": 55970 + }, + { + "epoch": 0.24033383993199556, + "grad_norm": 0.10466308891773224, + "learning_rate": 7.629459396531653e-05, + "loss": 0.017547784745693205, + "step": 55980 + }, + { + "epoch": 0.24037677202201557, + "grad_norm": 0.03298651799559593, + "learning_rate": 7.629028224519891e-05, + "loss": 0.12694171667099, + "step": 55990 + }, + { + "epoch": 0.2404197041120356, + "grad_norm": 0.0035074560437351465, + "learning_rate": 7.628597052508129e-05, + "loss": 0.25837092399597167, + "step": 56000 + }, + { + "epoch": 0.2404197041120356, + "eval_loss": 0.4490428566932678, + "eval_runtime": 27.3737, + "eval_samples_per_second": 3.653, + "eval_steps_per_second": 3.653, + "step": 56000 + }, + { + "epoch": 0.2404626362020556, + "grad_norm": 1.269037127494812, + "learning_rate": 7.628165880496365e-05, + "loss": 0.3254170656204224, + "step": 56010 + }, + { + "epoch": 0.2405055682920756, + "grad_norm": 0.20150920748710632, + "learning_rate": 7.627734708484603e-05, + "loss": 0.28833024501800536, + "step": 56020 + }, + { + "epoch": 0.2405485003820956, + "grad_norm": 2.5157318115234375, + "learning_rate": 7.62730353647284e-05, + "loss": 0.4094663143157959, + "step": 56030 + }, + { + "epoch": 0.24059143247211562, + "grad_norm": 0.044199321419000626, + "learning_rate": 7.626872364461078e-05, + "loss": 0.17894190549850464, + "step": 56040 + }, + { + "epoch": 0.2406343645621356, + "grad_norm": 0.2634064257144928, + "learning_rate": 7.626441192449316e-05, + "loss": 0.29080748558044434, + "step": 56050 + }, + { + "epoch": 0.24067729665215562, + "grad_norm": 0.024507373571395874, + "learning_rate": 7.626010020437554e-05, + "loss": 0.22542548179626465, + "step": 56060 + }, + { + "epoch": 0.24072022874217563, + "grad_norm": 1.499601125717163, + "learning_rate": 7.625578848425791e-05, + "loss": 0.41116743087768554, + "step": 56070 + }, + { + "epoch": 0.24076316083219562, + "grad_norm": 0.085579052567482, + "learning_rate": 7.625147676414029e-05, + "loss": 0.06765444278717041, + "step": 56080 + }, + { + "epoch": 0.24080609292221564, + "grad_norm": 2.3689451217651367, + "learning_rate": 7.624716504402266e-05, + "loss": 0.2160788059234619, + "step": 56090 + }, + { + "epoch": 0.24084902501223565, + "grad_norm": 0.9572241306304932, + "learning_rate": 7.624285332390503e-05, + "loss": 0.5079023361206054, + "step": 56100 + }, + { + "epoch": 0.24089195710225564, + "grad_norm": 0.2550654709339142, + "learning_rate": 7.623854160378741e-05, + "loss": 0.23361270427703856, + "step": 56110 + }, + { + "epoch": 0.24093488919227565, + "grad_norm": 0.27372607588768005, + "learning_rate": 7.62342298836698e-05, + "loss": 0.20449163913726806, + "step": 56120 + }, + { + "epoch": 0.24097782128229567, + "grad_norm": 0.12788492441177368, + "learning_rate": 7.622991816355218e-05, + "loss": 0.2756718873977661, + "step": 56130 + }, + { + "epoch": 0.24102075337231568, + "grad_norm": 0.03664913773536682, + "learning_rate": 7.622560644343456e-05, + "loss": 0.0964701533317566, + "step": 56140 + }, + { + "epoch": 0.24106368546233567, + "grad_norm": 1.1955845355987549, + "learning_rate": 7.622129472331693e-05, + "loss": 0.5049872398376465, + "step": 56150 + }, + { + "epoch": 0.24110661755235568, + "grad_norm": 0.07352086156606674, + "learning_rate": 7.621698300319931e-05, + "loss": 0.31375598907470703, + "step": 56160 + }, + { + "epoch": 0.2411495496423757, + "grad_norm": 0.02647421695291996, + "learning_rate": 7.621267128308167e-05, + "loss": 0.3265992164611816, + "step": 56170 + }, + { + "epoch": 0.24119248173239569, + "grad_norm": 1.968076467514038, + "learning_rate": 7.620835956296405e-05, + "loss": 0.2549333095550537, + "step": 56180 + }, + { + "epoch": 0.2412354138224157, + "grad_norm": 0.08361245691776276, + "learning_rate": 7.620404784284643e-05, + "loss": 0.0806192398071289, + "step": 56190 + }, + { + "epoch": 0.24127834591243572, + "grad_norm": 3.567979097366333, + "learning_rate": 7.61997361227288e-05, + "loss": 0.24559214115142822, + "step": 56200 + }, + { + "epoch": 0.2413212780024557, + "grad_norm": 0.0898161232471466, + "learning_rate": 7.619542440261118e-05, + "loss": 0.051443439722061154, + "step": 56210 + }, + { + "epoch": 0.24136421009247572, + "grad_norm": 2.6911261081695557, + "learning_rate": 7.619111268249356e-05, + "loss": 0.3605805397033691, + "step": 56220 + }, + { + "epoch": 0.24140714218249573, + "grad_norm": 0.23818203806877136, + "learning_rate": 7.618680096237594e-05, + "loss": 0.4218717575073242, + "step": 56230 + }, + { + "epoch": 0.24145007427251575, + "grad_norm": 0.10996730625629425, + "learning_rate": 7.618248924225832e-05, + "loss": 0.09531922936439514, + "step": 56240 + }, + { + "epoch": 0.24149300636253573, + "grad_norm": 0.03727518394589424, + "learning_rate": 7.617817752214068e-05, + "loss": 0.0390720546245575, + "step": 56250 + }, + { + "epoch": 0.24153593845255575, + "grad_norm": 0.011419006623327732, + "learning_rate": 7.617386580202306e-05, + "loss": 0.09541901350021362, + "step": 56260 + }, + { + "epoch": 0.24157887054257576, + "grad_norm": 0.03283700719475746, + "learning_rate": 7.616955408190543e-05, + "loss": 0.06369619965553283, + "step": 56270 + }, + { + "epoch": 0.24162180263259575, + "grad_norm": 0.012161415070295334, + "learning_rate": 7.616524236178781e-05, + "loss": 0.002637416496872902, + "step": 56280 + }, + { + "epoch": 0.24166473472261576, + "grad_norm": 2.356962203979492, + "learning_rate": 7.616093064167019e-05, + "loss": 0.15238213539123535, + "step": 56290 + }, + { + "epoch": 0.24170766681263578, + "grad_norm": 0.002443671924993396, + "learning_rate": 7.615661892155257e-05, + "loss": 0.351188063621521, + "step": 56300 + }, + { + "epoch": 0.24175059890265577, + "grad_norm": 0.008870512247085571, + "learning_rate": 7.615230720143494e-05, + "loss": 0.39712421894073485, + "step": 56310 + }, + { + "epoch": 0.24179353099267578, + "grad_norm": 0.04952370002865791, + "learning_rate": 7.614799548131732e-05, + "loss": 0.25222814083099365, + "step": 56320 + }, + { + "epoch": 0.2418364630826958, + "grad_norm": 0.015394582413136959, + "learning_rate": 7.614368376119968e-05, + "loss": 0.1964707851409912, + "step": 56330 + }, + { + "epoch": 0.2418793951727158, + "grad_norm": 0.09816809743642807, + "learning_rate": 7.613937204108208e-05, + "loss": 0.15900418758392335, + "step": 56340 + }, + { + "epoch": 0.2419223272627358, + "grad_norm": 0.009692016057670116, + "learning_rate": 7.613506032096445e-05, + "loss": 0.2867144584655762, + "step": 56350 + }, + { + "epoch": 0.2419652593527558, + "grad_norm": 0.028097325935959816, + "learning_rate": 7.613074860084683e-05, + "loss": 0.17743611335754395, + "step": 56360 + }, + { + "epoch": 0.24200819144277583, + "grad_norm": 0.5458617210388184, + "learning_rate": 7.612643688072921e-05, + "loss": 0.2927206754684448, + "step": 56370 + }, + { + "epoch": 0.24205112353279581, + "grad_norm": 2.905992031097412, + "learning_rate": 7.612212516061158e-05, + "loss": 0.3399640083312988, + "step": 56380 + }, + { + "epoch": 0.24209405562281583, + "grad_norm": 0.04598947614431381, + "learning_rate": 7.611781344049396e-05, + "loss": 0.27964622974395753, + "step": 56390 + }, + { + "epoch": 0.24213698771283584, + "grad_norm": 1.6085865497589111, + "learning_rate": 7.611350172037634e-05, + "loss": 0.36804425716400146, + "step": 56400 + }, + { + "epoch": 0.24217991980285583, + "grad_norm": 2.989314556121826, + "learning_rate": 7.610919000025872e-05, + "loss": 0.3538304328918457, + "step": 56410 + }, + { + "epoch": 0.24222285189287585, + "grad_norm": 0.038225866854190826, + "learning_rate": 7.610487828014108e-05, + "loss": 0.28265354633331297, + "step": 56420 + }, + { + "epoch": 0.24226578398289586, + "grad_norm": 0.040036361664533615, + "learning_rate": 7.610056656002346e-05, + "loss": 0.2343906879425049, + "step": 56430 + }, + { + "epoch": 0.24230871607291588, + "grad_norm": 0.022909799590706825, + "learning_rate": 7.609625483990584e-05, + "loss": 0.26263716220855715, + "step": 56440 + }, + { + "epoch": 0.24235164816293586, + "grad_norm": 0.19417926669120789, + "learning_rate": 7.609194311978821e-05, + "loss": 0.1913734793663025, + "step": 56450 + }, + { + "epoch": 0.24239458025295588, + "grad_norm": 0.0552341528236866, + "learning_rate": 7.608763139967059e-05, + "loss": 0.14965476989746093, + "step": 56460 + }, + { + "epoch": 0.2424375123429759, + "grad_norm": 0.03202309086918831, + "learning_rate": 7.608331967955297e-05, + "loss": 0.21325488090515138, + "step": 56470 + }, + { + "epoch": 0.24248044443299588, + "grad_norm": 3.847653388977051, + "learning_rate": 7.607900795943534e-05, + "loss": 0.3930112600326538, + "step": 56480 + }, + { + "epoch": 0.2425233765230159, + "grad_norm": 9.966902732849121, + "learning_rate": 7.607469623931772e-05, + "loss": 0.14765695333480836, + "step": 56490 + }, + { + "epoch": 0.2425663086130359, + "grad_norm": 7.9163408279418945, + "learning_rate": 7.607038451920009e-05, + "loss": 0.21990103721618653, + "step": 56500 + }, + { + "epoch": 0.2426092407030559, + "grad_norm": 0.0343107134103775, + "learning_rate": 7.606607279908246e-05, + "loss": 0.2227553129196167, + "step": 56510 + }, + { + "epoch": 0.2426521727930759, + "grad_norm": 0.019609419628977776, + "learning_rate": 7.606176107896484e-05, + "loss": 0.264559006690979, + "step": 56520 + }, + { + "epoch": 0.24269510488309592, + "grad_norm": 0.00810485053807497, + "learning_rate": 7.605744935884722e-05, + "loss": 0.27818167209625244, + "step": 56530 + }, + { + "epoch": 0.2427380369731159, + "grad_norm": 0.00926060788333416, + "learning_rate": 7.60531376387296e-05, + "loss": 0.07828181982040405, + "step": 56540 + }, + { + "epoch": 0.24278096906313593, + "grad_norm": 2.1390297412872314, + "learning_rate": 7.604882591861197e-05, + "loss": 0.35963263511657717, + "step": 56550 + }, + { + "epoch": 0.24282390115315594, + "grad_norm": 1.0619257688522339, + "learning_rate": 7.604451419849435e-05, + "loss": 0.1911607027053833, + "step": 56560 + }, + { + "epoch": 0.24286683324317596, + "grad_norm": 1.8809216022491455, + "learning_rate": 7.604020247837673e-05, + "loss": 0.27733912467956545, + "step": 56570 + }, + { + "epoch": 0.24290976533319594, + "grad_norm": 0.45261240005493164, + "learning_rate": 7.60358907582591e-05, + "loss": 0.2474360704421997, + "step": 56580 + }, + { + "epoch": 0.24295269742321596, + "grad_norm": 0.0851302370429039, + "learning_rate": 7.603157903814148e-05, + "loss": 0.24859883785247802, + "step": 56590 + }, + { + "epoch": 0.24299562951323597, + "grad_norm": 0.07440662384033203, + "learning_rate": 7.602726731802386e-05, + "loss": 0.15816271305084229, + "step": 56600 + }, + { + "epoch": 0.24303856160325596, + "grad_norm": 0.008213400840759277, + "learning_rate": 7.602295559790624e-05, + "loss": 0.13967883586883545, + "step": 56610 + }, + { + "epoch": 0.24308149369327597, + "grad_norm": 1.484141230583191, + "learning_rate": 7.601864387778861e-05, + "loss": 0.3525965690612793, + "step": 56620 + }, + { + "epoch": 0.243124425783296, + "grad_norm": 0.016760317608714104, + "learning_rate": 7.601433215767099e-05, + "loss": 0.14842404127120973, + "step": 56630 + }, + { + "epoch": 0.24316735787331598, + "grad_norm": 0.6866039633750916, + "learning_rate": 7.601002043755337e-05, + "loss": 0.42420759201049807, + "step": 56640 + }, + { + "epoch": 0.243210289963336, + "grad_norm": 2.159764051437378, + "learning_rate": 7.600570871743575e-05, + "loss": 0.3373197317123413, + "step": 56650 + }, + { + "epoch": 0.243253222053356, + "grad_norm": 3.4098398685455322, + "learning_rate": 7.600139699731811e-05, + "loss": 0.15713222026824952, + "step": 56660 + }, + { + "epoch": 0.24329615414337602, + "grad_norm": 0.052497498691082, + "learning_rate": 7.599708527720049e-05, + "loss": 0.30463128089904784, + "step": 56670 + }, + { + "epoch": 0.243339086233396, + "grad_norm": 0.9863664507865906, + "learning_rate": 7.599277355708286e-05, + "loss": 0.36327598094940183, + "step": 56680 + }, + { + "epoch": 0.24338201832341602, + "grad_norm": 1.23508620262146, + "learning_rate": 7.598846183696524e-05, + "loss": 0.21613700389862062, + "step": 56690 + }, + { + "epoch": 0.24342495041343604, + "grad_norm": 0.3627750873565674, + "learning_rate": 7.598415011684762e-05, + "loss": 0.5012299060821533, + "step": 56700 + }, + { + "epoch": 0.24346788250345602, + "grad_norm": 0.17750516533851624, + "learning_rate": 7.597983839673e-05, + "loss": 0.20147082805633545, + "step": 56710 + }, + { + "epoch": 0.24351081459347604, + "grad_norm": 0.028639402240514755, + "learning_rate": 7.597552667661237e-05, + "loss": 0.35656213760375977, + "step": 56720 + }, + { + "epoch": 0.24355374668349605, + "grad_norm": 0.027626140043139458, + "learning_rate": 7.597121495649475e-05, + "loss": 0.23855340480804443, + "step": 56730 + }, + { + "epoch": 0.24359667877351604, + "grad_norm": 1.3562794923782349, + "learning_rate": 7.596690323637713e-05, + "loss": 0.45727100372314455, + "step": 56740 + }, + { + "epoch": 0.24363961086353605, + "grad_norm": 8.56227970123291, + "learning_rate": 7.596259151625949e-05, + "loss": 0.29800164699554443, + "step": 56750 + }, + { + "epoch": 0.24368254295355607, + "grad_norm": 3.818679094314575, + "learning_rate": 7.595827979614187e-05, + "loss": 0.2236952304840088, + "step": 56760 + }, + { + "epoch": 0.24372547504357608, + "grad_norm": 0.007640776690095663, + "learning_rate": 7.595396807602425e-05, + "loss": 0.18620272874832153, + "step": 56770 + }, + { + "epoch": 0.24376840713359607, + "grad_norm": 0.2855760455131531, + "learning_rate": 7.594965635590662e-05, + "loss": 0.38433017730712893, + "step": 56780 + }, + { + "epoch": 0.24381133922361609, + "grad_norm": 2.060290813446045, + "learning_rate": 7.5945344635789e-05, + "loss": 0.30763907432556153, + "step": 56790 + }, + { + "epoch": 0.2438542713136361, + "grad_norm": 0.3530263900756836, + "learning_rate": 7.594103291567138e-05, + "loss": 0.3436635971069336, + "step": 56800 + }, + { + "epoch": 0.2438972034036561, + "grad_norm": 3.5977604389190674, + "learning_rate": 7.593672119555376e-05, + "loss": 0.3365695714950562, + "step": 56810 + }, + { + "epoch": 0.2439401354936761, + "grad_norm": 2.2566967010498047, + "learning_rate": 7.593240947543613e-05, + "loss": 0.23258376121520996, + "step": 56820 + }, + { + "epoch": 0.24398306758369612, + "grad_norm": 2.2684240341186523, + "learning_rate": 7.592809775531851e-05, + "loss": 0.3743330240249634, + "step": 56830 + }, + { + "epoch": 0.2440259996737161, + "grad_norm": 5.0866193771362305, + "learning_rate": 7.592378603520089e-05, + "loss": 0.28389415740966795, + "step": 56840 + }, + { + "epoch": 0.24406893176373612, + "grad_norm": 0.016268953680992126, + "learning_rate": 7.591947431508327e-05, + "loss": 0.11955786943435669, + "step": 56850 + }, + { + "epoch": 0.24411186385375613, + "grad_norm": 0.507336437702179, + "learning_rate": 7.591516259496564e-05, + "loss": 0.2459397315979004, + "step": 56860 + }, + { + "epoch": 0.24415479594377615, + "grad_norm": 3.0881454944610596, + "learning_rate": 7.591085087484802e-05, + "loss": 0.21296381950378418, + "step": 56870 + }, + { + "epoch": 0.24419772803379614, + "grad_norm": 3.4023799896240234, + "learning_rate": 7.59065391547304e-05, + "loss": 0.14938076734542846, + "step": 56880 + }, + { + "epoch": 0.24424066012381615, + "grad_norm": 3.494509696960449, + "learning_rate": 7.590222743461277e-05, + "loss": 0.3454356908798218, + "step": 56890 + }, + { + "epoch": 0.24428359221383616, + "grad_norm": 0.10136851668357849, + "learning_rate": 7.589791571449515e-05, + "loss": 0.1815933346748352, + "step": 56900 + }, + { + "epoch": 0.24432652430385615, + "grad_norm": 0.017074063420295715, + "learning_rate": 7.589360399437752e-05, + "loss": 0.1557396173477173, + "step": 56910 + }, + { + "epoch": 0.24436945639387617, + "grad_norm": 0.008806165307760239, + "learning_rate": 7.58892922742599e-05, + "loss": 0.01468321681022644, + "step": 56920 + }, + { + "epoch": 0.24441238848389618, + "grad_norm": 0.004361881874501705, + "learning_rate": 7.588498055414227e-05, + "loss": 0.18163535594940186, + "step": 56930 + }, + { + "epoch": 0.24445532057391617, + "grad_norm": 2.808199167251587, + "learning_rate": 7.588066883402465e-05, + "loss": 0.13877879381179808, + "step": 56940 + }, + { + "epoch": 0.24449825266393618, + "grad_norm": 0.0035761999897658825, + "learning_rate": 7.587635711390703e-05, + "loss": 0.23683485984802247, + "step": 56950 + }, + { + "epoch": 0.2445411847539562, + "grad_norm": 0.0018882449949160218, + "learning_rate": 7.58720453937894e-05, + "loss": 0.040769991278648374, + "step": 56960 + }, + { + "epoch": 0.24458411684397618, + "grad_norm": 0.0017095755320042372, + "learning_rate": 7.586773367367178e-05, + "loss": 0.07406737804412841, + "step": 56970 + }, + { + "epoch": 0.2446270489339962, + "grad_norm": 0.0053458381444215775, + "learning_rate": 7.586342195355416e-05, + "loss": 0.19818114042282103, + "step": 56980 + }, + { + "epoch": 0.24466998102401621, + "grad_norm": 0.2893374264240265, + "learning_rate": 7.585911023343652e-05, + "loss": 0.06889631152153015, + "step": 56990 + }, + { + "epoch": 0.24471291311403623, + "grad_norm": 0.43643325567245483, + "learning_rate": 7.58547985133189e-05, + "loss": 0.11115390062332153, + "step": 57000 + }, + { + "epoch": 0.24471291311403623, + "eval_loss": 0.45208680629730225, + "eval_runtime": 27.4112, + "eval_samples_per_second": 3.648, + "eval_steps_per_second": 3.648, + "step": 57000 + }, + { + "epoch": 0.24475584520405622, + "grad_norm": 2.3712055683135986, + "learning_rate": 7.585048679320128e-05, + "loss": 0.2907646894454956, + "step": 57010 + }, + { + "epoch": 0.24479877729407623, + "grad_norm": 2.5993006229400635, + "learning_rate": 7.584617507308365e-05, + "loss": 0.2117297887802124, + "step": 57020 + }, + { + "epoch": 0.24484170938409625, + "grad_norm": 1.6458356380462646, + "learning_rate": 7.584186335296603e-05, + "loss": 0.3011794567108154, + "step": 57030 + }, + { + "epoch": 0.24488464147411623, + "grad_norm": 0.010335003025829792, + "learning_rate": 7.583755163284841e-05, + "loss": 0.1275590181350708, + "step": 57040 + }, + { + "epoch": 0.24492757356413625, + "grad_norm": 0.008200617507100105, + "learning_rate": 7.583323991273079e-05, + "loss": 0.0024192286655306816, + "step": 57050 + }, + { + "epoch": 0.24497050565415626, + "grad_norm": 1.3849883079528809, + "learning_rate": 7.582892819261316e-05, + "loss": 0.23618090152740479, + "step": 57060 + }, + { + "epoch": 0.24501343774417625, + "grad_norm": 0.004566836636513472, + "learning_rate": 7.582461647249554e-05, + "loss": 0.21608200073242187, + "step": 57070 + }, + { + "epoch": 0.24505636983419626, + "grad_norm": 0.018896836787462234, + "learning_rate": 7.582030475237792e-05, + "loss": 0.2848828792572021, + "step": 57080 + }, + { + "epoch": 0.24509930192421628, + "grad_norm": 0.02968161180615425, + "learning_rate": 7.58159930322603e-05, + "loss": 0.3523215055465698, + "step": 57090 + }, + { + "epoch": 0.2451422340142363, + "grad_norm": 1.484802007675171, + "learning_rate": 7.581168131214267e-05, + "loss": 0.2556891441345215, + "step": 57100 + }, + { + "epoch": 0.24518516610425628, + "grad_norm": 1.1301987171173096, + "learning_rate": 7.580736959202505e-05, + "loss": 0.34354567527770996, + "step": 57110 + }, + { + "epoch": 0.2452280981942763, + "grad_norm": 0.0026234660763293505, + "learning_rate": 7.580305787190743e-05, + "loss": 0.11568312644958496, + "step": 57120 + }, + { + "epoch": 0.2452710302842963, + "grad_norm": 0.03418371081352234, + "learning_rate": 7.57987461517898e-05, + "loss": 0.31449136734008787, + "step": 57130 + }, + { + "epoch": 0.2453139623743163, + "grad_norm": 9.43087100982666, + "learning_rate": 7.579443443167218e-05, + "loss": 0.3676129341125488, + "step": 57140 + }, + { + "epoch": 0.2453568944643363, + "grad_norm": 0.024150878190994263, + "learning_rate": 7.579012271155456e-05, + "loss": 0.19816354513168336, + "step": 57150 + }, + { + "epoch": 0.24539982655435633, + "grad_norm": 0.05833771079778671, + "learning_rate": 7.578581099143692e-05, + "loss": 0.2208428144454956, + "step": 57160 + }, + { + "epoch": 0.2454427586443763, + "grad_norm": 0.1591957062482834, + "learning_rate": 7.57814992713193e-05, + "loss": 0.11497148275375366, + "step": 57170 + }, + { + "epoch": 0.24548569073439633, + "grad_norm": 0.15572971105575562, + "learning_rate": 7.577718755120168e-05, + "loss": 0.3440711975097656, + "step": 57180 + }, + { + "epoch": 0.24552862282441634, + "grad_norm": 2.8513052463531494, + "learning_rate": 7.577287583108405e-05, + "loss": 0.46184988021850587, + "step": 57190 + }, + { + "epoch": 0.24557155491443636, + "grad_norm": 1.0112202167510986, + "learning_rate": 7.576856411096643e-05, + "loss": 0.22247018814086914, + "step": 57200 + }, + { + "epoch": 0.24561448700445634, + "grad_norm": 0.9999586939811707, + "learning_rate": 7.576425239084881e-05, + "loss": 0.32436323165893555, + "step": 57210 + }, + { + "epoch": 0.24565741909447636, + "grad_norm": 16.790267944335938, + "learning_rate": 7.575994067073119e-05, + "loss": 0.3138288974761963, + "step": 57220 + }, + { + "epoch": 0.24570035118449637, + "grad_norm": 1.5769797563552856, + "learning_rate": 7.575562895061356e-05, + "loss": 0.26715841293334963, + "step": 57230 + }, + { + "epoch": 0.24574328327451636, + "grad_norm": 0.003906694240868092, + "learning_rate": 7.575131723049593e-05, + "loss": 0.398896598815918, + "step": 57240 + }, + { + "epoch": 0.24578621536453638, + "grad_norm": 0.04367861524224281, + "learning_rate": 7.57470055103783e-05, + "loss": 0.14916378259658813, + "step": 57250 + }, + { + "epoch": 0.2458291474545564, + "grad_norm": 0.001283388352021575, + "learning_rate": 7.574269379026068e-05, + "loss": 0.2797567367553711, + "step": 57260 + }, + { + "epoch": 0.24587207954457638, + "grad_norm": 0.13986949622631073, + "learning_rate": 7.573838207014306e-05, + "loss": 0.1595933437347412, + "step": 57270 + }, + { + "epoch": 0.2459150116345964, + "grad_norm": 0.010912904515862465, + "learning_rate": 7.573407035002544e-05, + "loss": 0.2850444555282593, + "step": 57280 + }, + { + "epoch": 0.2459579437246164, + "grad_norm": 1.6686103343963623, + "learning_rate": 7.572975862990781e-05, + "loss": 0.36696457862854004, + "step": 57290 + }, + { + "epoch": 0.24600087581463642, + "grad_norm": 0.0024620601907372475, + "learning_rate": 7.57254469097902e-05, + "loss": 0.18678882122039794, + "step": 57300 + }, + { + "epoch": 0.2460438079046564, + "grad_norm": 0.10246946662664413, + "learning_rate": 7.572113518967258e-05, + "loss": 0.15630044937133789, + "step": 57310 + }, + { + "epoch": 0.24608673999467642, + "grad_norm": 0.022064056247472763, + "learning_rate": 7.571682346955495e-05, + "loss": 0.05567708015441895, + "step": 57320 + }, + { + "epoch": 0.24612967208469644, + "grad_norm": 0.002969966037198901, + "learning_rate": 7.571251174943732e-05, + "loss": 0.36348717212677, + "step": 57330 + }, + { + "epoch": 0.24617260417471643, + "grad_norm": 0.17457488179206848, + "learning_rate": 7.57082000293197e-05, + "loss": 0.33927862644195556, + "step": 57340 + }, + { + "epoch": 0.24621553626473644, + "grad_norm": 1.373376488685608, + "learning_rate": 7.570388830920208e-05, + "loss": 0.2118394136428833, + "step": 57350 + }, + { + "epoch": 0.24625846835475645, + "grad_norm": 0.04611791670322418, + "learning_rate": 7.569957658908446e-05, + "loss": 0.18210265636444092, + "step": 57360 + }, + { + "epoch": 0.24630140044477644, + "grad_norm": 0.9812759160995483, + "learning_rate": 7.569526486896683e-05, + "loss": 0.1476440668106079, + "step": 57370 + }, + { + "epoch": 0.24634433253479646, + "grad_norm": 0.03836577385663986, + "learning_rate": 7.569095314884921e-05, + "loss": 0.22062788009643555, + "step": 57380 + }, + { + "epoch": 0.24638726462481647, + "grad_norm": 0.0069273305125534534, + "learning_rate": 7.568664142873159e-05, + "loss": 0.38461480140686033, + "step": 57390 + }, + { + "epoch": 0.24643019671483646, + "grad_norm": 0.07969487458467484, + "learning_rate": 7.568232970861395e-05, + "loss": 0.1768990635871887, + "step": 57400 + }, + { + "epoch": 0.24647312880485647, + "grad_norm": 0.04361164569854736, + "learning_rate": 7.567801798849633e-05, + "loss": 0.3352889776229858, + "step": 57410 + }, + { + "epoch": 0.2465160608948765, + "grad_norm": 0.12355928122997284, + "learning_rate": 7.56737062683787e-05, + "loss": 0.3005270719528198, + "step": 57420 + }, + { + "epoch": 0.2465589929848965, + "grad_norm": 0.0032305035274475813, + "learning_rate": 7.566939454826108e-05, + "loss": 0.39624905586242676, + "step": 57430 + }, + { + "epoch": 0.2466019250749165, + "grad_norm": 11.76025676727295, + "learning_rate": 7.566508282814346e-05, + "loss": 0.2909027814865112, + "step": 57440 + }, + { + "epoch": 0.2466448571649365, + "grad_norm": 0.0032621347345411777, + "learning_rate": 7.566077110802584e-05, + "loss": 0.3369508028030396, + "step": 57450 + }, + { + "epoch": 0.24668778925495652, + "grad_norm": 1.9829251766204834, + "learning_rate": 7.565645938790822e-05, + "loss": 0.4466217517852783, + "step": 57460 + }, + { + "epoch": 0.2467307213449765, + "grad_norm": 1.7846029996871948, + "learning_rate": 7.565214766779059e-05, + "loss": 0.10857169628143311, + "step": 57470 + }, + { + "epoch": 0.24677365343499652, + "grad_norm": 0.6172264218330383, + "learning_rate": 7.564783594767297e-05, + "loss": 0.3345313787460327, + "step": 57480 + }, + { + "epoch": 0.24681658552501654, + "grad_norm": 0.24104170501232147, + "learning_rate": 7.564352422755533e-05, + "loss": 0.202384614944458, + "step": 57490 + }, + { + "epoch": 0.24685951761503652, + "grad_norm": 1.0128809213638306, + "learning_rate": 7.563921250743771e-05, + "loss": 0.12638015747070314, + "step": 57500 + }, + { + "epoch": 0.24690244970505654, + "grad_norm": 5.046428203582764, + "learning_rate": 7.563490078732009e-05, + "loss": 0.18070136308670043, + "step": 57510 + }, + { + "epoch": 0.24694538179507655, + "grad_norm": 0.14580535888671875, + "learning_rate": 7.563058906720248e-05, + "loss": 0.12394511699676514, + "step": 57520 + }, + { + "epoch": 0.24698831388509657, + "grad_norm": 3.3323802947998047, + "learning_rate": 7.562627734708486e-05, + "loss": 0.1436055302619934, + "step": 57530 + }, + { + "epoch": 0.24703124597511655, + "grad_norm": 0.06035997346043587, + "learning_rate": 7.562196562696723e-05, + "loss": 0.17422177791595458, + "step": 57540 + }, + { + "epoch": 0.24707417806513657, + "grad_norm": 0.04609677940607071, + "learning_rate": 7.561765390684961e-05, + "loss": 0.3345479965209961, + "step": 57550 + }, + { + "epoch": 0.24711711015515658, + "grad_norm": 0.022857768461108208, + "learning_rate": 7.561334218673199e-05, + "loss": 0.061411714553833006, + "step": 57560 + }, + { + "epoch": 0.24716004224517657, + "grad_norm": 0.03906315565109253, + "learning_rate": 7.560903046661435e-05, + "loss": 0.2037139892578125, + "step": 57570 + }, + { + "epoch": 0.24720297433519658, + "grad_norm": 2.5178000926971436, + "learning_rate": 7.560471874649673e-05, + "loss": 0.24743380546569824, + "step": 57580 + }, + { + "epoch": 0.2472459064252166, + "grad_norm": 0.10384848713874817, + "learning_rate": 7.560040702637911e-05, + "loss": 0.2382965087890625, + "step": 57590 + }, + { + "epoch": 0.2472888385152366, + "grad_norm": 0.04842836409807205, + "learning_rate": 7.559609530626149e-05, + "loss": 0.13167185783386232, + "step": 57600 + }, + { + "epoch": 0.2473317706052566, + "grad_norm": 0.111887127161026, + "learning_rate": 7.559178358614386e-05, + "loss": 0.07016860842704772, + "step": 57610 + }, + { + "epoch": 0.24737470269527662, + "grad_norm": 0.675987184047699, + "learning_rate": 7.558747186602624e-05, + "loss": 0.20956764221191407, + "step": 57620 + }, + { + "epoch": 0.24741763478529663, + "grad_norm": 0.003355368971824646, + "learning_rate": 7.558316014590862e-05, + "loss": 0.347391676902771, + "step": 57630 + }, + { + "epoch": 0.24746056687531662, + "grad_norm": 1.6586114168167114, + "learning_rate": 7.5578848425791e-05, + "loss": 0.35160102844238283, + "step": 57640 + }, + { + "epoch": 0.24750349896533663, + "grad_norm": 0.19788451492786407, + "learning_rate": 7.557453670567336e-05, + "loss": 0.1748092770576477, + "step": 57650 + }, + { + "epoch": 0.24754643105535665, + "grad_norm": 3.0574142932891846, + "learning_rate": 7.557022498555574e-05, + "loss": 0.26805412769317627, + "step": 57660 + }, + { + "epoch": 0.24758936314537663, + "grad_norm": 0.028504248708486557, + "learning_rate": 7.556591326543811e-05, + "loss": 0.44770083427429197, + "step": 57670 + }, + { + "epoch": 0.24763229523539665, + "grad_norm": 1.19331693649292, + "learning_rate": 7.556160154532049e-05, + "loss": 0.2743785858154297, + "step": 57680 + }, + { + "epoch": 0.24767522732541666, + "grad_norm": 14.593634605407715, + "learning_rate": 7.555728982520287e-05, + "loss": 0.14170204401016234, + "step": 57690 + }, + { + "epoch": 0.24771815941543665, + "grad_norm": 0.3801772892475128, + "learning_rate": 7.555297810508524e-05, + "loss": 0.37049763202667235, + "step": 57700 + }, + { + "epoch": 0.24776109150545667, + "grad_norm": 1.7399812936782837, + "learning_rate": 7.554866638496762e-05, + "loss": 0.31119160652160643, + "step": 57710 + }, + { + "epoch": 0.24780402359547668, + "grad_norm": 16.13619613647461, + "learning_rate": 7.554435466485e-05, + "loss": 0.4131039619445801, + "step": 57720 + }, + { + "epoch": 0.2478469556854967, + "grad_norm": 12.335932731628418, + "learning_rate": 7.554004294473236e-05, + "loss": 0.2520522356033325, + "step": 57730 + }, + { + "epoch": 0.24788988777551668, + "grad_norm": 0.017610453069210052, + "learning_rate": 7.553573122461475e-05, + "loss": 0.143160879611969, + "step": 57740 + }, + { + "epoch": 0.2479328198655367, + "grad_norm": 0.005706754047423601, + "learning_rate": 7.553141950449713e-05, + "loss": 0.09667201042175293, + "step": 57750 + }, + { + "epoch": 0.2479757519555567, + "grad_norm": 0.4213595390319824, + "learning_rate": 7.552710778437951e-05, + "loss": 0.26541934013366697, + "step": 57760 + }, + { + "epoch": 0.2480186840455767, + "grad_norm": 0.9437307715415955, + "learning_rate": 7.552279606426189e-05, + "loss": 0.5367563247680665, + "step": 57770 + }, + { + "epoch": 0.2480616161355967, + "grad_norm": 0.007601315155625343, + "learning_rate": 7.551848434414426e-05, + "loss": 0.17007397413253783, + "step": 57780 + }, + { + "epoch": 0.24810454822561673, + "grad_norm": 1.5910160541534424, + "learning_rate": 7.551417262402664e-05, + "loss": 0.20665650367736815, + "step": 57790 + }, + { + "epoch": 0.24814748031563671, + "grad_norm": 0.00580306351184845, + "learning_rate": 7.550986090390902e-05, + "loss": 0.026889517903327942, + "step": 57800 + }, + { + "epoch": 0.24819041240565673, + "grad_norm": 0.003274303860962391, + "learning_rate": 7.55055491837914e-05, + "loss": 0.3093209981918335, + "step": 57810 + }, + { + "epoch": 0.24823334449567674, + "grad_norm": 0.10975901782512665, + "learning_rate": 7.550123746367376e-05, + "loss": 0.06779348254203796, + "step": 57820 + }, + { + "epoch": 0.24827627658569673, + "grad_norm": 0.02362568862736225, + "learning_rate": 7.549692574355614e-05, + "loss": 0.16981412172317506, + "step": 57830 + }, + { + "epoch": 0.24831920867571675, + "grad_norm": 0.4983203113079071, + "learning_rate": 7.549261402343851e-05, + "loss": 0.21854088306427003, + "step": 57840 + }, + { + "epoch": 0.24836214076573676, + "grad_norm": 2.061974287033081, + "learning_rate": 7.548830230332089e-05, + "loss": 0.21698181629180907, + "step": 57850 + }, + { + "epoch": 0.24840507285575678, + "grad_norm": 0.048223454505205154, + "learning_rate": 7.548399058320327e-05, + "loss": 0.12005207538604737, + "step": 57860 + }, + { + "epoch": 0.24844800494577676, + "grad_norm": 1.0992143154144287, + "learning_rate": 7.547967886308565e-05, + "loss": 0.23123271465301515, + "step": 57870 + }, + { + "epoch": 0.24849093703579678, + "grad_norm": 1.3549517393112183, + "learning_rate": 7.547536714296802e-05, + "loss": 0.46811304092407224, + "step": 57880 + }, + { + "epoch": 0.2485338691258168, + "grad_norm": 0.07769492268562317, + "learning_rate": 7.54710554228504e-05, + "loss": 0.22488780021667482, + "step": 57890 + }, + { + "epoch": 0.24857680121583678, + "grad_norm": 0.06210146099328995, + "learning_rate": 7.546674370273276e-05, + "loss": 0.3013421058654785, + "step": 57900 + }, + { + "epoch": 0.2486197333058568, + "grad_norm": 0.14759461581707, + "learning_rate": 7.546243198261514e-05, + "loss": 0.1355152726173401, + "step": 57910 + }, + { + "epoch": 0.2486626653958768, + "grad_norm": 1.5669615268707275, + "learning_rate": 7.545812026249752e-05, + "loss": 0.1003315806388855, + "step": 57920 + }, + { + "epoch": 0.2487055974858968, + "grad_norm": 0.0019271537894383073, + "learning_rate": 7.54538085423799e-05, + "loss": 0.17151752710342408, + "step": 57930 + }, + { + "epoch": 0.2487485295759168, + "grad_norm": 0.09990391135215759, + "learning_rate": 7.544949682226227e-05, + "loss": 0.2588262796401978, + "step": 57940 + }, + { + "epoch": 0.24879146166593682, + "grad_norm": 1.4302527904510498, + "learning_rate": 7.544518510214465e-05, + "loss": 0.17463706731796264, + "step": 57950 + }, + { + "epoch": 0.24883439375595684, + "grad_norm": 0.0026034568436443806, + "learning_rate": 7.544087338202703e-05, + "loss": 0.11903560161590576, + "step": 57960 + }, + { + "epoch": 0.24887732584597683, + "grad_norm": 0.02368382178246975, + "learning_rate": 7.54365616619094e-05, + "loss": 0.1572781801223755, + "step": 57970 + }, + { + "epoch": 0.24892025793599684, + "grad_norm": 0.5909692645072937, + "learning_rate": 7.543224994179178e-05, + "loss": 0.22923357486724855, + "step": 57980 + }, + { + "epoch": 0.24896319002601686, + "grad_norm": 0.1352832168340683, + "learning_rate": 7.542793822167416e-05, + "loss": 0.5410185813903808, + "step": 57990 + }, + { + "epoch": 0.24900612211603684, + "grad_norm": 0.6172003746032715, + "learning_rate": 7.542362650155654e-05, + "loss": 0.4828122615814209, + "step": 58000 + }, + { + "epoch": 0.24900612211603684, + "eval_loss": 0.43727967143058777, + "eval_runtime": 27.5929, + "eval_samples_per_second": 3.624, + "eval_steps_per_second": 3.624, + "step": 58000 + }, + { + "epoch": 0.24904905420605686, + "grad_norm": 1.3846662044525146, + "learning_rate": 7.541931478143892e-05, + "loss": 0.2785909414291382, + "step": 58010 + }, + { + "epoch": 0.24909198629607687, + "grad_norm": 1.3835780620574951, + "learning_rate": 7.541500306132129e-05, + "loss": 0.485788631439209, + "step": 58020 + }, + { + "epoch": 0.24913491838609686, + "grad_norm": 0.17436742782592773, + "learning_rate": 7.541069134120367e-05, + "loss": 0.19860415458679198, + "step": 58030 + }, + { + "epoch": 0.24917785047611687, + "grad_norm": 0.09022471308708191, + "learning_rate": 7.540637962108605e-05, + "loss": 0.11170189380645752, + "step": 58040 + }, + { + "epoch": 0.2492207825661369, + "grad_norm": 0.00697419373318553, + "learning_rate": 7.540206790096842e-05, + "loss": 0.27677962779998777, + "step": 58050 + }, + { + "epoch": 0.2492637146561569, + "grad_norm": 0.30201953649520874, + "learning_rate": 7.539775618085079e-05, + "loss": 0.07165834903717042, + "step": 58060 + }, + { + "epoch": 0.2493066467461769, + "grad_norm": 1.4676119089126587, + "learning_rate": 7.539344446073317e-05, + "loss": 0.29705684185028075, + "step": 58070 + }, + { + "epoch": 0.2493495788361969, + "grad_norm": 0.36478012800216675, + "learning_rate": 7.538913274061554e-05, + "loss": 0.25623486042022703, + "step": 58080 + }, + { + "epoch": 0.24939251092621692, + "grad_norm": 0.09869907796382904, + "learning_rate": 7.538482102049792e-05, + "loss": 0.40760035514831544, + "step": 58090 + }, + { + "epoch": 0.2494354430162369, + "grad_norm": 1.5321518182754517, + "learning_rate": 7.53805093003803e-05, + "loss": 0.3262056827545166, + "step": 58100 + }, + { + "epoch": 0.24947837510625692, + "grad_norm": 0.03607706353068352, + "learning_rate": 7.537619758026268e-05, + "loss": 0.1874048590660095, + "step": 58110 + }, + { + "epoch": 0.24952130719627694, + "grad_norm": 0.04087758809328079, + "learning_rate": 7.537188586014505e-05, + "loss": 0.24982888698577882, + "step": 58120 + }, + { + "epoch": 0.24956423928629692, + "grad_norm": 2.5731465816497803, + "learning_rate": 7.536757414002743e-05, + "loss": 0.12112574577331543, + "step": 58130 + }, + { + "epoch": 0.24960717137631694, + "grad_norm": 0.34729182720184326, + "learning_rate": 7.53632624199098e-05, + "loss": 0.3160462141036987, + "step": 58140 + }, + { + "epoch": 0.24965010346633695, + "grad_norm": 0.011773771606385708, + "learning_rate": 7.535895069979217e-05, + "loss": 0.121714186668396, + "step": 58150 + }, + { + "epoch": 0.24969303555635697, + "grad_norm": 0.1527256816625595, + "learning_rate": 7.535463897967455e-05, + "loss": 0.2888143301010132, + "step": 58160 + }, + { + "epoch": 0.24973596764637696, + "grad_norm": 3.5751779079437256, + "learning_rate": 7.535032725955693e-05, + "loss": 0.14579278230667114, + "step": 58170 + }, + { + "epoch": 0.24977889973639697, + "grad_norm": 0.0369403250515461, + "learning_rate": 7.53460155394393e-05, + "loss": 0.14296526908874513, + "step": 58180 + }, + { + "epoch": 0.24982183182641698, + "grad_norm": 0.06302861869335175, + "learning_rate": 7.534170381932168e-05, + "loss": 0.3374220848083496, + "step": 58190 + }, + { + "epoch": 0.24986476391643697, + "grad_norm": 1.6855621337890625, + "learning_rate": 7.533739209920406e-05, + "loss": 0.4434662818908691, + "step": 58200 + }, + { + "epoch": 0.249907696006457, + "grad_norm": 0.0061494093388319016, + "learning_rate": 7.533308037908644e-05, + "loss": 0.16283658742904664, + "step": 58210 + }, + { + "epoch": 0.249950628096477, + "grad_norm": 0.04007069393992424, + "learning_rate": 7.532876865896881e-05, + "loss": 0.08710308074951172, + "step": 58220 + }, + { + "epoch": 0.249993560186497, + "grad_norm": 0.1617412120103836, + "learning_rate": 7.532445693885119e-05, + "loss": 0.3287935256958008, + "step": 58230 + }, + { + "epoch": 0.25003649227651703, + "grad_norm": 0.0027142164763063192, + "learning_rate": 7.532014521873357e-05, + "loss": 0.07832266092300415, + "step": 58240 + }, + { + "epoch": 0.250079424366537, + "grad_norm": 0.02477000653743744, + "learning_rate": 7.531583349861594e-05, + "loss": 0.17995315790176392, + "step": 58250 + }, + { + "epoch": 0.250122356456557, + "grad_norm": 0.25469478964805603, + "learning_rate": 7.531152177849832e-05, + "loss": 0.439362907409668, + "step": 58260 + }, + { + "epoch": 0.25016528854657705, + "grad_norm": 0.14019158482551575, + "learning_rate": 7.53072100583807e-05, + "loss": 0.160223650932312, + "step": 58270 + }, + { + "epoch": 0.25020822063659703, + "grad_norm": 0.09695712476968765, + "learning_rate": 7.530289833826308e-05, + "loss": 0.15709249973297118, + "step": 58280 + }, + { + "epoch": 0.250251152726617, + "grad_norm": 1.4217171669006348, + "learning_rate": 7.529858661814545e-05, + "loss": 0.1981913208961487, + "step": 58290 + }, + { + "epoch": 0.25029408481663706, + "grad_norm": 0.00611657602712512, + "learning_rate": 7.529427489802783e-05, + "loss": 0.16728440523147584, + "step": 58300 + }, + { + "epoch": 0.25033701690665705, + "grad_norm": 0.018936268985271454, + "learning_rate": 7.52899631779102e-05, + "loss": 0.11950817108154296, + "step": 58310 + }, + { + "epoch": 0.25037994899667704, + "grad_norm": 0.23093880712985992, + "learning_rate": 7.528565145779257e-05, + "loss": 0.1709181547164917, + "step": 58320 + }, + { + "epoch": 0.2504228810866971, + "grad_norm": 2.0191850662231445, + "learning_rate": 7.528133973767495e-05, + "loss": 0.3569988965988159, + "step": 58330 + }, + { + "epoch": 0.25046581317671707, + "grad_norm": 7.1040472984313965, + "learning_rate": 7.527702801755733e-05, + "loss": 0.15479474067687987, + "step": 58340 + }, + { + "epoch": 0.25050874526673705, + "grad_norm": 2.3190762996673584, + "learning_rate": 7.52727162974397e-05, + "loss": 0.07152878642082214, + "step": 58350 + }, + { + "epoch": 0.2505516773567571, + "grad_norm": 3.2260525226593018, + "learning_rate": 7.526840457732208e-05, + "loss": 0.17637765407562256, + "step": 58360 + }, + { + "epoch": 0.2505946094467771, + "grad_norm": 0.09935518354177475, + "learning_rate": 7.526409285720446e-05, + "loss": 0.29577863216400146, + "step": 58370 + }, + { + "epoch": 0.25063754153679707, + "grad_norm": 8.207350730895996, + "learning_rate": 7.525978113708684e-05, + "loss": 0.3453080177307129, + "step": 58380 + }, + { + "epoch": 0.2506804736268171, + "grad_norm": 0.001084555173292756, + "learning_rate": 7.52554694169692e-05, + "loss": 0.18551015853881836, + "step": 58390 + }, + { + "epoch": 0.2507234057168371, + "grad_norm": 0.03261413797736168, + "learning_rate": 7.525115769685158e-05, + "loss": 0.18724746704101564, + "step": 58400 + }, + { + "epoch": 0.2507663378068571, + "grad_norm": 0.0074689267203211784, + "learning_rate": 7.524684597673395e-05, + "loss": 0.07106940746307373, + "step": 58410 + }, + { + "epoch": 0.25080926989687713, + "grad_norm": 0.023070115596055984, + "learning_rate": 7.524253425661633e-05, + "loss": 0.1185571312904358, + "step": 58420 + }, + { + "epoch": 0.2508522019868971, + "grad_norm": 0.015555283054709435, + "learning_rate": 7.523822253649871e-05, + "loss": 0.2828044891357422, + "step": 58430 + }, + { + "epoch": 0.25089513407691716, + "grad_norm": 0.016580946743488312, + "learning_rate": 7.523391081638109e-05, + "loss": 0.2243107318878174, + "step": 58440 + }, + { + "epoch": 0.25093806616693715, + "grad_norm": 0.6320114135742188, + "learning_rate": 7.522959909626346e-05, + "loss": 0.14534295797348024, + "step": 58450 + }, + { + "epoch": 0.25098099825695713, + "grad_norm": 0.019783420488238335, + "learning_rate": 7.522528737614584e-05, + "loss": 0.08147812485694886, + "step": 58460 + }, + { + "epoch": 0.2510239303469772, + "grad_norm": 0.048422615975141525, + "learning_rate": 7.522097565602822e-05, + "loss": 0.19120093584060668, + "step": 58470 + }, + { + "epoch": 0.25106686243699716, + "grad_norm": 0.12801282107830048, + "learning_rate": 7.52166639359106e-05, + "loss": 0.17279950380325318, + "step": 58480 + }, + { + "epoch": 0.25110979452701715, + "grad_norm": 1.5858972072601318, + "learning_rate": 7.521235221579297e-05, + "loss": 0.48514652252197266, + "step": 58490 + }, + { + "epoch": 0.2511527266170372, + "grad_norm": 0.030733229592442513, + "learning_rate": 7.520804049567535e-05, + "loss": 0.30051817893981936, + "step": 58500 + }, + { + "epoch": 0.2511956587070572, + "grad_norm": 1.0026241540908813, + "learning_rate": 7.520372877555773e-05, + "loss": 0.1893421769142151, + "step": 58510 + }, + { + "epoch": 0.25123859079707717, + "grad_norm": 0.048215437680482864, + "learning_rate": 7.51994170554401e-05, + "loss": 0.22662749290466308, + "step": 58520 + }, + { + "epoch": 0.2512815228870972, + "grad_norm": 0.38019275665283203, + "learning_rate": 7.519510533532248e-05, + "loss": 0.3480359077453613, + "step": 58530 + }, + { + "epoch": 0.2513244549771172, + "grad_norm": 1.5052499771118164, + "learning_rate": 7.519079361520486e-05, + "loss": 0.37305445671081544, + "step": 58540 + }, + { + "epoch": 0.2513673870671372, + "grad_norm": 0.2208074927330017, + "learning_rate": 7.518648189508724e-05, + "loss": 0.16929982900619506, + "step": 58550 + }, + { + "epoch": 0.2514103191571572, + "grad_norm": 0.006902139633893967, + "learning_rate": 7.51821701749696e-05, + "loss": 0.2761898756027222, + "step": 58560 + }, + { + "epoch": 0.2514532512471772, + "grad_norm": 1.5039210319519043, + "learning_rate": 7.517785845485198e-05, + "loss": 0.2421489715576172, + "step": 58570 + }, + { + "epoch": 0.2514961833371972, + "grad_norm": 0.40178382396698, + "learning_rate": 7.517354673473436e-05, + "loss": 0.1107286810874939, + "step": 58580 + }, + { + "epoch": 0.25153911542721724, + "grad_norm": 1.3667283058166504, + "learning_rate": 7.516923501461673e-05, + "loss": 0.4685997009277344, + "step": 58590 + }, + { + "epoch": 0.25158204751723723, + "grad_norm": 0.09705159813165665, + "learning_rate": 7.516492329449911e-05, + "loss": 0.2425306797027588, + "step": 58600 + }, + { + "epoch": 0.2516249796072572, + "grad_norm": 2.8658628463745117, + "learning_rate": 7.516061157438149e-05, + "loss": 0.18263787031173706, + "step": 58610 + }, + { + "epoch": 0.25166791169727726, + "grad_norm": 1.7215452194213867, + "learning_rate": 7.515629985426387e-05, + "loss": 0.07476306557655335, + "step": 58620 + }, + { + "epoch": 0.25171084378729724, + "grad_norm": 2.6153411865234375, + "learning_rate": 7.515198813414624e-05, + "loss": 0.2928791522979736, + "step": 58630 + }, + { + "epoch": 0.25175377587731723, + "grad_norm": 1.717194676399231, + "learning_rate": 7.51476764140286e-05, + "loss": 0.22246553897857665, + "step": 58640 + }, + { + "epoch": 0.2517967079673373, + "grad_norm": 1.2184993028640747, + "learning_rate": 7.514336469391098e-05, + "loss": 0.24329085350036622, + "step": 58650 + }, + { + "epoch": 0.25183964005735726, + "grad_norm": 0.03462841361761093, + "learning_rate": 7.513905297379336e-05, + "loss": 0.15679537057876586, + "step": 58660 + }, + { + "epoch": 0.2518825721473773, + "grad_norm": 0.11113351583480835, + "learning_rate": 7.513474125367574e-05, + "loss": 0.3553584575653076, + "step": 58670 + }, + { + "epoch": 0.2519255042373973, + "grad_norm": 0.03636935353279114, + "learning_rate": 7.513042953355812e-05, + "loss": 0.2889517307281494, + "step": 58680 + }, + { + "epoch": 0.2519684363274173, + "grad_norm": 0.5163435935974121, + "learning_rate": 7.51261178134405e-05, + "loss": 0.23903121948242187, + "step": 58690 + }, + { + "epoch": 0.2520113684174373, + "grad_norm": 0.11025349795818329, + "learning_rate": 7.512180609332287e-05, + "loss": 0.15377535820007324, + "step": 58700 + }, + { + "epoch": 0.2520543005074573, + "grad_norm": 0.07065358012914658, + "learning_rate": 7.511749437320526e-05, + "loss": 0.2515211820602417, + "step": 58710 + }, + { + "epoch": 0.2520972325974773, + "grad_norm": 0.03804844617843628, + "learning_rate": 7.511318265308763e-05, + "loss": 0.3007489204406738, + "step": 58720 + }, + { + "epoch": 0.25214016468749734, + "grad_norm": 0.008624300360679626, + "learning_rate": 7.510887093297e-05, + "loss": 0.3628600835800171, + "step": 58730 + }, + { + "epoch": 0.2521830967775173, + "grad_norm": 0.08319579809904099, + "learning_rate": 7.510455921285238e-05, + "loss": 0.25749032497406005, + "step": 58740 + }, + { + "epoch": 0.2522260288675373, + "grad_norm": 0.9691449403762817, + "learning_rate": 7.510024749273476e-05, + "loss": 0.2763275384902954, + "step": 58750 + }, + { + "epoch": 0.25226896095755735, + "grad_norm": 0.08982349932193756, + "learning_rate": 7.509593577261713e-05, + "loss": 0.313446044921875, + "step": 58760 + }, + { + "epoch": 0.25231189304757734, + "grad_norm": 8.64424991607666, + "learning_rate": 7.509162405249951e-05, + "loss": 0.13973323106765748, + "step": 58770 + }, + { + "epoch": 0.2523548251375973, + "grad_norm": 0.08145991712808609, + "learning_rate": 7.508731233238189e-05, + "loss": 0.3383770227432251, + "step": 58780 + }, + { + "epoch": 0.25239775722761737, + "grad_norm": 1.739539623260498, + "learning_rate": 7.508300061226427e-05, + "loss": 0.3257728099822998, + "step": 58790 + }, + { + "epoch": 0.25244068931763736, + "grad_norm": 0.14261537790298462, + "learning_rate": 7.507868889214663e-05, + "loss": 0.3546148300170898, + "step": 58800 + }, + { + "epoch": 0.25248362140765734, + "grad_norm": 0.45535334944725037, + "learning_rate": 7.507437717202901e-05, + "loss": 0.1574306845664978, + "step": 58810 + }, + { + "epoch": 0.2525265534976774, + "grad_norm": 12.764909744262695, + "learning_rate": 7.507006545191139e-05, + "loss": 0.4659184455871582, + "step": 58820 + }, + { + "epoch": 0.2525694855876974, + "grad_norm": 0.034577127546072006, + "learning_rate": 7.506575373179376e-05, + "loss": 0.23389551639556885, + "step": 58830 + }, + { + "epoch": 0.25261241767771736, + "grad_norm": 0.021863384172320366, + "learning_rate": 7.506144201167614e-05, + "loss": 0.3838773012161255, + "step": 58840 + }, + { + "epoch": 0.2526553497677374, + "grad_norm": 1.3971561193466187, + "learning_rate": 7.505713029155852e-05, + "loss": 0.3844272613525391, + "step": 58850 + }, + { + "epoch": 0.2526982818577574, + "grad_norm": 1.2504379749298096, + "learning_rate": 7.50528185714409e-05, + "loss": 0.214629602432251, + "step": 58860 + }, + { + "epoch": 0.25274121394777743, + "grad_norm": 0.005005622748285532, + "learning_rate": 7.504850685132327e-05, + "loss": 0.1708429217338562, + "step": 58870 + }, + { + "epoch": 0.2527841460377974, + "grad_norm": 1.2264432907104492, + "learning_rate": 7.504419513120564e-05, + "loss": 0.245497465133667, + "step": 58880 + }, + { + "epoch": 0.2528270781278174, + "grad_norm": 0.3678496181964874, + "learning_rate": 7.503988341108801e-05, + "loss": 0.05550463199615478, + "step": 58890 + }, + { + "epoch": 0.25287001021783745, + "grad_norm": 0.07211752980947495, + "learning_rate": 7.503557169097039e-05, + "loss": 0.15995752811431885, + "step": 58900 + }, + { + "epoch": 0.25291294230785744, + "grad_norm": 0.1209837943315506, + "learning_rate": 7.503125997085277e-05, + "loss": 0.26348142623901366, + "step": 58910 + }, + { + "epoch": 0.2529558743978774, + "grad_norm": 0.011889472603797913, + "learning_rate": 7.502694825073515e-05, + "loss": 0.4343246936798096, + "step": 58920 + }, + { + "epoch": 0.25299880648789747, + "grad_norm": 0.04902556538581848, + "learning_rate": 7.502263653061754e-05, + "loss": 0.2803981065750122, + "step": 58930 + }, + { + "epoch": 0.25304173857791745, + "grad_norm": 3.0477848052978516, + "learning_rate": 7.501832481049991e-05, + "loss": 0.17602345943450928, + "step": 58940 + }, + { + "epoch": 0.25308467066793744, + "grad_norm": 2.994198799133301, + "learning_rate": 7.501401309038229e-05, + "loss": 0.2608646869659424, + "step": 58950 + }, + { + "epoch": 0.2531276027579575, + "grad_norm": 0.003419468877837062, + "learning_rate": 7.500970137026467e-05, + "loss": 0.28305621147155763, + "step": 58960 + }, + { + "epoch": 0.25317053484797747, + "grad_norm": 0.018120309337973595, + "learning_rate": 7.500538965014703e-05, + "loss": 0.2792409896850586, + "step": 58970 + }, + { + "epoch": 0.25321346693799746, + "grad_norm": 0.04600166156888008, + "learning_rate": 7.500107793002941e-05, + "loss": 0.06299285888671875, + "step": 58980 + }, + { + "epoch": 0.2532563990280175, + "grad_norm": 0.9423048496246338, + "learning_rate": 7.499676620991179e-05, + "loss": 0.06454428434371948, + "step": 58990 + }, + { + "epoch": 0.2532993311180375, + "grad_norm": 0.00812695175409317, + "learning_rate": 7.499245448979416e-05, + "loss": 0.12127074003219604, + "step": 59000 + }, + { + "epoch": 0.2532993311180375, + "eval_loss": 0.45360976457595825, + "eval_runtime": 27.4239, + "eval_samples_per_second": 3.646, + "eval_steps_per_second": 3.646, + "step": 59000 + }, + { + "epoch": 0.25334226320805747, + "grad_norm": 0.29408156871795654, + "learning_rate": 7.498814276967654e-05, + "loss": 0.18373805284500122, + "step": 59010 + }, + { + "epoch": 0.2533851952980775, + "grad_norm": 0.03120812214910984, + "learning_rate": 7.498383104955892e-05, + "loss": 0.2167065382003784, + "step": 59020 + }, + { + "epoch": 0.2534281273880975, + "grad_norm": 0.0006006321054883301, + "learning_rate": 7.49795193294413e-05, + "loss": 0.13845841884613036, + "step": 59030 + }, + { + "epoch": 0.2534710594781175, + "grad_norm": 0.0009152772836387157, + "learning_rate": 7.497520760932367e-05, + "loss": 0.19261682033538818, + "step": 59040 + }, + { + "epoch": 0.25351399156813753, + "grad_norm": 0.014247381128370762, + "learning_rate": 7.497089588920604e-05, + "loss": 0.3147283554077148, + "step": 59050 + }, + { + "epoch": 0.2535569236581575, + "grad_norm": 1.0784889459609985, + "learning_rate": 7.496658416908841e-05, + "loss": 0.201631760597229, + "step": 59060 + }, + { + "epoch": 0.2535998557481775, + "grad_norm": 0.0418044738471508, + "learning_rate": 7.496227244897079e-05, + "loss": 0.06878702044486999, + "step": 59070 + }, + { + "epoch": 0.25364278783819755, + "grad_norm": 0.021921556442975998, + "learning_rate": 7.495796072885317e-05, + "loss": 0.22168614864349365, + "step": 59080 + }, + { + "epoch": 0.25368571992821753, + "grad_norm": 0.06008792296051979, + "learning_rate": 7.495364900873555e-05, + "loss": 0.13947556018829346, + "step": 59090 + }, + { + "epoch": 0.2537286520182376, + "grad_norm": 0.0034183943644165993, + "learning_rate": 7.494933728861792e-05, + "loss": 0.14371013641357422, + "step": 59100 + }, + { + "epoch": 0.25377158410825756, + "grad_norm": 2.4520370960235596, + "learning_rate": 7.49450255685003e-05, + "loss": 0.2792895078659058, + "step": 59110 + }, + { + "epoch": 0.25381451619827755, + "grad_norm": 0.6797070503234863, + "learning_rate": 7.494071384838268e-05, + "loss": 0.285917854309082, + "step": 59120 + }, + { + "epoch": 0.2538574482882976, + "grad_norm": 1.3992570638656616, + "learning_rate": 7.493640212826504e-05, + "loss": 0.13060109615325927, + "step": 59130 + }, + { + "epoch": 0.2539003803783176, + "grad_norm": 55.88208770751953, + "learning_rate": 7.493209040814742e-05, + "loss": 0.2650261402130127, + "step": 59140 + }, + { + "epoch": 0.25394331246833757, + "grad_norm": 0.7198887467384338, + "learning_rate": 7.492777868802981e-05, + "loss": 0.14854855537414552, + "step": 59150 + }, + { + "epoch": 0.2539862445583576, + "grad_norm": 0.0003937442961614579, + "learning_rate": 7.492346696791219e-05, + "loss": 0.033887633681297304, + "step": 59160 + }, + { + "epoch": 0.2540291766483776, + "grad_norm": 0.0008705161744728684, + "learning_rate": 7.491915524779457e-05, + "loss": 0.15312042236328124, + "step": 59170 + }, + { + "epoch": 0.2540721087383976, + "grad_norm": 2.6492867469787598, + "learning_rate": 7.491484352767694e-05, + "loss": 0.3735518455505371, + "step": 59180 + }, + { + "epoch": 0.2541150408284176, + "grad_norm": 0.0005868783337064087, + "learning_rate": 7.491053180755932e-05, + "loss": 0.28540782928466796, + "step": 59190 + }, + { + "epoch": 0.2541579729184376, + "grad_norm": 2.3117873668670654, + "learning_rate": 7.49062200874417e-05, + "loss": 0.15944833755493165, + "step": 59200 + }, + { + "epoch": 0.2542009050084576, + "grad_norm": 0.009221755899488926, + "learning_rate": 7.490190836732406e-05, + "loss": 0.29479308128356935, + "step": 59210 + }, + { + "epoch": 0.25424383709847764, + "grad_norm": 0.49922358989715576, + "learning_rate": 7.489759664720644e-05, + "loss": 0.2660544395446777, + "step": 59220 + }, + { + "epoch": 0.25428676918849763, + "grad_norm": 0.21182771027088165, + "learning_rate": 7.489328492708882e-05, + "loss": 0.1434078812599182, + "step": 59230 + }, + { + "epoch": 0.2543297012785176, + "grad_norm": 0.9136849045753479, + "learning_rate": 7.488897320697119e-05, + "loss": 0.29035928249359133, + "step": 59240 + }, + { + "epoch": 0.25437263336853766, + "grad_norm": 0.4379677474498749, + "learning_rate": 7.488466148685357e-05, + "loss": 0.19489789009094238, + "step": 59250 + }, + { + "epoch": 0.25441556545855765, + "grad_norm": 0.1081492155790329, + "learning_rate": 7.488034976673595e-05, + "loss": 0.2349034309387207, + "step": 59260 + }, + { + "epoch": 0.25445849754857763, + "grad_norm": 1.3798065185546875, + "learning_rate": 7.487603804661833e-05, + "loss": 0.38103649616241453, + "step": 59270 + }, + { + "epoch": 0.2545014296385977, + "grad_norm": 0.09544280916452408, + "learning_rate": 7.48717263265007e-05, + "loss": 0.16104166507720946, + "step": 59280 + }, + { + "epoch": 0.25454436172861766, + "grad_norm": 0.009478382766246796, + "learning_rate": 7.486741460638308e-05, + "loss": 0.12755447626113892, + "step": 59290 + }, + { + "epoch": 0.2545872938186377, + "grad_norm": 0.04885256290435791, + "learning_rate": 7.486310288626544e-05, + "loss": 0.1894428014755249, + "step": 59300 + }, + { + "epoch": 0.2546302259086577, + "grad_norm": 0.1674768328666687, + "learning_rate": 7.485879116614782e-05, + "loss": 0.18973815441131592, + "step": 59310 + }, + { + "epoch": 0.2546731579986777, + "grad_norm": 0.018817557021975517, + "learning_rate": 7.48544794460302e-05, + "loss": 0.13941391706466674, + "step": 59320 + }, + { + "epoch": 0.2547160900886977, + "grad_norm": 0.09732145071029663, + "learning_rate": 7.485016772591258e-05, + "loss": 0.17832938432693482, + "step": 59330 + }, + { + "epoch": 0.2547590221787177, + "grad_norm": 0.10007254779338837, + "learning_rate": 7.484585600579495e-05, + "loss": 0.2736708402633667, + "step": 59340 + }, + { + "epoch": 0.2548019542687377, + "grad_norm": 1.3420660495758057, + "learning_rate": 7.484154428567733e-05, + "loss": 0.11637892723083496, + "step": 59350 + }, + { + "epoch": 0.25484488635875774, + "grad_norm": 1.2482755184173584, + "learning_rate": 7.483723256555971e-05, + "loss": 0.24099578857421874, + "step": 59360 + }, + { + "epoch": 0.2548878184487777, + "grad_norm": 0.0030424906872212887, + "learning_rate": 7.483292084544208e-05, + "loss": 0.40054893493652344, + "step": 59370 + }, + { + "epoch": 0.2549307505387977, + "grad_norm": 0.15289510786533356, + "learning_rate": 7.482860912532446e-05, + "loss": 0.27155065536499023, + "step": 59380 + }, + { + "epoch": 0.25497368262881775, + "grad_norm": 0.15874122083187103, + "learning_rate": 7.482429740520684e-05, + "loss": 0.3292946100234985, + "step": 59390 + }, + { + "epoch": 0.25501661471883774, + "grad_norm": 0.7041093707084656, + "learning_rate": 7.481998568508922e-05, + "loss": 0.09723674654960632, + "step": 59400 + }, + { + "epoch": 0.25505954680885773, + "grad_norm": 0.03337360545992851, + "learning_rate": 7.48156739649716e-05, + "loss": 0.1512210488319397, + "step": 59410 + }, + { + "epoch": 0.25510247889887777, + "grad_norm": 0.47118431329727173, + "learning_rate": 7.481136224485397e-05, + "loss": 0.19009388685226442, + "step": 59420 + }, + { + "epoch": 0.25514541098889776, + "grad_norm": 1.1213189363479614, + "learning_rate": 7.480705052473635e-05, + "loss": 0.32139723300933837, + "step": 59430 + }, + { + "epoch": 0.25518834307891775, + "grad_norm": 0.02947445586323738, + "learning_rate": 7.480273880461873e-05, + "loss": 0.13824256658554077, + "step": 59440 + }, + { + "epoch": 0.2552312751689378, + "grad_norm": 0.2438763827085495, + "learning_rate": 7.47984270845011e-05, + "loss": 0.216261887550354, + "step": 59450 + }, + { + "epoch": 0.2552742072589578, + "grad_norm": 0.01802912726998329, + "learning_rate": 7.479411536438347e-05, + "loss": 0.10054420232772827, + "step": 59460 + }, + { + "epoch": 0.25531713934897776, + "grad_norm": 4.936689376831055, + "learning_rate": 7.478980364426584e-05, + "loss": 0.3198725700378418, + "step": 59470 + }, + { + "epoch": 0.2553600714389978, + "grad_norm": 1.1724637746810913, + "learning_rate": 7.478549192414822e-05, + "loss": 0.22834150791168212, + "step": 59480 + }, + { + "epoch": 0.2554030035290178, + "grad_norm": 0.04114644601941109, + "learning_rate": 7.47811802040306e-05, + "loss": 0.09133874773979186, + "step": 59490 + }, + { + "epoch": 0.2554459356190378, + "grad_norm": 0.24322256445884705, + "learning_rate": 7.477686848391298e-05, + "loss": 0.16889041662216187, + "step": 59500 + }, + { + "epoch": 0.2554888677090578, + "grad_norm": 1.2901346683502197, + "learning_rate": 7.477255676379535e-05, + "loss": 0.04150072932243347, + "step": 59510 + }, + { + "epoch": 0.2555317997990778, + "grad_norm": 4.035179138183594, + "learning_rate": 7.476824504367773e-05, + "loss": 0.2927661418914795, + "step": 59520 + }, + { + "epoch": 0.25557473188909785, + "grad_norm": 4.362427234649658, + "learning_rate": 7.476393332356011e-05, + "loss": 0.24192519187927247, + "step": 59530 + }, + { + "epoch": 0.25561766397911784, + "grad_norm": 0.0030299974605441093, + "learning_rate": 7.475962160344247e-05, + "loss": 0.14141761064529418, + "step": 59540 + }, + { + "epoch": 0.2556605960691378, + "grad_norm": 2.231828451156616, + "learning_rate": 7.475530988332485e-05, + "loss": 0.30825395584106446, + "step": 59550 + }, + { + "epoch": 0.25570352815915787, + "grad_norm": 0.09296996891498566, + "learning_rate": 7.475099816320723e-05, + "loss": 0.25759999752044677, + "step": 59560 + }, + { + "epoch": 0.25574646024917785, + "grad_norm": 0.18211933970451355, + "learning_rate": 7.47466864430896e-05, + "loss": 0.1883184313774109, + "step": 59570 + }, + { + "epoch": 0.25578939233919784, + "grad_norm": 0.07114825397729874, + "learning_rate": 7.474237472297198e-05, + "loss": 0.17066493034362792, + "step": 59580 + }, + { + "epoch": 0.2558323244292179, + "grad_norm": 3.3013527393341064, + "learning_rate": 7.473806300285436e-05, + "loss": 0.35579254627227785, + "step": 59590 + }, + { + "epoch": 0.25587525651923787, + "grad_norm": 0.6940078139305115, + "learning_rate": 7.473375128273674e-05, + "loss": 0.15746339559555053, + "step": 59600 + }, + { + "epoch": 0.25591818860925786, + "grad_norm": 0.4556047320365906, + "learning_rate": 7.472943956261911e-05, + "loss": 0.3222770690917969, + "step": 59610 + }, + { + "epoch": 0.2559611206992779, + "grad_norm": 0.66447913646698, + "learning_rate": 7.472512784250149e-05, + "loss": 0.29715771675109864, + "step": 59620 + }, + { + "epoch": 0.2560040527892979, + "grad_norm": 0.20826716721057892, + "learning_rate": 7.472081612238387e-05, + "loss": 0.3474601745605469, + "step": 59630 + }, + { + "epoch": 0.2560469848793179, + "grad_norm": 0.8093248009681702, + "learning_rate": 7.471650440226625e-05, + "loss": 0.2916031122207642, + "step": 59640 + }, + { + "epoch": 0.2560899169693379, + "grad_norm": 0.1829816699028015, + "learning_rate": 7.471219268214862e-05, + "loss": 0.3677819728851318, + "step": 59650 + }, + { + "epoch": 0.2561328490593579, + "grad_norm": 0.4850645661354065, + "learning_rate": 7.4707880962031e-05, + "loss": 0.20362043380737305, + "step": 59660 + }, + { + "epoch": 0.2561757811493779, + "grad_norm": 1.195241928100586, + "learning_rate": 7.470356924191338e-05, + "loss": 0.22135019302368164, + "step": 59670 + }, + { + "epoch": 0.25621871323939793, + "grad_norm": 1.7356536388397217, + "learning_rate": 7.469925752179576e-05, + "loss": 0.21251988410949707, + "step": 59680 + }, + { + "epoch": 0.2562616453294179, + "grad_norm": 0.22998811304569244, + "learning_rate": 7.469494580167813e-05, + "loss": 0.153911817073822, + "step": 59690 + }, + { + "epoch": 0.2563045774194379, + "grad_norm": 10.037912368774414, + "learning_rate": 7.469063408156051e-05, + "loss": 0.23573625087738037, + "step": 59700 + }, + { + "epoch": 0.25634750950945795, + "grad_norm": 0.022693343460559845, + "learning_rate": 7.468632236144287e-05, + "loss": 0.19555919170379638, + "step": 59710 + }, + { + "epoch": 0.25639044159947794, + "grad_norm": 3.438164710998535, + "learning_rate": 7.468201064132525e-05, + "loss": 0.2517883539199829, + "step": 59720 + }, + { + "epoch": 0.256433373689498, + "grad_norm": 0.8784910440444946, + "learning_rate": 7.467769892120763e-05, + "loss": 0.14495769739151002, + "step": 59730 + }, + { + "epoch": 0.25647630577951797, + "grad_norm": 0.06742941588163376, + "learning_rate": 7.467338720109e-05, + "loss": 0.08876391649246215, + "step": 59740 + }, + { + "epoch": 0.25651923786953795, + "grad_norm": 0.14247381687164307, + "learning_rate": 7.466907548097238e-05, + "loss": 0.07637916207313537, + "step": 59750 + }, + { + "epoch": 0.256562169959558, + "grad_norm": 0.00508470181375742, + "learning_rate": 7.466476376085476e-05, + "loss": 0.0826393187046051, + "step": 59760 + }, + { + "epoch": 0.256605102049578, + "grad_norm": 0.02827954664826393, + "learning_rate": 7.466045204073714e-05, + "loss": 0.16171613931655884, + "step": 59770 + }, + { + "epoch": 0.25664803413959797, + "grad_norm": 0.035804182291030884, + "learning_rate": 7.465614032061952e-05, + "loss": 0.12921042442321778, + "step": 59780 + }, + { + "epoch": 0.256690966229618, + "grad_norm": 0.17865890264511108, + "learning_rate": 7.465182860050188e-05, + "loss": 0.13270853757858275, + "step": 59790 + }, + { + "epoch": 0.256733898319638, + "grad_norm": 0.003751277457922697, + "learning_rate": 7.464751688038426e-05, + "loss": 0.11085785627365112, + "step": 59800 + }, + { + "epoch": 0.256776830409658, + "grad_norm": 0.4030987620353699, + "learning_rate": 7.464320516026663e-05, + "loss": 0.27253243923187254, + "step": 59810 + }, + { + "epoch": 0.25681976249967803, + "grad_norm": 0.009502650238573551, + "learning_rate": 7.463889344014901e-05, + "loss": 0.1683057427406311, + "step": 59820 + }, + { + "epoch": 0.256862694589698, + "grad_norm": 0.006902703549712896, + "learning_rate": 7.463458172003139e-05, + "loss": 0.2961188077926636, + "step": 59830 + }, + { + "epoch": 0.256905626679718, + "grad_norm": 4.155785083770752, + "learning_rate": 7.463026999991377e-05, + "loss": 0.42710652351379397, + "step": 59840 + }, + { + "epoch": 0.25694855876973804, + "grad_norm": 0.012135523371398449, + "learning_rate": 7.462595827979614e-05, + "loss": 0.0692120611667633, + "step": 59850 + }, + { + "epoch": 0.25699149085975803, + "grad_norm": 0.07463029026985168, + "learning_rate": 7.462164655967852e-05, + "loss": 0.13505473136901855, + "step": 59860 + }, + { + "epoch": 0.257034422949778, + "grad_norm": 1.5088930130004883, + "learning_rate": 7.46173348395609e-05, + "loss": 0.170632004737854, + "step": 59870 + }, + { + "epoch": 0.25707735503979806, + "grad_norm": 1.4531193971633911, + "learning_rate": 7.461302311944328e-05, + "loss": 0.2785643100738525, + "step": 59880 + }, + { + "epoch": 0.25712028712981805, + "grad_norm": 16.77339744567871, + "learning_rate": 7.460871139932565e-05, + "loss": 0.19587944746017455, + "step": 59890 + }, + { + "epoch": 0.25716321921983804, + "grad_norm": 1.1448432207107544, + "learning_rate": 7.460439967920803e-05, + "loss": 0.18592535257339476, + "step": 59900 + }, + { + "epoch": 0.2572061513098581, + "grad_norm": 0.000942026439588517, + "learning_rate": 7.460008795909041e-05, + "loss": 0.28868684768676756, + "step": 59910 + }, + { + "epoch": 0.25724908339987806, + "grad_norm": 1.498420238494873, + "learning_rate": 7.459577623897278e-05, + "loss": 0.3154158115386963, + "step": 59920 + }, + { + "epoch": 0.25729201548989805, + "grad_norm": 0.030521482229232788, + "learning_rate": 7.459146451885516e-05, + "loss": 0.29817702770233157, + "step": 59930 + }, + { + "epoch": 0.2573349475799181, + "grad_norm": 0.9940090179443359, + "learning_rate": 7.458715279873754e-05, + "loss": 0.2779792070388794, + "step": 59940 + }, + { + "epoch": 0.2573778796699381, + "grad_norm": 0.004922129213809967, + "learning_rate": 7.45828410786199e-05, + "loss": 0.26671979427337644, + "step": 59950 + }, + { + "epoch": 0.2574208117599581, + "grad_norm": 0.09344282746315002, + "learning_rate": 7.457852935850228e-05, + "loss": 0.20685970783233643, + "step": 59960 + }, + { + "epoch": 0.2574637438499781, + "grad_norm": 2.032776117324829, + "learning_rate": 7.457421763838466e-05, + "loss": 0.4120779514312744, + "step": 59970 + }, + { + "epoch": 0.2575066759399981, + "grad_norm": 3.692190170288086, + "learning_rate": 7.456990591826704e-05, + "loss": 0.28315205574035646, + "step": 59980 + }, + { + "epoch": 0.25754960803001814, + "grad_norm": 3.3654446601867676, + "learning_rate": 7.456559419814941e-05, + "loss": 0.2883306503295898, + "step": 59990 + }, + { + "epoch": 0.2575925401200381, + "grad_norm": 0.10398758202791214, + "learning_rate": 7.456128247803179e-05, + "loss": 0.22278923988342286, + "step": 60000 + }, + { + "epoch": 0.2575925401200381, + "eval_loss": 0.4334496557712555, + "eval_runtime": 27.6311, + "eval_samples_per_second": 3.619, + "eval_steps_per_second": 3.619, + "step": 60000 + }, + { + "epoch": 0.2576354722100581, + "grad_norm": 0.4824603199958801, + "learning_rate": 7.455697075791417e-05, + "loss": 0.12448277473449706, + "step": 60010 + }, + { + "epoch": 0.25767840430007816, + "grad_norm": 0.1026025265455246, + "learning_rate": 7.455265903779654e-05, + "loss": 0.34478096961975097, + "step": 60020 + }, + { + "epoch": 0.25772133639009814, + "grad_norm": 0.0035014173481613398, + "learning_rate": 7.454834731767892e-05, + "loss": 0.21980509757995606, + "step": 60030 + }, + { + "epoch": 0.25776426848011813, + "grad_norm": 1.0586053133010864, + "learning_rate": 7.454403559756129e-05, + "loss": 0.2351222515106201, + "step": 60040 + }, + { + "epoch": 0.2578072005701382, + "grad_norm": 0.901056706905365, + "learning_rate": 7.453972387744366e-05, + "loss": 0.2706778526306152, + "step": 60050 + }, + { + "epoch": 0.25785013266015816, + "grad_norm": 0.008316353894770145, + "learning_rate": 7.453541215732604e-05, + "loss": 0.16525228023529054, + "step": 60060 + }, + { + "epoch": 0.25789306475017815, + "grad_norm": 0.4059770405292511, + "learning_rate": 7.453110043720842e-05, + "loss": 0.17263598442077638, + "step": 60070 + }, + { + "epoch": 0.2579359968401982, + "grad_norm": 2.582120180130005, + "learning_rate": 7.45267887170908e-05, + "loss": 0.09478598833084106, + "step": 60080 + }, + { + "epoch": 0.2579789289302182, + "grad_norm": 0.0057938783429563046, + "learning_rate": 7.452247699697317e-05, + "loss": 0.2529701471328735, + "step": 60090 + }, + { + "epoch": 0.25802186102023816, + "grad_norm": 0.014004064723849297, + "learning_rate": 7.451816527685555e-05, + "loss": 0.2436042308807373, + "step": 60100 + }, + { + "epoch": 0.2580647931102582, + "grad_norm": 0.13050612807273865, + "learning_rate": 7.451385355673793e-05, + "loss": 0.249086594581604, + "step": 60110 + }, + { + "epoch": 0.2581077252002782, + "grad_norm": 0.011018295772373676, + "learning_rate": 7.45095418366203e-05, + "loss": 0.24939985275268556, + "step": 60120 + }, + { + "epoch": 0.2581506572902982, + "grad_norm": 0.37946784496307373, + "learning_rate": 7.450523011650268e-05, + "loss": 0.08818068504333496, + "step": 60130 + }, + { + "epoch": 0.2581935893803182, + "grad_norm": 0.572830080986023, + "learning_rate": 7.450091839638506e-05, + "loss": 0.1864118456840515, + "step": 60140 + }, + { + "epoch": 0.2582365214703382, + "grad_norm": 0.03537747636437416, + "learning_rate": 7.449660667626744e-05, + "loss": 0.11507842540740967, + "step": 60150 + }, + { + "epoch": 0.25827945356035825, + "grad_norm": 0.003414865816012025, + "learning_rate": 7.449229495614981e-05, + "loss": 0.19977638721466065, + "step": 60160 + }, + { + "epoch": 0.25832238565037824, + "grad_norm": 0.05791240185499191, + "learning_rate": 7.448798323603219e-05, + "loss": 0.09170815348625183, + "step": 60170 + }, + { + "epoch": 0.2583653177403982, + "grad_norm": 0.9761412143707275, + "learning_rate": 7.448367151591457e-05, + "loss": 0.26405317783355714, + "step": 60180 + }, + { + "epoch": 0.25840824983041827, + "grad_norm": 7.960112998262048e-05, + "learning_rate": 7.447935979579695e-05, + "loss": 0.07050980925559998, + "step": 60190 + }, + { + "epoch": 0.25845118192043826, + "grad_norm": 0.0009460471337661147, + "learning_rate": 7.447504807567931e-05, + "loss": 0.1875847339630127, + "step": 60200 + }, + { + "epoch": 0.25849411401045824, + "grad_norm": 0.05905630439519882, + "learning_rate": 7.447073635556169e-05, + "loss": 0.07909368872642517, + "step": 60210 + }, + { + "epoch": 0.2585370461004783, + "grad_norm": 4.739538192749023, + "learning_rate": 7.446642463544406e-05, + "loss": 0.327526330947876, + "step": 60220 + }, + { + "epoch": 0.25857997819049827, + "grad_norm": 1.2322735786437988, + "learning_rate": 7.446211291532644e-05, + "loss": 0.6427930355072021, + "step": 60230 + }, + { + "epoch": 0.25862291028051826, + "grad_norm": 0.22826102375984192, + "learning_rate": 7.445780119520882e-05, + "loss": 0.23440871238708497, + "step": 60240 + }, + { + "epoch": 0.2586658423705383, + "grad_norm": 0.3581126928329468, + "learning_rate": 7.44534894750912e-05, + "loss": 0.11961183547973633, + "step": 60250 + }, + { + "epoch": 0.2587087744605583, + "grad_norm": 0.009623724035918713, + "learning_rate": 7.444917775497357e-05, + "loss": 0.24155714511871337, + "step": 60260 + }, + { + "epoch": 0.2587517065505783, + "grad_norm": 0.11129165440797806, + "learning_rate": 7.444486603485595e-05, + "loss": 0.12745927572250365, + "step": 60270 + }, + { + "epoch": 0.2587946386405983, + "grad_norm": 0.0014452520990744233, + "learning_rate": 7.444055431473831e-05, + "loss": 0.1345227837562561, + "step": 60280 + }, + { + "epoch": 0.2588375707306183, + "grad_norm": 1.325457215309143, + "learning_rate": 7.443624259462069e-05, + "loss": 0.21954782009124757, + "step": 60290 + }, + { + "epoch": 0.2588805028206383, + "grad_norm": 3.4132261276245117, + "learning_rate": 7.443193087450307e-05, + "loss": 0.2938798189163208, + "step": 60300 + }, + { + "epoch": 0.25892343491065833, + "grad_norm": 3.204024076461792, + "learning_rate": 7.442761915438545e-05, + "loss": 0.17051490545272827, + "step": 60310 + }, + { + "epoch": 0.2589663670006783, + "grad_norm": 0.0036249810364097357, + "learning_rate": 7.442330743426782e-05, + "loss": 0.18110159635543824, + "step": 60320 + }, + { + "epoch": 0.2590092990906983, + "grad_norm": 0.8694581985473633, + "learning_rate": 7.44189957141502e-05, + "loss": 0.23339805603027344, + "step": 60330 + }, + { + "epoch": 0.25905223118071835, + "grad_norm": 4.196567058563232, + "learning_rate": 7.441468399403259e-05, + "loss": 0.13609391450881958, + "step": 60340 + }, + { + "epoch": 0.25909516327073834, + "grad_norm": 0.347924143075943, + "learning_rate": 7.441037227391497e-05, + "loss": 0.16419798135757446, + "step": 60350 + }, + { + "epoch": 0.2591380953607583, + "grad_norm": 1.3597310781478882, + "learning_rate": 7.440606055379735e-05, + "loss": 0.05138199329376221, + "step": 60360 + }, + { + "epoch": 0.25918102745077837, + "grad_norm": 0.029793528839945793, + "learning_rate": 7.440174883367971e-05, + "loss": 0.16066555976867675, + "step": 60370 + }, + { + "epoch": 0.25922395954079835, + "grad_norm": 0.06812033802270889, + "learning_rate": 7.439743711356209e-05, + "loss": 0.2599786281585693, + "step": 60380 + }, + { + "epoch": 0.2592668916308184, + "grad_norm": 0.000821946538053453, + "learning_rate": 7.439312539344447e-05, + "loss": 0.19980417490005492, + "step": 60390 + }, + { + "epoch": 0.2593098237208384, + "grad_norm": 0.6022235751152039, + "learning_rate": 7.438881367332684e-05, + "loss": 0.21292507648468018, + "step": 60400 + }, + { + "epoch": 0.25935275581085837, + "grad_norm": 0.05035751685500145, + "learning_rate": 7.438450195320922e-05, + "loss": 0.23559396266937255, + "step": 60410 + }, + { + "epoch": 0.2593956879008784, + "grad_norm": 0.026783062145113945, + "learning_rate": 7.43801902330916e-05, + "loss": 0.23798015117645263, + "step": 60420 + }, + { + "epoch": 0.2594386199908984, + "grad_norm": 0.021695852279663086, + "learning_rate": 7.437587851297397e-05, + "loss": 0.07167540788650513, + "step": 60430 + }, + { + "epoch": 0.2594815520809184, + "grad_norm": 4.8020301619544625e-05, + "learning_rate": 7.437156679285635e-05, + "loss": 0.27671382427215574, + "step": 60440 + }, + { + "epoch": 0.25952448417093843, + "grad_norm": 0.06290990114212036, + "learning_rate": 7.436725507273872e-05, + "loss": 0.28073556423187257, + "step": 60450 + }, + { + "epoch": 0.2595674162609584, + "grad_norm": 0.20219306647777557, + "learning_rate": 7.43629433526211e-05, + "loss": 0.028297588229179382, + "step": 60460 + }, + { + "epoch": 0.2596103483509784, + "grad_norm": 1.8177192211151123, + "learning_rate": 7.435863163250347e-05, + "loss": 0.14526848793029784, + "step": 60470 + }, + { + "epoch": 0.25965328044099845, + "grad_norm": 0.676733136177063, + "learning_rate": 7.435431991238585e-05, + "loss": 0.23378782272338866, + "step": 60480 + }, + { + "epoch": 0.25969621253101843, + "grad_norm": 0.136207714676857, + "learning_rate": 7.435000819226823e-05, + "loss": 0.06558563113212586, + "step": 60490 + }, + { + "epoch": 0.2597391446210384, + "grad_norm": 0.0994429960846901, + "learning_rate": 7.43456964721506e-05, + "loss": 0.11308003664016723, + "step": 60500 + }, + { + "epoch": 0.25978207671105846, + "grad_norm": 3.658642530441284, + "learning_rate": 7.434138475203298e-05, + "loss": 0.46092705726623534, + "step": 60510 + }, + { + "epoch": 0.25982500880107845, + "grad_norm": 0.09556927531957626, + "learning_rate": 7.433707303191536e-05, + "loss": 0.359284782409668, + "step": 60520 + }, + { + "epoch": 0.25986794089109844, + "grad_norm": 0.01794356107711792, + "learning_rate": 7.433276131179772e-05, + "loss": 0.22540340423583985, + "step": 60530 + }, + { + "epoch": 0.2599108729811185, + "grad_norm": 6.463069915771484, + "learning_rate": 7.43284495916801e-05, + "loss": 0.1235724925994873, + "step": 60540 + }, + { + "epoch": 0.25995380507113847, + "grad_norm": 0.486515611410141, + "learning_rate": 7.432413787156248e-05, + "loss": 0.2912492513656616, + "step": 60550 + }, + { + "epoch": 0.25999673716115845, + "grad_norm": 0.0728234276175499, + "learning_rate": 7.431982615144487e-05, + "loss": 0.08355991840362549, + "step": 60560 + }, + { + "epoch": 0.2600396692511785, + "grad_norm": 0.006520441733300686, + "learning_rate": 7.431551443132724e-05, + "loss": 0.11683311462402343, + "step": 60570 + }, + { + "epoch": 0.2600826013411985, + "grad_norm": 0.021994153037667274, + "learning_rate": 7.431120271120962e-05, + "loss": 0.21824615001678466, + "step": 60580 + }, + { + "epoch": 0.2601255334312185, + "grad_norm": 0.02452601119875908, + "learning_rate": 7.4306890991092e-05, + "loss": 0.2871460199356079, + "step": 60590 + }, + { + "epoch": 0.2601684655212385, + "grad_norm": 0.002121657133102417, + "learning_rate": 7.430257927097438e-05, + "loss": 0.3002606391906738, + "step": 60600 + }, + { + "epoch": 0.2602113976112585, + "grad_norm": 1.038457989692688, + "learning_rate": 7.429826755085674e-05, + "loss": 0.15184781551361085, + "step": 60610 + }, + { + "epoch": 0.26025432970127854, + "grad_norm": 0.3401985168457031, + "learning_rate": 7.429395583073912e-05, + "loss": 0.24284937381744384, + "step": 60620 + }, + { + "epoch": 0.26029726179129853, + "grad_norm": 1.9231289625167847, + "learning_rate": 7.42896441106215e-05, + "loss": 0.36919641494750977, + "step": 60630 + }, + { + "epoch": 0.2603401938813185, + "grad_norm": 0.0033993495162576437, + "learning_rate": 7.428533239050387e-05, + "loss": 0.1777048110961914, + "step": 60640 + }, + { + "epoch": 0.26038312597133856, + "grad_norm": 0.17801420390605927, + "learning_rate": 7.428102067038625e-05, + "loss": 0.27388036251068115, + "step": 60650 + }, + { + "epoch": 0.26042605806135855, + "grad_norm": 0.0023477845825254917, + "learning_rate": 7.427670895026863e-05, + "loss": 0.24748921394348145, + "step": 60660 + }, + { + "epoch": 0.26046899015137853, + "grad_norm": 8.327670097351074, + "learning_rate": 7.4272397230151e-05, + "loss": 0.21605587005615234, + "step": 60670 + }, + { + "epoch": 0.2605119222413986, + "grad_norm": 0.02114141546189785, + "learning_rate": 7.426808551003338e-05, + "loss": 0.23582861423492432, + "step": 60680 + }, + { + "epoch": 0.26055485433141856, + "grad_norm": 2.584583044052124, + "learning_rate": 7.426377378991575e-05, + "loss": 0.3142396450042725, + "step": 60690 + }, + { + "epoch": 0.26059778642143855, + "grad_norm": 0.015979178249835968, + "learning_rate": 7.425946206979812e-05, + "loss": 0.38975841999053956, + "step": 60700 + }, + { + "epoch": 0.2606407185114586, + "grad_norm": 0.07706556469202042, + "learning_rate": 7.42551503496805e-05, + "loss": 0.26714980602264404, + "step": 60710 + }, + { + "epoch": 0.2606836506014786, + "grad_norm": 0.3661552369594574, + "learning_rate": 7.425083862956288e-05, + "loss": 0.17433593273162842, + "step": 60720 + }, + { + "epoch": 0.26072658269149857, + "grad_norm": 0.9815655946731567, + "learning_rate": 7.424652690944525e-05, + "loss": 0.33178982734680174, + "step": 60730 + }, + { + "epoch": 0.2607695147815186, + "grad_norm": 2.396820068359375, + "learning_rate": 7.424221518932763e-05, + "loss": 0.16736443042755128, + "step": 60740 + }, + { + "epoch": 0.2608124468715386, + "grad_norm": 0.06286032497882843, + "learning_rate": 7.423790346921001e-05, + "loss": 0.19664554595947265, + "step": 60750 + }, + { + "epoch": 0.2608553789615586, + "grad_norm": 0.01892467401921749, + "learning_rate": 7.423359174909239e-05, + "loss": 0.06982214450836181, + "step": 60760 + }, + { + "epoch": 0.2608983110515786, + "grad_norm": 0.031845398247241974, + "learning_rate": 7.422928002897476e-05, + "loss": 0.21323072910308838, + "step": 60770 + }, + { + "epoch": 0.2609412431415986, + "grad_norm": 0.0011438673827797174, + "learning_rate": 7.422496830885714e-05, + "loss": 0.1070354700088501, + "step": 60780 + }, + { + "epoch": 0.2609841752316186, + "grad_norm": 0.015031355433166027, + "learning_rate": 7.422065658873952e-05, + "loss": 0.16533334255218507, + "step": 60790 + }, + { + "epoch": 0.26102710732163864, + "grad_norm": 0.2482471913099289, + "learning_rate": 7.42163448686219e-05, + "loss": 0.17978440523147582, + "step": 60800 + }, + { + "epoch": 0.2610700394116586, + "grad_norm": 0.02777029201388359, + "learning_rate": 7.421203314850427e-05, + "loss": 0.16307601928710938, + "step": 60810 + }, + { + "epoch": 0.26111297150167867, + "grad_norm": 0.7483030557632446, + "learning_rate": 7.420772142838665e-05, + "loss": 0.1298598289489746, + "step": 60820 + }, + { + "epoch": 0.26115590359169866, + "grad_norm": 0.051577258855104446, + "learning_rate": 7.420340970826903e-05, + "loss": 0.23332612514495848, + "step": 60830 + }, + { + "epoch": 0.26119883568171864, + "grad_norm": 0.011332403868436813, + "learning_rate": 7.41990979881514e-05, + "loss": 0.21819396018981935, + "step": 60840 + }, + { + "epoch": 0.2612417677717387, + "grad_norm": 0.01872512884438038, + "learning_rate": 7.419478626803378e-05, + "loss": 0.12530485391616822, + "step": 60850 + }, + { + "epoch": 0.2612846998617587, + "grad_norm": 5.371342182159424, + "learning_rate": 7.419047454791615e-05, + "loss": 0.3505243301391602, + "step": 60860 + }, + { + "epoch": 0.26132763195177866, + "grad_norm": 0.9568906426429749, + "learning_rate": 7.418616282779852e-05, + "loss": 0.3265427827835083, + "step": 60870 + }, + { + "epoch": 0.2613705640417987, + "grad_norm": 0.0020468125585466623, + "learning_rate": 7.41818511076809e-05, + "loss": 0.3428152084350586, + "step": 60880 + }, + { + "epoch": 0.2614134961318187, + "grad_norm": 0.013312124647200108, + "learning_rate": 7.417753938756328e-05, + "loss": 0.3842325210571289, + "step": 60890 + }, + { + "epoch": 0.2614564282218387, + "grad_norm": 2.257316827774048, + "learning_rate": 7.417322766744566e-05, + "loss": 0.4196054935455322, + "step": 60900 + }, + { + "epoch": 0.2614993603118587, + "grad_norm": 0.17104367911815643, + "learning_rate": 7.416891594732803e-05, + "loss": 0.2096198558807373, + "step": 60910 + }, + { + "epoch": 0.2615422924018787, + "grad_norm": 1.1753196716308594, + "learning_rate": 7.416460422721041e-05, + "loss": 0.480803918838501, + "step": 60920 + }, + { + "epoch": 0.2615852244918987, + "grad_norm": 3.2457165718078613, + "learning_rate": 7.416029250709279e-05, + "loss": 0.2990148067474365, + "step": 60930 + }, + { + "epoch": 0.26162815658191874, + "grad_norm": 0.9289878010749817, + "learning_rate": 7.415598078697515e-05, + "loss": 0.3847595930099487, + "step": 60940 + }, + { + "epoch": 0.2616710886719387, + "grad_norm": 0.02137531340122223, + "learning_rate": 7.415166906685753e-05, + "loss": 0.16399847269058226, + "step": 60950 + }, + { + "epoch": 0.2617140207619587, + "grad_norm": 0.7735177874565125, + "learning_rate": 7.41473573467399e-05, + "loss": 0.22880373001098633, + "step": 60960 + }, + { + "epoch": 0.26175695285197875, + "grad_norm": 0.3202783167362213, + "learning_rate": 7.414304562662228e-05, + "loss": 0.29592530727386473, + "step": 60970 + }, + { + "epoch": 0.26179988494199874, + "grad_norm": 0.01103890035301447, + "learning_rate": 7.413873390650466e-05, + "loss": 0.16606005430221557, + "step": 60980 + }, + { + "epoch": 0.2618428170320187, + "grad_norm": 0.036014948040246964, + "learning_rate": 7.413442218638704e-05, + "loss": 0.15441317558288575, + "step": 60990 + }, + { + "epoch": 0.26188574912203877, + "grad_norm": 1.53452467918396, + "learning_rate": 7.413011046626942e-05, + "loss": 0.34018664360046386, + "step": 61000 + }, + { + "epoch": 0.26188574912203877, + "eval_loss": 0.4289569854736328, + "eval_runtime": 27.4285, + "eval_samples_per_second": 3.646, + "eval_steps_per_second": 3.646, + "step": 61000 + }, + { + "epoch": 0.26192868121205876, + "grad_norm": 0.02968435175716877, + "learning_rate": 7.412579874615179e-05, + "loss": 0.09290619492530823, + "step": 61010 + }, + { + "epoch": 0.2619716133020788, + "grad_norm": 0.3154435157775879, + "learning_rate": 7.412148702603417e-05, + "loss": 0.11877356767654419, + "step": 61020 + }, + { + "epoch": 0.2620145453920988, + "grad_norm": 0.050260115414857864, + "learning_rate": 7.411717530591655e-05, + "loss": 0.2617194414138794, + "step": 61030 + }, + { + "epoch": 0.2620574774821188, + "grad_norm": 0.033218640834093094, + "learning_rate": 7.411286358579893e-05, + "loss": 0.3478349447250366, + "step": 61040 + }, + { + "epoch": 0.2621004095721388, + "grad_norm": 0.12185132503509521, + "learning_rate": 7.41085518656813e-05, + "loss": 0.21468589305877686, + "step": 61050 + }, + { + "epoch": 0.2621433416621588, + "grad_norm": 4.730641841888428, + "learning_rate": 7.410424014556368e-05, + "loss": 0.2136781930923462, + "step": 61060 + }, + { + "epoch": 0.2621862737521788, + "grad_norm": 0.03792364150285721, + "learning_rate": 7.409992842544606e-05, + "loss": 0.22919461727142335, + "step": 61070 + }, + { + "epoch": 0.26222920584219883, + "grad_norm": 0.03635554760694504, + "learning_rate": 7.409561670532843e-05, + "loss": 0.14863102436065673, + "step": 61080 + }, + { + "epoch": 0.2622721379322188, + "grad_norm": 0.6019494533538818, + "learning_rate": 7.409130498521081e-05, + "loss": 0.13661930561065674, + "step": 61090 + }, + { + "epoch": 0.2623150700222388, + "grad_norm": 0.3180076777935028, + "learning_rate": 7.408699326509319e-05, + "loss": 0.0830872893333435, + "step": 61100 + }, + { + "epoch": 0.26235800211225885, + "grad_norm": 0.09637082368135452, + "learning_rate": 7.408268154497555e-05, + "loss": 0.2411327838897705, + "step": 61110 + }, + { + "epoch": 0.26240093420227883, + "grad_norm": 0.07667584717273712, + "learning_rate": 7.407836982485793e-05, + "loss": 0.3642971277236938, + "step": 61120 + }, + { + "epoch": 0.2624438662922988, + "grad_norm": 1.2912453413009644, + "learning_rate": 7.407405810474031e-05, + "loss": 0.21056365966796875, + "step": 61130 + }, + { + "epoch": 0.26248679838231886, + "grad_norm": 0.04481130465865135, + "learning_rate": 7.406974638462268e-05, + "loss": 0.2890332221984863, + "step": 61140 + }, + { + "epoch": 0.26252973047233885, + "grad_norm": 3.0413002967834473, + "learning_rate": 7.406543466450506e-05, + "loss": 0.2858407974243164, + "step": 61150 + }, + { + "epoch": 0.26257266256235884, + "grad_norm": 0.009976262226700783, + "learning_rate": 7.406112294438744e-05, + "loss": 0.22789084911346436, + "step": 61160 + }, + { + "epoch": 0.2626155946523789, + "grad_norm": 0.10438531637191772, + "learning_rate": 7.405681122426982e-05, + "loss": 0.15007318258285524, + "step": 61170 + }, + { + "epoch": 0.26265852674239887, + "grad_norm": 1.8232972621917725, + "learning_rate": 7.40524995041522e-05, + "loss": 0.3524114370346069, + "step": 61180 + }, + { + "epoch": 0.26270145883241885, + "grad_norm": 0.10440938919782639, + "learning_rate": 7.404818778403456e-05, + "loss": 0.10628962516784668, + "step": 61190 + }, + { + "epoch": 0.2627443909224389, + "grad_norm": 0.1989804059267044, + "learning_rate": 7.404387606391694e-05, + "loss": 0.09079195857048035, + "step": 61200 + }, + { + "epoch": 0.2627873230124589, + "grad_norm": 0.007323736324906349, + "learning_rate": 7.403956434379931e-05, + "loss": 0.3279279232025146, + "step": 61210 + }, + { + "epoch": 0.26283025510247887, + "grad_norm": 0.019976818934082985, + "learning_rate": 7.403525262368169e-05, + "loss": 0.349375319480896, + "step": 61220 + }, + { + "epoch": 0.2628731871924989, + "grad_norm": 0.07553509622812271, + "learning_rate": 7.403094090356407e-05, + "loss": 0.22444021701812744, + "step": 61230 + }, + { + "epoch": 0.2629161192825189, + "grad_norm": 0.6492979526519775, + "learning_rate": 7.402662918344644e-05, + "loss": 0.3088306665420532, + "step": 61240 + }, + { + "epoch": 0.26295905137253894, + "grad_norm": 0.02214295044541359, + "learning_rate": 7.402231746332882e-05, + "loss": 0.24729163646698, + "step": 61250 + }, + { + "epoch": 0.26300198346255893, + "grad_norm": 0.5439571142196655, + "learning_rate": 7.40180057432112e-05, + "loss": 0.2864963531494141, + "step": 61260 + }, + { + "epoch": 0.2630449155525789, + "grad_norm": 0.003847939195111394, + "learning_rate": 7.401369402309358e-05, + "loss": 0.19426699876785278, + "step": 61270 + }, + { + "epoch": 0.26308784764259896, + "grad_norm": 5.374867916107178, + "learning_rate": 7.400938230297595e-05, + "loss": 0.2439223051071167, + "step": 61280 + }, + { + "epoch": 0.26313077973261895, + "grad_norm": 0.053749583661556244, + "learning_rate": 7.400507058285833e-05, + "loss": 0.14222971200942994, + "step": 61290 + }, + { + "epoch": 0.26317371182263893, + "grad_norm": 3.159461259841919, + "learning_rate": 7.400075886274071e-05, + "loss": 0.34703402519226073, + "step": 61300 + }, + { + "epoch": 0.263216643912659, + "grad_norm": 0.19708716869354248, + "learning_rate": 7.399644714262309e-05, + "loss": 0.24233169555664064, + "step": 61310 + }, + { + "epoch": 0.26325957600267896, + "grad_norm": 0.06329300999641418, + "learning_rate": 7.399213542250546e-05, + "loss": 0.2854588270187378, + "step": 61320 + }, + { + "epoch": 0.26330250809269895, + "grad_norm": 0.44259560108184814, + "learning_rate": 7.398782370238784e-05, + "loss": 0.3140692949295044, + "step": 61330 + }, + { + "epoch": 0.263345440182719, + "grad_norm": 1.087395429611206, + "learning_rate": 7.398351198227022e-05, + "loss": 0.1668491005897522, + "step": 61340 + }, + { + "epoch": 0.263388372272739, + "grad_norm": 2.940399646759033, + "learning_rate": 7.397920026215258e-05, + "loss": 0.3233808994293213, + "step": 61350 + }, + { + "epoch": 0.26343130436275897, + "grad_norm": 0.06883282214403152, + "learning_rate": 7.397488854203496e-05, + "loss": 0.21582355499267578, + "step": 61360 + }, + { + "epoch": 0.263474236452779, + "grad_norm": 0.3769819438457489, + "learning_rate": 7.397057682191734e-05, + "loss": 0.21639833450317383, + "step": 61370 + }, + { + "epoch": 0.263517168542799, + "grad_norm": 3.261446475982666, + "learning_rate": 7.396626510179971e-05, + "loss": 0.34641716480255125, + "step": 61380 + }, + { + "epoch": 0.263560100632819, + "grad_norm": 3.089873790740967, + "learning_rate": 7.396195338168209e-05, + "loss": 0.2210846424102783, + "step": 61390 + }, + { + "epoch": 0.263603032722839, + "grad_norm": 4.159793376922607, + "learning_rate": 7.395764166156447e-05, + "loss": 0.2680304527282715, + "step": 61400 + }, + { + "epoch": 0.263645964812859, + "grad_norm": 1.3745867013931274, + "learning_rate": 7.395332994144685e-05, + "loss": 0.295793342590332, + "step": 61410 + }, + { + "epoch": 0.263688896902879, + "grad_norm": 0.010196246206760406, + "learning_rate": 7.394901822132922e-05, + "loss": 0.18006807565689087, + "step": 61420 + }, + { + "epoch": 0.26373182899289904, + "grad_norm": 0.20760205388069153, + "learning_rate": 7.394470650121159e-05, + "loss": 0.21314103603363038, + "step": 61430 + }, + { + "epoch": 0.26377476108291903, + "grad_norm": 2.232950448989868, + "learning_rate": 7.394039478109396e-05, + "loss": 0.24137496948242188, + "step": 61440 + }, + { + "epoch": 0.26381769317293907, + "grad_norm": 1.0074223279953003, + "learning_rate": 7.393608306097634e-05, + "loss": 0.210703444480896, + "step": 61450 + }, + { + "epoch": 0.26386062526295906, + "grad_norm": 0.6803957223892212, + "learning_rate": 7.393177134085872e-05, + "loss": 0.40192604064941406, + "step": 61460 + }, + { + "epoch": 0.26390355735297905, + "grad_norm": 4.76837682723999, + "learning_rate": 7.39274596207411e-05, + "loss": 0.18026487827301024, + "step": 61470 + }, + { + "epoch": 0.2639464894429991, + "grad_norm": 7.9994988441467285, + "learning_rate": 7.392314790062347e-05, + "loss": 0.302392315864563, + "step": 61480 + }, + { + "epoch": 0.2639894215330191, + "grad_norm": 4.274667739868164, + "learning_rate": 7.391883618050585e-05, + "loss": 0.17441201210021973, + "step": 61490 + }, + { + "epoch": 0.26403235362303906, + "grad_norm": 0.026807954534888268, + "learning_rate": 7.391452446038823e-05, + "loss": 0.2764727115631104, + "step": 61500 + }, + { + "epoch": 0.2640752857130591, + "grad_norm": 5.083781719207764, + "learning_rate": 7.39102127402706e-05, + "loss": 0.45632572174072267, + "step": 61510 + }, + { + "epoch": 0.2641182178030791, + "grad_norm": 1.9132654666900635, + "learning_rate": 7.390590102015298e-05, + "loss": 0.22143087387084961, + "step": 61520 + }, + { + "epoch": 0.2641611498930991, + "grad_norm": 0.5253879427909851, + "learning_rate": 7.390158930003536e-05, + "loss": 0.600839900970459, + "step": 61530 + }, + { + "epoch": 0.2642040819831191, + "grad_norm": 1.4570778608322144, + "learning_rate": 7.389727757991774e-05, + "loss": 0.2691011905670166, + "step": 61540 + }, + { + "epoch": 0.2642470140731391, + "grad_norm": 24.884841918945312, + "learning_rate": 7.389296585980012e-05, + "loss": 0.18936102390289306, + "step": 61550 + }, + { + "epoch": 0.2642899461631591, + "grad_norm": 0.4241613745689392, + "learning_rate": 7.388865413968249e-05, + "loss": 0.2016587734222412, + "step": 61560 + }, + { + "epoch": 0.26433287825317914, + "grad_norm": 1.7418383359909058, + "learning_rate": 7.388434241956487e-05, + "loss": 0.06550635695457459, + "step": 61570 + }, + { + "epoch": 0.2643758103431991, + "grad_norm": 1.0983703136444092, + "learning_rate": 7.388003069944725e-05, + "loss": 0.31223478317260744, + "step": 61580 + }, + { + "epoch": 0.2644187424332191, + "grad_norm": 0.04658321663737297, + "learning_rate": 7.387571897932962e-05, + "loss": 0.20343027114868165, + "step": 61590 + }, + { + "epoch": 0.26446167452323915, + "grad_norm": 6.366367816925049, + "learning_rate": 7.387140725921199e-05, + "loss": 0.25048508644104006, + "step": 61600 + }, + { + "epoch": 0.26450460661325914, + "grad_norm": 0.2299579381942749, + "learning_rate": 7.386709553909437e-05, + "loss": 0.1549118161201477, + "step": 61610 + }, + { + "epoch": 0.26454753870327913, + "grad_norm": 3.3510000705718994, + "learning_rate": 7.386278381897674e-05, + "loss": 0.4528806686401367, + "step": 61620 + }, + { + "epoch": 0.26459047079329917, + "grad_norm": 6.139395236968994, + "learning_rate": 7.385847209885912e-05, + "loss": 0.23490145206451415, + "step": 61630 + }, + { + "epoch": 0.26463340288331916, + "grad_norm": 5.379457473754883, + "learning_rate": 7.38541603787415e-05, + "loss": 0.29587996006011963, + "step": 61640 + }, + { + "epoch": 0.26467633497333914, + "grad_norm": 0.06632285565137863, + "learning_rate": 7.384984865862388e-05, + "loss": 0.17854514122009277, + "step": 61650 + }, + { + "epoch": 0.2647192670633592, + "grad_norm": 1.1752125024795532, + "learning_rate": 7.384553693850625e-05, + "loss": 0.10081328153610229, + "step": 61660 + }, + { + "epoch": 0.2647621991533792, + "grad_norm": 0.03773083910346031, + "learning_rate": 7.384122521838863e-05, + "loss": 0.24993579387664794, + "step": 61670 + }, + { + "epoch": 0.2648051312433992, + "grad_norm": 0.823442280292511, + "learning_rate": 7.3836913498271e-05, + "loss": 0.2658759355545044, + "step": 61680 + }, + { + "epoch": 0.2648480633334192, + "grad_norm": 3.046865940093994, + "learning_rate": 7.383260177815337e-05, + "loss": 0.4629732608795166, + "step": 61690 + }, + { + "epoch": 0.2648909954234392, + "grad_norm": 0.8300249576568604, + "learning_rate": 7.382829005803575e-05, + "loss": 0.21923952102661132, + "step": 61700 + }, + { + "epoch": 0.26493392751345923, + "grad_norm": 0.0004211229388602078, + "learning_rate": 7.382397833791813e-05, + "loss": 0.3185274124145508, + "step": 61710 + }, + { + "epoch": 0.2649768596034792, + "grad_norm": 0.012865206226706505, + "learning_rate": 7.38196666178005e-05, + "loss": 0.20773918628692628, + "step": 61720 + }, + { + "epoch": 0.2650197916934992, + "grad_norm": 1.0187405347824097, + "learning_rate": 7.381535489768288e-05, + "loss": 0.33336970806121824, + "step": 61730 + }, + { + "epoch": 0.26506272378351925, + "grad_norm": 0.7479426860809326, + "learning_rate": 7.381104317756526e-05, + "loss": 0.3752789258956909, + "step": 61740 + }, + { + "epoch": 0.26510565587353924, + "grad_norm": 2.9396021366119385, + "learning_rate": 7.380673145744765e-05, + "loss": 0.09405736327171325, + "step": 61750 + }, + { + "epoch": 0.2651485879635592, + "grad_norm": 0.04090379178524017, + "learning_rate": 7.380241973733001e-05, + "loss": 0.17366938591003417, + "step": 61760 + }, + { + "epoch": 0.26519152005357927, + "grad_norm": 5.305620193481445, + "learning_rate": 7.379810801721239e-05, + "loss": 0.35981733798980714, + "step": 61770 + }, + { + "epoch": 0.26523445214359925, + "grad_norm": 0.04921811819076538, + "learning_rate": 7.379379629709477e-05, + "loss": 0.029331964254379273, + "step": 61780 + }, + { + "epoch": 0.26527738423361924, + "grad_norm": 0.32620060443878174, + "learning_rate": 7.378948457697714e-05, + "loss": 0.24157421588897704, + "step": 61790 + }, + { + "epoch": 0.2653203163236393, + "grad_norm": 0.010810810141265392, + "learning_rate": 7.378517285685952e-05, + "loss": 0.08367153406143188, + "step": 61800 + }, + { + "epoch": 0.26536324841365927, + "grad_norm": 0.39795973896980286, + "learning_rate": 7.37808611367419e-05, + "loss": 0.2598013639450073, + "step": 61810 + }, + { + "epoch": 0.26540618050367926, + "grad_norm": 0.05956602096557617, + "learning_rate": 7.377654941662428e-05, + "loss": 0.18582472801208497, + "step": 61820 + }, + { + "epoch": 0.2654491125936993, + "grad_norm": 3.81455659866333, + "learning_rate": 7.377223769650665e-05, + "loss": 0.35067429542541506, + "step": 61830 + }, + { + "epoch": 0.2654920446837193, + "grad_norm": 1.979435682296753, + "learning_rate": 7.376792597638903e-05, + "loss": 0.34986724853515627, + "step": 61840 + }, + { + "epoch": 0.2655349767737393, + "grad_norm": 2.7388412952423096, + "learning_rate": 7.37636142562714e-05, + "loss": 0.3203420639038086, + "step": 61850 + }, + { + "epoch": 0.2655779088637593, + "grad_norm": 0.003037786576896906, + "learning_rate": 7.375930253615377e-05, + "loss": 0.19214255809783937, + "step": 61860 + }, + { + "epoch": 0.2656208409537793, + "grad_norm": 1.1273523569107056, + "learning_rate": 7.375499081603615e-05, + "loss": 0.20186288356781007, + "step": 61870 + }, + { + "epoch": 0.26566377304379934, + "grad_norm": 1.959748387336731, + "learning_rate": 7.375067909591853e-05, + "loss": 0.1970548987388611, + "step": 61880 + }, + { + "epoch": 0.26570670513381933, + "grad_norm": 1.9517850875854492, + "learning_rate": 7.37463673758009e-05, + "loss": 0.20934562683105468, + "step": 61890 + }, + { + "epoch": 0.2657496372238393, + "grad_norm": 0.18594670295715332, + "learning_rate": 7.374205565568328e-05, + "loss": 0.13820563554763793, + "step": 61900 + }, + { + "epoch": 0.26579256931385936, + "grad_norm": 3.1220526695251465, + "learning_rate": 7.373774393556566e-05, + "loss": 0.3077308893203735, + "step": 61910 + }, + { + "epoch": 0.26583550140387935, + "grad_norm": 37.886390686035156, + "learning_rate": 7.373343221544804e-05, + "loss": 0.2505849123001099, + "step": 61920 + }, + { + "epoch": 0.26587843349389934, + "grad_norm": 1.8187617063522339, + "learning_rate": 7.37291204953304e-05, + "loss": 0.2245555639266968, + "step": 61930 + }, + { + "epoch": 0.2659213655839194, + "grad_norm": 2.576911687850952, + "learning_rate": 7.372480877521278e-05, + "loss": 0.3204474687576294, + "step": 61940 + }, + { + "epoch": 0.26596429767393936, + "grad_norm": 1.5807483196258545, + "learning_rate": 7.372049705509515e-05, + "loss": 0.23133842945098876, + "step": 61950 + }, + { + "epoch": 0.26600722976395935, + "grad_norm": 0.004031818360090256, + "learning_rate": 7.371618533497753e-05, + "loss": 0.30945868492126466, + "step": 61960 + }, + { + "epoch": 0.2660501618539794, + "grad_norm": 0.18390342593193054, + "learning_rate": 7.371187361485992e-05, + "loss": 0.21341826915740966, + "step": 61970 + }, + { + "epoch": 0.2660930939439994, + "grad_norm": 3.562913656234741, + "learning_rate": 7.37075618947423e-05, + "loss": 0.2335270404815674, + "step": 61980 + }, + { + "epoch": 0.26613602603401937, + "grad_norm": 8.34090518951416, + "learning_rate": 7.370325017462468e-05, + "loss": 0.1888748288154602, + "step": 61990 + }, + { + "epoch": 0.2661789581240394, + "grad_norm": 0.03201881796121597, + "learning_rate": 7.369893845450706e-05, + "loss": 0.22842373847961425, + "step": 62000 + }, + { + "epoch": 0.2661789581240394, + "eval_loss": 0.43444526195526123, + "eval_runtime": 27.4274, + "eval_samples_per_second": 3.646, + "eval_steps_per_second": 3.646, + "step": 62000 + }, + { + "epoch": 0.2662218902140594, + "grad_norm": 0.0013862289488315582, + "learning_rate": 7.369462673438942e-05, + "loss": 0.10994983911514282, + "step": 62010 + }, + { + "epoch": 0.2662648223040794, + "grad_norm": 0.0036061222199350595, + "learning_rate": 7.36903150142718e-05, + "loss": 0.05537129044532776, + "step": 62020 + }, + { + "epoch": 0.2663077543940994, + "grad_norm": 0.1683593988418579, + "learning_rate": 7.368600329415417e-05, + "loss": 0.4459203243255615, + "step": 62030 + }, + { + "epoch": 0.2663506864841194, + "grad_norm": 1.4167718887329102, + "learning_rate": 7.368169157403655e-05, + "loss": 0.4820812702178955, + "step": 62040 + }, + { + "epoch": 0.2663936185741394, + "grad_norm": 0.642146110534668, + "learning_rate": 7.367737985391893e-05, + "loss": 0.2838521242141724, + "step": 62050 + }, + { + "epoch": 0.26643655066415944, + "grad_norm": 0.9649590253829956, + "learning_rate": 7.36730681338013e-05, + "loss": 0.1945676326751709, + "step": 62060 + }, + { + "epoch": 0.26647948275417943, + "grad_norm": 0.06836599111557007, + "learning_rate": 7.366875641368368e-05, + "loss": 0.26728594303131104, + "step": 62070 + }, + { + "epoch": 0.2665224148441994, + "grad_norm": 0.029291236773133278, + "learning_rate": 7.366444469356606e-05, + "loss": 0.00536903589963913, + "step": 62080 + }, + { + "epoch": 0.26656534693421946, + "grad_norm": 0.01653222367167473, + "learning_rate": 7.366013297344842e-05, + "loss": 0.3317889928817749, + "step": 62090 + }, + { + "epoch": 0.26660827902423945, + "grad_norm": 0.5249482989311218, + "learning_rate": 7.36558212533308e-05, + "loss": 0.22167203426361085, + "step": 62100 + }, + { + "epoch": 0.2666512111142595, + "grad_norm": 0.7123648524284363, + "learning_rate": 7.365150953321318e-05, + "loss": 0.29805192947387693, + "step": 62110 + }, + { + "epoch": 0.2666941432042795, + "grad_norm": 0.003714463673532009, + "learning_rate": 7.364719781309556e-05, + "loss": 0.20269992351531982, + "step": 62120 + }, + { + "epoch": 0.26673707529429946, + "grad_norm": 0.057309433817863464, + "learning_rate": 7.364288609297793e-05, + "loss": 0.43726015090942383, + "step": 62130 + }, + { + "epoch": 0.2667800073843195, + "grad_norm": 0.005555383395403624, + "learning_rate": 7.363857437286031e-05, + "loss": 0.3711522340774536, + "step": 62140 + }, + { + "epoch": 0.2668229394743395, + "grad_norm": 0.39166271686553955, + "learning_rate": 7.363426265274269e-05, + "loss": 0.11932632923126221, + "step": 62150 + }, + { + "epoch": 0.2668658715643595, + "grad_norm": 0.7060807943344116, + "learning_rate": 7.362995093262507e-05, + "loss": 0.3223679780960083, + "step": 62160 + }, + { + "epoch": 0.2669088036543795, + "grad_norm": 0.23493017256259918, + "learning_rate": 7.362563921250744e-05, + "loss": 0.14590442180633545, + "step": 62170 + }, + { + "epoch": 0.2669517357443995, + "grad_norm": 0.04516841843724251, + "learning_rate": 7.36213274923898e-05, + "loss": 0.13015230894088745, + "step": 62180 + }, + { + "epoch": 0.2669946678344195, + "grad_norm": 6.605303764343262, + "learning_rate": 7.36170157722722e-05, + "loss": 0.30696520805358884, + "step": 62190 + }, + { + "epoch": 0.26703759992443954, + "grad_norm": 0.44605475664138794, + "learning_rate": 7.361270405215457e-05, + "loss": 0.20978827476501466, + "step": 62200 + }, + { + "epoch": 0.2670805320144595, + "grad_norm": 0.04324696585536003, + "learning_rate": 7.360839233203695e-05, + "loss": 0.1520734667778015, + "step": 62210 + }, + { + "epoch": 0.2671234641044795, + "grad_norm": 0.11113552004098892, + "learning_rate": 7.360408061191933e-05, + "loss": 0.23489885330200194, + "step": 62220 + }, + { + "epoch": 0.26716639619449956, + "grad_norm": 0.17562143504619598, + "learning_rate": 7.359976889180171e-05, + "loss": 0.2564015626907349, + "step": 62230 + }, + { + "epoch": 0.26720932828451954, + "grad_norm": 0.2895164489746094, + "learning_rate": 7.359545717168408e-05, + "loss": 0.17047150135040284, + "step": 62240 + }, + { + "epoch": 0.26725226037453953, + "grad_norm": 0.21865420043468475, + "learning_rate": 7.359114545156646e-05, + "loss": 0.1545347213745117, + "step": 62250 + }, + { + "epoch": 0.2672951924645596, + "grad_norm": 0.05775504559278488, + "learning_rate": 7.358683373144883e-05, + "loss": 0.17338919639587402, + "step": 62260 + }, + { + "epoch": 0.26733812455457956, + "grad_norm": 0.0026504702400416136, + "learning_rate": 7.35825220113312e-05, + "loss": 0.08958525061607361, + "step": 62270 + }, + { + "epoch": 0.26738105664459955, + "grad_norm": 1.8347655534744263, + "learning_rate": 7.357821029121358e-05, + "loss": 0.2308887481689453, + "step": 62280 + }, + { + "epoch": 0.2674239887346196, + "grad_norm": 0.02356887236237526, + "learning_rate": 7.357389857109596e-05, + "loss": 0.046024075150489806, + "step": 62290 + }, + { + "epoch": 0.2674669208246396, + "grad_norm": 2.049039840698242, + "learning_rate": 7.356958685097833e-05, + "loss": 0.3371156930923462, + "step": 62300 + }, + { + "epoch": 0.2675098529146596, + "grad_norm": 0.5070636868476868, + "learning_rate": 7.356527513086071e-05, + "loss": 0.30076262950897215, + "step": 62310 + }, + { + "epoch": 0.2675527850046796, + "grad_norm": 2.742283821105957, + "learning_rate": 7.356096341074309e-05, + "loss": 0.22705247402191162, + "step": 62320 + }, + { + "epoch": 0.2675957170946996, + "grad_norm": 0.020929275080561638, + "learning_rate": 7.355665169062547e-05, + "loss": 0.4815894603729248, + "step": 62330 + }, + { + "epoch": 0.26763864918471963, + "grad_norm": 2.0954997539520264, + "learning_rate": 7.355233997050783e-05, + "loss": 0.2702434778213501, + "step": 62340 + }, + { + "epoch": 0.2676815812747396, + "grad_norm": 0.01475040428340435, + "learning_rate": 7.354802825039021e-05, + "loss": 0.10310671329498292, + "step": 62350 + }, + { + "epoch": 0.2677245133647596, + "grad_norm": 0.07373770326375961, + "learning_rate": 7.354371653027259e-05, + "loss": 0.3095245838165283, + "step": 62360 + }, + { + "epoch": 0.26776744545477965, + "grad_norm": 7.2039265632629395, + "learning_rate": 7.353940481015496e-05, + "loss": 0.14268449544906617, + "step": 62370 + }, + { + "epoch": 0.26781037754479964, + "grad_norm": 1.4639053344726562, + "learning_rate": 7.353509309003734e-05, + "loss": 0.19518343210220337, + "step": 62380 + }, + { + "epoch": 0.2678533096348196, + "grad_norm": 26.01327133178711, + "learning_rate": 7.353078136991972e-05, + "loss": 0.05920148491859436, + "step": 62390 + }, + { + "epoch": 0.26789624172483967, + "grad_norm": 1.3359770774841309, + "learning_rate": 7.35264696498021e-05, + "loss": 0.37773053646087645, + "step": 62400 + }, + { + "epoch": 0.26793917381485965, + "grad_norm": 1.264256238937378, + "learning_rate": 7.352215792968447e-05, + "loss": 0.3920429706573486, + "step": 62410 + }, + { + "epoch": 0.26798210590487964, + "grad_norm": 0.02614930272102356, + "learning_rate": 7.351784620956685e-05, + "loss": 0.18842644691467286, + "step": 62420 + }, + { + "epoch": 0.2680250379948997, + "grad_norm": 0.5546753406524658, + "learning_rate": 7.351353448944923e-05, + "loss": 0.2389291286468506, + "step": 62430 + }, + { + "epoch": 0.26806797008491967, + "grad_norm": 0.9541937112808228, + "learning_rate": 7.35092227693316e-05, + "loss": 0.36570725440979, + "step": 62440 + }, + { + "epoch": 0.26811090217493966, + "grad_norm": 3.901456594467163, + "learning_rate": 7.350491104921398e-05, + "loss": 0.2512011766433716, + "step": 62450 + }, + { + "epoch": 0.2681538342649597, + "grad_norm": 3.098036050796509, + "learning_rate": 7.350059932909636e-05, + "loss": 0.2772386074066162, + "step": 62460 + }, + { + "epoch": 0.2681967663549797, + "grad_norm": 0.006769151426851749, + "learning_rate": 7.349628760897874e-05, + "loss": 0.33778197765350343, + "step": 62470 + }, + { + "epoch": 0.2682396984449997, + "grad_norm": 1.5368144512176514, + "learning_rate": 7.349197588886111e-05, + "loss": 0.23536946773529052, + "step": 62480 + }, + { + "epoch": 0.2682826305350197, + "grad_norm": 0.004882279317826033, + "learning_rate": 7.348766416874349e-05, + "loss": 0.1848854899406433, + "step": 62490 + }, + { + "epoch": 0.2683255626250397, + "grad_norm": 0.657514214515686, + "learning_rate": 7.348335244862585e-05, + "loss": 0.29116086959838866, + "step": 62500 + }, + { + "epoch": 0.2683684947150597, + "grad_norm": 0.07048039138317108, + "learning_rate": 7.347904072850823e-05, + "loss": 0.1695142149925232, + "step": 62510 + }, + { + "epoch": 0.26841142680507973, + "grad_norm": 0.012177824974060059, + "learning_rate": 7.347472900839061e-05, + "loss": 0.28824386596679685, + "step": 62520 + }, + { + "epoch": 0.2684543588950997, + "grad_norm": 0.058346137404441833, + "learning_rate": 7.347041728827299e-05, + "loss": 0.26667845249176025, + "step": 62530 + }, + { + "epoch": 0.26849729098511976, + "grad_norm": 0.008558766916394234, + "learning_rate": 7.346610556815536e-05, + "loss": 0.2703331708908081, + "step": 62540 + }, + { + "epoch": 0.26854022307513975, + "grad_norm": 0.916430652141571, + "learning_rate": 7.346179384803774e-05, + "loss": 0.15587707757949829, + "step": 62550 + }, + { + "epoch": 0.26858315516515974, + "grad_norm": 0.5451849699020386, + "learning_rate": 7.345748212792012e-05, + "loss": 0.25034103393554685, + "step": 62560 + }, + { + "epoch": 0.2686260872551798, + "grad_norm": 2.4739830493927, + "learning_rate": 7.34531704078025e-05, + "loss": 0.13914070129394532, + "step": 62570 + }, + { + "epoch": 0.26866901934519977, + "grad_norm": 0.006528899073600769, + "learning_rate": 7.344885868768487e-05, + "loss": 0.28967092037200926, + "step": 62580 + }, + { + "epoch": 0.26871195143521975, + "grad_norm": 0.05252145603299141, + "learning_rate": 7.344454696756724e-05, + "loss": 0.32130486965179444, + "step": 62590 + }, + { + "epoch": 0.2687548835252398, + "grad_norm": 3.624072313308716, + "learning_rate": 7.344023524744961e-05, + "loss": 0.23874349594116212, + "step": 62600 + }, + { + "epoch": 0.2687978156152598, + "grad_norm": 0.6444095373153687, + "learning_rate": 7.343592352733199e-05, + "loss": 0.11211415529251098, + "step": 62610 + }, + { + "epoch": 0.26884074770527977, + "grad_norm": 1.74681556224823, + "learning_rate": 7.343161180721437e-05, + "loss": 0.24570322036743164, + "step": 62620 + }, + { + "epoch": 0.2688836797952998, + "grad_norm": 2.3091390132904053, + "learning_rate": 7.342730008709675e-05, + "loss": 0.1712045431137085, + "step": 62630 + }, + { + "epoch": 0.2689266118853198, + "grad_norm": 3.986283540725708, + "learning_rate": 7.342298836697912e-05, + "loss": 0.3222727537155151, + "step": 62640 + }, + { + "epoch": 0.2689695439753398, + "grad_norm": 2.886242389678955, + "learning_rate": 7.34186766468615e-05, + "loss": 0.3050569534301758, + "step": 62650 + }, + { + "epoch": 0.26901247606535983, + "grad_norm": 1.565330147743225, + "learning_rate": 7.341436492674388e-05, + "loss": 0.12091318368911744, + "step": 62660 + }, + { + "epoch": 0.2690554081553798, + "grad_norm": 2.3400821685791016, + "learning_rate": 7.341005320662626e-05, + "loss": 0.17482272386550904, + "step": 62670 + }, + { + "epoch": 0.2690983402453998, + "grad_norm": 0.005332406144589186, + "learning_rate": 7.340574148650863e-05, + "loss": 0.20458731651306153, + "step": 62680 + }, + { + "epoch": 0.26914127233541985, + "grad_norm": 1.2362321615219116, + "learning_rate": 7.340142976639101e-05, + "loss": 0.1346642255783081, + "step": 62690 + }, + { + "epoch": 0.26918420442543983, + "grad_norm": 4.819741249084473, + "learning_rate": 7.339711804627339e-05, + "loss": 0.24375250339508056, + "step": 62700 + }, + { + "epoch": 0.2692271365154598, + "grad_norm": 0.2714204788208008, + "learning_rate": 7.339280632615577e-05, + "loss": 0.16298720836639405, + "step": 62710 + }, + { + "epoch": 0.26927006860547986, + "grad_norm": 1.7343765497207642, + "learning_rate": 7.338849460603814e-05, + "loss": 0.12646753787994386, + "step": 62720 + }, + { + "epoch": 0.26931300069549985, + "grad_norm": 0.6741864681243896, + "learning_rate": 7.338418288592052e-05, + "loss": 0.18241225481033324, + "step": 62730 + }, + { + "epoch": 0.2693559327855199, + "grad_norm": 0.6616902351379395, + "learning_rate": 7.33798711658029e-05, + "loss": 0.22634491920471192, + "step": 62740 + }, + { + "epoch": 0.2693988648755399, + "grad_norm": 4.4191460609436035, + "learning_rate": 7.337555944568526e-05, + "loss": 0.2351382255554199, + "step": 62750 + }, + { + "epoch": 0.26944179696555987, + "grad_norm": 0.004057802725583315, + "learning_rate": 7.337124772556764e-05, + "loss": 0.012336998432874679, + "step": 62760 + }, + { + "epoch": 0.2694847290555799, + "grad_norm": 0.0004717262927442789, + "learning_rate": 7.336693600545002e-05, + "loss": 0.17408014535903932, + "step": 62770 + }, + { + "epoch": 0.2695276611455999, + "grad_norm": 2.6368041038513184, + "learning_rate": 7.336262428533239e-05, + "loss": 0.26660282611846925, + "step": 62780 + }, + { + "epoch": 0.2695705932356199, + "grad_norm": 0.03568703308701515, + "learning_rate": 7.335831256521477e-05, + "loss": 0.3751511812210083, + "step": 62790 + }, + { + "epoch": 0.2696135253256399, + "grad_norm": 1.9657670259475708, + "learning_rate": 7.335400084509715e-05, + "loss": 0.28092188835144044, + "step": 62800 + }, + { + "epoch": 0.2696564574156599, + "grad_norm": 1.9834064245224, + "learning_rate": 7.334968912497953e-05, + "loss": 0.1973501205444336, + "step": 62810 + }, + { + "epoch": 0.2696993895056799, + "grad_norm": 11.444055557250977, + "learning_rate": 7.33453774048619e-05, + "loss": 0.2275404930114746, + "step": 62820 + }, + { + "epoch": 0.26974232159569994, + "grad_norm": 7.298734188079834, + "learning_rate": 7.334106568474427e-05, + "loss": 0.14433352947235106, + "step": 62830 + }, + { + "epoch": 0.26978525368571993, + "grad_norm": 5.212087631225586, + "learning_rate": 7.333675396462664e-05, + "loss": 0.26730144023895264, + "step": 62840 + }, + { + "epoch": 0.2698281857757399, + "grad_norm": 0.022054264321923256, + "learning_rate": 7.333244224450902e-05, + "loss": 0.3407695770263672, + "step": 62850 + }, + { + "epoch": 0.26987111786575996, + "grad_norm": 0.010150299407541752, + "learning_rate": 7.33281305243914e-05, + "loss": 0.14565470218658447, + "step": 62860 + }, + { + "epoch": 0.26991404995577994, + "grad_norm": 0.04980747401714325, + "learning_rate": 7.332381880427378e-05, + "loss": 0.3148836135864258, + "step": 62870 + }, + { + "epoch": 0.26995698204579993, + "grad_norm": 0.029979297891259193, + "learning_rate": 7.331950708415615e-05, + "loss": 0.20162365436553956, + "step": 62880 + }, + { + "epoch": 0.26999991413582, + "grad_norm": 0.3095214068889618, + "learning_rate": 7.331519536403853e-05, + "loss": 0.07629244923591613, + "step": 62890 + }, + { + "epoch": 0.27004284622583996, + "grad_norm": 0.05000022426247597, + "learning_rate": 7.331088364392091e-05, + "loss": 0.20231847763061522, + "step": 62900 + }, + { + "epoch": 0.27008577831585995, + "grad_norm": 0.016812866553664207, + "learning_rate": 7.330657192380328e-05, + "loss": 0.1995375156402588, + "step": 62910 + }, + { + "epoch": 0.27012871040588, + "grad_norm": 0.03599417582154274, + "learning_rate": 7.330226020368566e-05, + "loss": 0.09891985058784485, + "step": 62920 + }, + { + "epoch": 0.2701716424959, + "grad_norm": 0.3898320496082306, + "learning_rate": 7.329794848356804e-05, + "loss": 0.12806140184402465, + "step": 62930 + }, + { + "epoch": 0.27021457458591996, + "grad_norm": 0.009992690756917, + "learning_rate": 7.329363676345042e-05, + "loss": 0.12640279531478882, + "step": 62940 + }, + { + "epoch": 0.27025750667594, + "grad_norm": 0.11849182844161987, + "learning_rate": 7.32893250433328e-05, + "loss": 0.3613492488861084, + "step": 62950 + }, + { + "epoch": 0.27030043876596, + "grad_norm": 1.4423352479934692, + "learning_rate": 7.328501332321517e-05, + "loss": 0.4773262977600098, + "step": 62960 + }, + { + "epoch": 0.27034337085598004, + "grad_norm": 0.05382090061903, + "learning_rate": 7.328070160309755e-05, + "loss": 0.17667585611343384, + "step": 62970 + }, + { + "epoch": 0.270386302946, + "grad_norm": 0.3748687207698822, + "learning_rate": 7.327638988297993e-05, + "loss": 0.32606868743896483, + "step": 62980 + }, + { + "epoch": 0.27042923503602, + "grad_norm": 1.5009466409683228, + "learning_rate": 7.32720781628623e-05, + "loss": 0.21618103981018066, + "step": 62990 + }, + { + "epoch": 0.27047216712604005, + "grad_norm": 0.018100149929523468, + "learning_rate": 7.326776644274467e-05, + "loss": 0.2740698575973511, + "step": 63000 + }, + { + "epoch": 0.27047216712604005, + "eval_loss": 0.4309654235839844, + "eval_runtime": 27.4414, + "eval_samples_per_second": 3.644, + "eval_steps_per_second": 3.644, + "step": 63000 + }, + { + "epoch": 0.27051509921606004, + "grad_norm": 0.007100074086338282, + "learning_rate": 7.326345472262704e-05, + "loss": 0.326142692565918, + "step": 63010 + }, + { + "epoch": 0.27055803130608, + "grad_norm": 4.347993850708008, + "learning_rate": 7.325914300250942e-05, + "loss": 0.08076274394989014, + "step": 63020 + }, + { + "epoch": 0.27060096339610007, + "grad_norm": 0.4008999466896057, + "learning_rate": 7.32548312823918e-05, + "loss": 0.3010993242263794, + "step": 63030 + }, + { + "epoch": 0.27064389548612006, + "grad_norm": 1.1022944450378418, + "learning_rate": 7.325051956227418e-05, + "loss": 0.30613150596618655, + "step": 63040 + }, + { + "epoch": 0.27068682757614004, + "grad_norm": 0.004065214190632105, + "learning_rate": 7.324620784215655e-05, + "loss": 0.08829469680786133, + "step": 63050 + }, + { + "epoch": 0.2707297596661601, + "grad_norm": 1.723278522491455, + "learning_rate": 7.324189612203893e-05, + "loss": 0.4858261585235596, + "step": 63060 + }, + { + "epoch": 0.2707726917561801, + "grad_norm": 3.401308298110962, + "learning_rate": 7.323758440192131e-05, + "loss": 0.40488252639770506, + "step": 63070 + }, + { + "epoch": 0.27081562384620006, + "grad_norm": 0.10068821161985397, + "learning_rate": 7.323327268180367e-05, + "loss": 0.10350894927978516, + "step": 63080 + }, + { + "epoch": 0.2708585559362201, + "grad_norm": 3.938289165496826, + "learning_rate": 7.322896096168605e-05, + "loss": 0.27509207725524903, + "step": 63090 + }, + { + "epoch": 0.2709014880262401, + "grad_norm": 0.045174308121204376, + "learning_rate": 7.322464924156843e-05, + "loss": 0.2806502342224121, + "step": 63100 + }, + { + "epoch": 0.2709444201162601, + "grad_norm": 0.09346527606248856, + "learning_rate": 7.32203375214508e-05, + "loss": 0.4489753723144531, + "step": 63110 + }, + { + "epoch": 0.2709873522062801, + "grad_norm": 14.672597885131836, + "learning_rate": 7.321602580133318e-05, + "loss": 0.22636539936065675, + "step": 63120 + }, + { + "epoch": 0.2710302842963001, + "grad_norm": 0.023005153983831406, + "learning_rate": 7.321171408121556e-05, + "loss": 0.3792133331298828, + "step": 63130 + }, + { + "epoch": 0.2710732163863201, + "grad_norm": 0.024886643514037132, + "learning_rate": 7.320740236109794e-05, + "loss": 0.23559410572052003, + "step": 63140 + }, + { + "epoch": 0.27111614847634014, + "grad_norm": 1.5846565961837769, + "learning_rate": 7.320309064098031e-05, + "loss": 0.34138927459716795, + "step": 63150 + }, + { + "epoch": 0.2711590805663601, + "grad_norm": 8.62157917022705, + "learning_rate": 7.319877892086269e-05, + "loss": 0.150002384185791, + "step": 63160 + }, + { + "epoch": 0.27120201265638016, + "grad_norm": 4.657135963439941, + "learning_rate": 7.319446720074507e-05, + "loss": 0.165123450756073, + "step": 63170 + }, + { + "epoch": 0.27124494474640015, + "grad_norm": 0.5713585615158081, + "learning_rate": 7.319015548062745e-05, + "loss": 0.39691801071166993, + "step": 63180 + }, + { + "epoch": 0.27128787683642014, + "grad_norm": 0.8264710903167725, + "learning_rate": 7.318584376050982e-05, + "loss": 0.18502925634384154, + "step": 63190 + }, + { + "epoch": 0.2713308089264402, + "grad_norm": 0.016064446419477463, + "learning_rate": 7.31815320403922e-05, + "loss": 0.13511744737625123, + "step": 63200 + }, + { + "epoch": 0.27137374101646017, + "grad_norm": 0.029205219820141792, + "learning_rate": 7.317722032027458e-05, + "loss": 0.0855722427368164, + "step": 63210 + }, + { + "epoch": 0.27141667310648016, + "grad_norm": 0.019982269033789635, + "learning_rate": 7.317290860015696e-05, + "loss": 0.291228199005127, + "step": 63220 + }, + { + "epoch": 0.2714596051965002, + "grad_norm": 1.64790678024292, + "learning_rate": 7.316859688003933e-05, + "loss": 0.33681635856628417, + "step": 63230 + }, + { + "epoch": 0.2715025372865202, + "grad_norm": 1.967416524887085, + "learning_rate": 7.31642851599217e-05, + "loss": 0.22783217430114747, + "step": 63240 + }, + { + "epoch": 0.27154546937654017, + "grad_norm": 1.4456120729446411, + "learning_rate": 7.315997343980407e-05, + "loss": 0.2588557004928589, + "step": 63250 + }, + { + "epoch": 0.2715884014665602, + "grad_norm": 1.7478188276290894, + "learning_rate": 7.315566171968645e-05, + "loss": 0.33290698528289797, + "step": 63260 + }, + { + "epoch": 0.2716313335565802, + "grad_norm": 0.002389519242569804, + "learning_rate": 7.315134999956883e-05, + "loss": 0.31861014366149903, + "step": 63270 + }, + { + "epoch": 0.2716742656466002, + "grad_norm": 0.009542024694383144, + "learning_rate": 7.31470382794512e-05, + "loss": 0.20476877689361572, + "step": 63280 + }, + { + "epoch": 0.27171719773662023, + "grad_norm": 3.3123133182525635, + "learning_rate": 7.314272655933358e-05, + "loss": 0.28285880088806153, + "step": 63290 + }, + { + "epoch": 0.2717601298266402, + "grad_norm": 0.1270076185464859, + "learning_rate": 7.313841483921596e-05, + "loss": 0.0846854031085968, + "step": 63300 + }, + { + "epoch": 0.2718030619166602, + "grad_norm": 1.4891453981399536, + "learning_rate": 7.313410311909834e-05, + "loss": 0.1929972767829895, + "step": 63310 + }, + { + "epoch": 0.27184599400668025, + "grad_norm": 1.1889742612838745, + "learning_rate": 7.312979139898072e-05, + "loss": 0.27428336143493653, + "step": 63320 + }, + { + "epoch": 0.27188892609670023, + "grad_norm": 2.5377233028411865, + "learning_rate": 7.312547967886308e-05, + "loss": 0.2582359313964844, + "step": 63330 + }, + { + "epoch": 0.2719318581867202, + "grad_norm": 0.23425069451332092, + "learning_rate": 7.312116795874546e-05, + "loss": 0.14315102100372315, + "step": 63340 + }, + { + "epoch": 0.27197479027674026, + "grad_norm": 3.611154317855835, + "learning_rate": 7.311685623862783e-05, + "loss": 0.23460731506347657, + "step": 63350 + }, + { + "epoch": 0.27201772236676025, + "grad_norm": 2.1825406551361084, + "learning_rate": 7.311254451851021e-05, + "loss": 0.2122056007385254, + "step": 63360 + }, + { + "epoch": 0.27206065445678024, + "grad_norm": 0.25567686557769775, + "learning_rate": 7.310823279839259e-05, + "loss": 0.22430589199066162, + "step": 63370 + }, + { + "epoch": 0.2721035865468003, + "grad_norm": 1.1983752250671387, + "learning_rate": 7.310392107827498e-05, + "loss": 0.16110701560974122, + "step": 63380 + }, + { + "epoch": 0.27214651863682027, + "grad_norm": 0.022660892456769943, + "learning_rate": 7.309960935815736e-05, + "loss": 0.20464136600494384, + "step": 63390 + }, + { + "epoch": 0.2721894507268403, + "grad_norm": 2.540070056915283, + "learning_rate": 7.309529763803973e-05, + "loss": 0.3315227746963501, + "step": 63400 + }, + { + "epoch": 0.2722323828168603, + "grad_norm": 3.014178514480591, + "learning_rate": 7.30909859179221e-05, + "loss": 0.1401577115058899, + "step": 63410 + }, + { + "epoch": 0.2722753149068803, + "grad_norm": 2.405425786972046, + "learning_rate": 7.308667419780448e-05, + "loss": 0.5511850833892822, + "step": 63420 + }, + { + "epoch": 0.2723182469969003, + "grad_norm": 0.009172724559903145, + "learning_rate": 7.308236247768685e-05, + "loss": 0.318274998664856, + "step": 63430 + }, + { + "epoch": 0.2723611790869203, + "grad_norm": 0.16494855284690857, + "learning_rate": 7.307805075756923e-05, + "loss": 0.0355972558259964, + "step": 63440 + }, + { + "epoch": 0.2724041111769403, + "grad_norm": 2.663872718811035, + "learning_rate": 7.307373903745161e-05, + "loss": 0.29265077114105226, + "step": 63450 + }, + { + "epoch": 0.27244704326696034, + "grad_norm": 1.3054550886154175, + "learning_rate": 7.306942731733398e-05, + "loss": 0.08337785601615906, + "step": 63460 + }, + { + "epoch": 0.27248997535698033, + "grad_norm": 2.61496901512146, + "learning_rate": 7.306511559721636e-05, + "loss": 0.23691489696502685, + "step": 63470 + }, + { + "epoch": 0.2725329074470003, + "grad_norm": 0.04049558565020561, + "learning_rate": 7.306080387709874e-05, + "loss": 0.16273421049118042, + "step": 63480 + }, + { + "epoch": 0.27257583953702036, + "grad_norm": 1.6018909215927124, + "learning_rate": 7.30564921569811e-05, + "loss": 0.19723182916641235, + "step": 63490 + }, + { + "epoch": 0.27261877162704035, + "grad_norm": 0.0021466747857630253, + "learning_rate": 7.305218043686348e-05, + "loss": 0.025392404198646544, + "step": 63500 + }, + { + "epoch": 0.27266170371706033, + "grad_norm": 4.236203670501709, + "learning_rate": 7.304786871674586e-05, + "loss": 0.16003117561340333, + "step": 63510 + }, + { + "epoch": 0.2727046358070804, + "grad_norm": 0.028405891731381416, + "learning_rate": 7.304355699662824e-05, + "loss": 0.32063629627227785, + "step": 63520 + }, + { + "epoch": 0.27274756789710036, + "grad_norm": 0.028358174487948418, + "learning_rate": 7.303924527651061e-05, + "loss": 0.03217737674713135, + "step": 63530 + }, + { + "epoch": 0.27279049998712035, + "grad_norm": 0.0237318966537714, + "learning_rate": 7.303493355639299e-05, + "loss": 0.18059996366500855, + "step": 63540 + }, + { + "epoch": 0.2728334320771404, + "grad_norm": 0.002409103326499462, + "learning_rate": 7.303062183627537e-05, + "loss": 0.09812519550323487, + "step": 63550 + }, + { + "epoch": 0.2728763641671604, + "grad_norm": 0.16705992817878723, + "learning_rate": 7.302631011615774e-05, + "loss": 0.19015454053878783, + "step": 63560 + }, + { + "epoch": 0.27291929625718037, + "grad_norm": 0.0017375649185851216, + "learning_rate": 7.302199839604011e-05, + "loss": 0.18906515836715698, + "step": 63570 + }, + { + "epoch": 0.2729622283472004, + "grad_norm": 1.643397331237793, + "learning_rate": 7.301768667592249e-05, + "loss": 0.37471392154693606, + "step": 63580 + }, + { + "epoch": 0.2730051604372204, + "grad_norm": 0.008646286092698574, + "learning_rate": 7.301337495580486e-05, + "loss": 0.38129801750183107, + "step": 63590 + }, + { + "epoch": 0.27304809252724044, + "grad_norm": 3.7929863929748535, + "learning_rate": 7.300906323568725e-05, + "loss": 0.24006271362304688, + "step": 63600 + }, + { + "epoch": 0.2730910246172604, + "grad_norm": 5.849998474121094, + "learning_rate": 7.300475151556963e-05, + "loss": 0.13114572763442994, + "step": 63610 + }, + { + "epoch": 0.2731339567072804, + "grad_norm": 0.018327688798308372, + "learning_rate": 7.300043979545201e-05, + "loss": 0.08518844246864318, + "step": 63620 + }, + { + "epoch": 0.27317688879730045, + "grad_norm": 0.14316414296627045, + "learning_rate": 7.299612807533439e-05, + "loss": 0.052723509073257444, + "step": 63630 + }, + { + "epoch": 0.27321982088732044, + "grad_norm": 2.5703630447387695, + "learning_rate": 7.299181635521676e-05, + "loss": 0.26565487384796144, + "step": 63640 + }, + { + "epoch": 0.27326275297734043, + "grad_norm": 1.4851138591766357, + "learning_rate": 7.298750463509914e-05, + "loss": 0.18685510158538818, + "step": 63650 + }, + { + "epoch": 0.27330568506736047, + "grad_norm": 0.04608583077788353, + "learning_rate": 7.29831929149815e-05, + "loss": 0.11118690967559815, + "step": 63660 + }, + { + "epoch": 0.27334861715738046, + "grad_norm": 0.024671832099556923, + "learning_rate": 7.297888119486388e-05, + "loss": 0.30791258811950684, + "step": 63670 + }, + { + "epoch": 0.27339154924740044, + "grad_norm": 18.297765731811523, + "learning_rate": 7.297456947474626e-05, + "loss": 0.1248279333114624, + "step": 63680 + }, + { + "epoch": 0.2734344813374205, + "grad_norm": 34.01502990722656, + "learning_rate": 7.297025775462864e-05, + "loss": 0.20545308589935302, + "step": 63690 + }, + { + "epoch": 0.2734774134274405, + "grad_norm": 0.03677314892411232, + "learning_rate": 7.296594603451101e-05, + "loss": 0.21320362091064454, + "step": 63700 + }, + { + "epoch": 0.27352034551746046, + "grad_norm": 2.0875508785247803, + "learning_rate": 7.296163431439339e-05, + "loss": 0.3435171604156494, + "step": 63710 + }, + { + "epoch": 0.2735632776074805, + "grad_norm": 0.06830603629350662, + "learning_rate": 7.295732259427577e-05, + "loss": 0.17610445022583007, + "step": 63720 + }, + { + "epoch": 0.2736062096975005, + "grad_norm": 0.676224410533905, + "learning_rate": 7.295301087415815e-05, + "loss": 0.3897466897964478, + "step": 63730 + }, + { + "epoch": 0.2736491417875205, + "grad_norm": 4.646862983703613, + "learning_rate": 7.294869915404051e-05, + "loss": 0.28968157768249514, + "step": 63740 + }, + { + "epoch": 0.2736920738775405, + "grad_norm": 0.4177786409854889, + "learning_rate": 7.294438743392289e-05, + "loss": 0.266361141204834, + "step": 63750 + }, + { + "epoch": 0.2737350059675605, + "grad_norm": 0.4810597002506256, + "learning_rate": 7.294007571380526e-05, + "loss": 0.2344064950942993, + "step": 63760 + }, + { + "epoch": 0.2737779380575805, + "grad_norm": 0.0033130869269371033, + "learning_rate": 7.293576399368764e-05, + "loss": 0.15541367530822753, + "step": 63770 + }, + { + "epoch": 0.27382087014760054, + "grad_norm": 0.339785635471344, + "learning_rate": 7.293145227357002e-05, + "loss": 0.24946272373199463, + "step": 63780 + }, + { + "epoch": 0.2738638022376205, + "grad_norm": 0.017482534050941467, + "learning_rate": 7.29271405534524e-05, + "loss": 0.20425059795379638, + "step": 63790 + }, + { + "epoch": 0.2739067343276405, + "grad_norm": 7.477847576141357, + "learning_rate": 7.292282883333477e-05, + "loss": 0.14926481246948242, + "step": 63800 + }, + { + "epoch": 0.27394966641766055, + "grad_norm": 0.003075790125876665, + "learning_rate": 7.291851711321715e-05, + "loss": 0.18997302055358886, + "step": 63810 + }, + { + "epoch": 0.27399259850768054, + "grad_norm": 0.0006680086953565478, + "learning_rate": 7.291420539309953e-05, + "loss": 0.2547083616256714, + "step": 63820 + }, + { + "epoch": 0.2740355305977006, + "grad_norm": 0.021308038383722305, + "learning_rate": 7.29098936729819e-05, + "loss": 0.2349924325942993, + "step": 63830 + }, + { + "epoch": 0.27407846268772057, + "grad_norm": 4.248676300048828, + "learning_rate": 7.290558195286428e-05, + "loss": 0.4020249843597412, + "step": 63840 + }, + { + "epoch": 0.27412139477774056, + "grad_norm": 0.2813398540019989, + "learning_rate": 7.290127023274666e-05, + "loss": 0.30169265270233153, + "step": 63850 + }, + { + "epoch": 0.2741643268677606, + "grad_norm": 0.5681740641593933, + "learning_rate": 7.289695851262904e-05, + "loss": 0.10693715810775757, + "step": 63860 + }, + { + "epoch": 0.2742072589577806, + "grad_norm": 10.923131942749023, + "learning_rate": 7.289264679251141e-05, + "loss": 0.15194600820541382, + "step": 63870 + }, + { + "epoch": 0.2742501910478006, + "grad_norm": 0.02402135357260704, + "learning_rate": 7.288833507239379e-05, + "loss": 0.17886065244674682, + "step": 63880 + }, + { + "epoch": 0.2742931231378206, + "grad_norm": 1.7143903970718384, + "learning_rate": 7.288402335227617e-05, + "loss": 0.40548253059387207, + "step": 63890 + }, + { + "epoch": 0.2743360552278406, + "grad_norm": 0.28345656394958496, + "learning_rate": 7.287971163215853e-05, + "loss": 0.13184144496917724, + "step": 63900 + }, + { + "epoch": 0.2743789873178606, + "grad_norm": 0.12064622342586517, + "learning_rate": 7.287539991204091e-05, + "loss": 0.30831031799316405, + "step": 63910 + }, + { + "epoch": 0.27442191940788063, + "grad_norm": 1.588731050491333, + "learning_rate": 7.287108819192329e-05, + "loss": 0.10332268476486206, + "step": 63920 + }, + { + "epoch": 0.2744648514979006, + "grad_norm": 0.0007470548735000193, + "learning_rate": 7.286677647180567e-05, + "loss": 0.15131561756134032, + "step": 63930 + }, + { + "epoch": 0.2745077835879206, + "grad_norm": 1.6601921319961548, + "learning_rate": 7.286246475168804e-05, + "loss": 0.19120069742202758, + "step": 63940 + }, + { + "epoch": 0.27455071567794065, + "grad_norm": 0.511339545249939, + "learning_rate": 7.285815303157042e-05, + "loss": 0.16427149772644042, + "step": 63950 + }, + { + "epoch": 0.27459364776796064, + "grad_norm": 1.9782980680465698, + "learning_rate": 7.28538413114528e-05, + "loss": 0.22477269172668457, + "step": 63960 + }, + { + "epoch": 0.2746365798579806, + "grad_norm": 0.11178219318389893, + "learning_rate": 7.284952959133517e-05, + "loss": 0.09104756116867066, + "step": 63970 + }, + { + "epoch": 0.27467951194800067, + "grad_norm": 0.16948817670345306, + "learning_rate": 7.284521787121754e-05, + "loss": 0.3317331075668335, + "step": 63980 + }, + { + "epoch": 0.27472244403802065, + "grad_norm": 0.03879360482096672, + "learning_rate": 7.284090615109992e-05, + "loss": 0.3537483215332031, + "step": 63990 + }, + { + "epoch": 0.27476537612804064, + "grad_norm": 0.005190638825297356, + "learning_rate": 7.28365944309823e-05, + "loss": 0.1923106789588928, + "step": 64000 + }, + { + "epoch": 0.27476537612804064, + "eval_loss": 0.4264955222606659, + "eval_runtime": 27.3919, + "eval_samples_per_second": 3.651, + "eval_steps_per_second": 3.651, + "step": 64000 + }, + { + "epoch": 0.2748083082180607, + "grad_norm": 11.412931442260742, + "learning_rate": 7.283228271086467e-05, + "loss": 0.1319342851638794, + "step": 64010 + }, + { + "epoch": 0.27485124030808067, + "grad_norm": 2.2989232540130615, + "learning_rate": 7.282797099074705e-05, + "loss": 0.3625922441482544, + "step": 64020 + }, + { + "epoch": 0.2748941723981007, + "grad_norm": 0.015120278112590313, + "learning_rate": 7.282365927062943e-05, + "loss": 0.21226191520690918, + "step": 64030 + }, + { + "epoch": 0.2749371044881207, + "grad_norm": 1.596508502960205, + "learning_rate": 7.28193475505118e-05, + "loss": 0.2687784433364868, + "step": 64040 + }, + { + "epoch": 0.2749800365781407, + "grad_norm": 0.04696401581168175, + "learning_rate": 7.281503583039418e-05, + "loss": 0.19433155059814453, + "step": 64050 + }, + { + "epoch": 0.2750229686681607, + "grad_norm": 2.4514739513397217, + "learning_rate": 7.281072411027656e-05, + "loss": 0.27985870838165283, + "step": 64060 + }, + { + "epoch": 0.2750659007581807, + "grad_norm": 0.027478374540805817, + "learning_rate": 7.280641239015893e-05, + "loss": 0.3420846462249756, + "step": 64070 + }, + { + "epoch": 0.2751088328482007, + "grad_norm": 0.002903494518250227, + "learning_rate": 7.280210067004131e-05, + "loss": 0.0563319206237793, + "step": 64080 + }, + { + "epoch": 0.27515176493822074, + "grad_norm": 1.0330356359481812, + "learning_rate": 7.279778894992369e-05, + "loss": 0.38048701286315917, + "step": 64090 + }, + { + "epoch": 0.27519469702824073, + "grad_norm": 0.028292180970311165, + "learning_rate": 7.279347722980607e-05, + "loss": 0.13075207471847533, + "step": 64100 + }, + { + "epoch": 0.2752376291182607, + "grad_norm": 0.10859852284193039, + "learning_rate": 7.278916550968844e-05, + "loss": 0.03039872646331787, + "step": 64110 + }, + { + "epoch": 0.27528056120828076, + "grad_norm": 0.02786979079246521, + "learning_rate": 7.278485378957082e-05, + "loss": 0.1169194221496582, + "step": 64120 + }, + { + "epoch": 0.27532349329830075, + "grad_norm": 16.22138023376465, + "learning_rate": 7.27805420694532e-05, + "loss": 0.2297053813934326, + "step": 64130 + }, + { + "epoch": 0.27536642538832073, + "grad_norm": 0.013343838974833488, + "learning_rate": 7.277623034933558e-05, + "loss": 0.0033049676567316055, + "step": 64140 + }, + { + "epoch": 0.2754093574783408, + "grad_norm": 0.0016603783005848527, + "learning_rate": 7.277191862921794e-05, + "loss": 0.40090460777282716, + "step": 64150 + }, + { + "epoch": 0.27545228956836076, + "grad_norm": 0.025042593479156494, + "learning_rate": 7.276760690910032e-05, + "loss": 0.10281308889389038, + "step": 64160 + }, + { + "epoch": 0.27549522165838075, + "grad_norm": 0.031472593545913696, + "learning_rate": 7.27632951889827e-05, + "loss": 0.05010194182395935, + "step": 64170 + }, + { + "epoch": 0.2755381537484008, + "grad_norm": 7.79873514175415, + "learning_rate": 7.275898346886507e-05, + "loss": 0.3900416851043701, + "step": 64180 + }, + { + "epoch": 0.2755810858384208, + "grad_norm": 0.182487353682518, + "learning_rate": 7.275467174874745e-05, + "loss": 0.2953609228134155, + "step": 64190 + }, + { + "epoch": 0.27562401792844077, + "grad_norm": 2.5202181339263916, + "learning_rate": 7.275036002862983e-05, + "loss": 0.19363387823104858, + "step": 64200 + }, + { + "epoch": 0.2756669500184608, + "grad_norm": 0.009070219472050667, + "learning_rate": 7.27460483085122e-05, + "loss": 0.10885969400405884, + "step": 64210 + }, + { + "epoch": 0.2757098821084808, + "grad_norm": 0.16483668982982635, + "learning_rate": 7.274173658839458e-05, + "loss": 0.19616938829421998, + "step": 64220 + }, + { + "epoch": 0.2757528141985008, + "grad_norm": 1.6876084804534912, + "learning_rate": 7.273742486827695e-05, + "loss": 0.3560824394226074, + "step": 64230 + }, + { + "epoch": 0.2757957462885208, + "grad_norm": 0.10044686496257782, + "learning_rate": 7.273311314815932e-05, + "loss": 0.3663048505783081, + "step": 64240 + }, + { + "epoch": 0.2758386783785408, + "grad_norm": 2.984308958053589, + "learning_rate": 7.27288014280417e-05, + "loss": 0.31598410606384275, + "step": 64250 + }, + { + "epoch": 0.27588161046856086, + "grad_norm": 0.024099402129650116, + "learning_rate": 7.272448970792408e-05, + "loss": 0.3546855688095093, + "step": 64260 + }, + { + "epoch": 0.27592454255858084, + "grad_norm": 0.4444213807582855, + "learning_rate": 7.272017798780645e-05, + "loss": 0.3234513998031616, + "step": 64270 + }, + { + "epoch": 0.27596747464860083, + "grad_norm": 0.12474822998046875, + "learning_rate": 7.271586626768883e-05, + "loss": 0.17412568330764772, + "step": 64280 + }, + { + "epoch": 0.2760104067386209, + "grad_norm": 15.606555938720703, + "learning_rate": 7.271155454757121e-05, + "loss": 0.2279426336288452, + "step": 64290 + }, + { + "epoch": 0.27605333882864086, + "grad_norm": 0.028739456087350845, + "learning_rate": 7.270724282745359e-05, + "loss": 0.2794104337692261, + "step": 64300 + }, + { + "epoch": 0.27609627091866085, + "grad_norm": 1.804038643836975, + "learning_rate": 7.270293110733596e-05, + "loss": 0.22412712574005128, + "step": 64310 + }, + { + "epoch": 0.2761392030086809, + "grad_norm": 2.9447500705718994, + "learning_rate": 7.269861938721834e-05, + "loss": 0.239043664932251, + "step": 64320 + }, + { + "epoch": 0.2761821350987009, + "grad_norm": 0.13474737107753754, + "learning_rate": 7.269430766710072e-05, + "loss": 0.33125255107879636, + "step": 64330 + }, + { + "epoch": 0.27622506718872086, + "grad_norm": 0.08215455710887909, + "learning_rate": 7.26899959469831e-05, + "loss": 0.10604830980300903, + "step": 64340 + }, + { + "epoch": 0.2762679992787409, + "grad_norm": 1.0933629274368286, + "learning_rate": 7.268568422686547e-05, + "loss": 0.22616326808929443, + "step": 64350 + }, + { + "epoch": 0.2763109313687609, + "grad_norm": 1.9105830192565918, + "learning_rate": 7.268137250674785e-05, + "loss": 0.2156665563583374, + "step": 64360 + }, + { + "epoch": 0.2763538634587809, + "grad_norm": 0.22264964878559113, + "learning_rate": 7.267706078663023e-05, + "loss": 0.17556583881378174, + "step": 64370 + }, + { + "epoch": 0.2763967955488009, + "grad_norm": 2.2902016639709473, + "learning_rate": 7.26727490665126e-05, + "loss": 0.15605368614196777, + "step": 64380 + }, + { + "epoch": 0.2764397276388209, + "grad_norm": 3.392604351043701, + "learning_rate": 7.266843734639498e-05, + "loss": 0.3492011070251465, + "step": 64390 + }, + { + "epoch": 0.2764826597288409, + "grad_norm": 1.067148208618164, + "learning_rate": 7.266412562627735e-05, + "loss": 0.33337039947509767, + "step": 64400 + }, + { + "epoch": 0.27652559181886094, + "grad_norm": 3.995007038116455, + "learning_rate": 7.265981390615972e-05, + "loss": 0.2956079483032227, + "step": 64410 + }, + { + "epoch": 0.2765685239088809, + "grad_norm": 2.1864147186279297, + "learning_rate": 7.26555021860421e-05, + "loss": 0.2727602481842041, + "step": 64420 + }, + { + "epoch": 0.2766114559989009, + "grad_norm": 0.38919079303741455, + "learning_rate": 7.265119046592448e-05, + "loss": 0.2046382188796997, + "step": 64430 + }, + { + "epoch": 0.27665438808892096, + "grad_norm": 0.08920589834451675, + "learning_rate": 7.264687874580686e-05, + "loss": 0.009954053908586502, + "step": 64440 + }, + { + "epoch": 0.27669732017894094, + "grad_norm": 0.05094486102461815, + "learning_rate": 7.264256702568923e-05, + "loss": 0.08167248368263244, + "step": 64450 + }, + { + "epoch": 0.276740252268961, + "grad_norm": 1.2057560682296753, + "learning_rate": 7.263825530557161e-05, + "loss": 0.21390509605407715, + "step": 64460 + }, + { + "epoch": 0.27678318435898097, + "grad_norm": 1.2782899141311646, + "learning_rate": 7.263394358545399e-05, + "loss": 0.27840044498443606, + "step": 64470 + }, + { + "epoch": 0.27682611644900096, + "grad_norm": 0.01978636533021927, + "learning_rate": 7.262963186533635e-05, + "loss": 0.08450507521629333, + "step": 64480 + }, + { + "epoch": 0.276869048539021, + "grad_norm": 0.07557360827922821, + "learning_rate": 7.262532014521873e-05, + "loss": 0.21510164737701415, + "step": 64490 + }, + { + "epoch": 0.276911980629041, + "grad_norm": 1.8195322751998901, + "learning_rate": 7.26210084251011e-05, + "loss": 0.38956317901611326, + "step": 64500 + }, + { + "epoch": 0.276954912719061, + "grad_norm": 0.05904494225978851, + "learning_rate": 7.261669670498348e-05, + "loss": 0.3122358798980713, + "step": 64510 + }, + { + "epoch": 0.276997844809081, + "grad_norm": 4.156544208526611, + "learning_rate": 7.261238498486586e-05, + "loss": 0.22088017463684081, + "step": 64520 + }, + { + "epoch": 0.277040776899101, + "grad_norm": 2.8702967166900635, + "learning_rate": 7.260807326474824e-05, + "loss": 0.1237061619758606, + "step": 64530 + }, + { + "epoch": 0.277083708989121, + "grad_norm": 0.3356216549873352, + "learning_rate": 7.260376154463062e-05, + "loss": 0.2359468460083008, + "step": 64540 + }, + { + "epoch": 0.27712664107914103, + "grad_norm": 44.1283073425293, + "learning_rate": 7.259944982451299e-05, + "loss": 0.40196738243103025, + "step": 64550 + }, + { + "epoch": 0.277169573169161, + "grad_norm": 0.044547755271196365, + "learning_rate": 7.259513810439537e-05, + "loss": 0.3468801736831665, + "step": 64560 + }, + { + "epoch": 0.277212505259181, + "grad_norm": 4.803623199462891, + "learning_rate": 7.259082638427775e-05, + "loss": 0.15955498218536376, + "step": 64570 + }, + { + "epoch": 0.27725543734920105, + "grad_norm": 0.03871012479066849, + "learning_rate": 7.258651466416012e-05, + "loss": 0.09484660625457764, + "step": 64580 + }, + { + "epoch": 0.27729836943922104, + "grad_norm": 2.620669364929199, + "learning_rate": 7.25822029440425e-05, + "loss": 0.25763897895812987, + "step": 64590 + }, + { + "epoch": 0.277341301529241, + "grad_norm": 0.027607867494225502, + "learning_rate": 7.257789122392488e-05, + "loss": 0.14064944982528688, + "step": 64600 + }, + { + "epoch": 0.27738423361926107, + "grad_norm": 0.013144542463123798, + "learning_rate": 7.257357950380726e-05, + "loss": 0.36486527919769285, + "step": 64610 + }, + { + "epoch": 0.27742716570928105, + "grad_norm": 0.011130384169518948, + "learning_rate": 7.256926778368963e-05, + "loss": 0.15375130176544188, + "step": 64620 + }, + { + "epoch": 0.27747009779930104, + "grad_norm": 0.0812644436955452, + "learning_rate": 7.256495606357201e-05, + "loss": 0.25952873229980467, + "step": 64630 + }, + { + "epoch": 0.2775130298893211, + "grad_norm": 1.9299334287643433, + "learning_rate": 7.256064434345438e-05, + "loss": 0.31588876247406006, + "step": 64640 + }, + { + "epoch": 0.27755596197934107, + "grad_norm": 0.5452513694763184, + "learning_rate": 7.255633262333675e-05, + "loss": 0.2496518611907959, + "step": 64650 + }, + { + "epoch": 0.27759889406936106, + "grad_norm": 1.4004237651824951, + "learning_rate": 7.255202090321913e-05, + "loss": 0.2847594738006592, + "step": 64660 + }, + { + "epoch": 0.2776418261593811, + "grad_norm": 0.16185790300369263, + "learning_rate": 7.254770918310151e-05, + "loss": 0.2840761184692383, + "step": 64670 + }, + { + "epoch": 0.2776847582494011, + "grad_norm": 5.281464099884033, + "learning_rate": 7.254339746298388e-05, + "loss": 0.23582305908203124, + "step": 64680 + }, + { + "epoch": 0.27772769033942113, + "grad_norm": 0.009189880453050137, + "learning_rate": 7.253908574286626e-05, + "loss": 0.2242586851119995, + "step": 64690 + }, + { + "epoch": 0.2777706224294411, + "grad_norm": 0.04949837923049927, + "learning_rate": 7.253477402274864e-05, + "loss": 0.1539124608039856, + "step": 64700 + }, + { + "epoch": 0.2778135545194611, + "grad_norm": 0.002345997141674161, + "learning_rate": 7.253046230263102e-05, + "loss": 0.2814460515975952, + "step": 64710 + }, + { + "epoch": 0.27785648660948115, + "grad_norm": 6.816084861755371, + "learning_rate": 7.25261505825134e-05, + "loss": 0.247141432762146, + "step": 64720 + }, + { + "epoch": 0.27789941869950113, + "grad_norm": 3.8454461097717285, + "learning_rate": 7.252183886239576e-05, + "loss": 0.12018647193908691, + "step": 64730 + }, + { + "epoch": 0.2779423507895211, + "grad_norm": 2.1291558742523193, + "learning_rate": 7.251752714227814e-05, + "loss": 0.2656750202178955, + "step": 64740 + }, + { + "epoch": 0.27798528287954116, + "grad_norm": 1.8832786083221436, + "learning_rate": 7.251321542216051e-05, + "loss": 0.40599308013916013, + "step": 64750 + }, + { + "epoch": 0.27802821496956115, + "grad_norm": 1.8022648096084595, + "learning_rate": 7.250890370204289e-05, + "loss": 0.3666477918624878, + "step": 64760 + }, + { + "epoch": 0.27807114705958114, + "grad_norm": 4.709822654724121, + "learning_rate": 7.250459198192527e-05, + "loss": 0.2990487813949585, + "step": 64770 + }, + { + "epoch": 0.2781140791496012, + "grad_norm": 1.3807032108306885, + "learning_rate": 7.250028026180766e-05, + "loss": 0.3683363437652588, + "step": 64780 + }, + { + "epoch": 0.27815701123962117, + "grad_norm": 0.8225862979888916, + "learning_rate": 7.249596854169004e-05, + "loss": 0.29018213748931887, + "step": 64790 + }, + { + "epoch": 0.27819994332964115, + "grad_norm": 0.14179366827011108, + "learning_rate": 7.249165682157241e-05, + "loss": 0.4528830051422119, + "step": 64800 + }, + { + "epoch": 0.2782428754196612, + "grad_norm": 0.004524201154708862, + "learning_rate": 7.248734510145478e-05, + "loss": 0.08117425441741943, + "step": 64810 + }, + { + "epoch": 0.2782858075096812, + "grad_norm": 1.0559335947036743, + "learning_rate": 7.248303338133715e-05, + "loss": 0.366524338722229, + "step": 64820 + }, + { + "epoch": 0.27832873959970117, + "grad_norm": 2.1862030029296875, + "learning_rate": 7.247872166121953e-05, + "loss": 0.4338692188262939, + "step": 64830 + }, + { + "epoch": 0.2783716716897212, + "grad_norm": 1.6690665483474731, + "learning_rate": 7.247440994110191e-05, + "loss": 0.3967538833618164, + "step": 64840 + }, + { + "epoch": 0.2784146037797412, + "grad_norm": 0.0165514275431633, + "learning_rate": 7.247009822098429e-05, + "loss": 0.14806462526321412, + "step": 64850 + }, + { + "epoch": 0.2784575358697612, + "grad_norm": 0.47653165459632874, + "learning_rate": 7.246578650086666e-05, + "loss": 0.176730740070343, + "step": 64860 + }, + { + "epoch": 0.27850046795978123, + "grad_norm": 0.2633458077907562, + "learning_rate": 7.246147478074904e-05, + "loss": 0.191974937915802, + "step": 64870 + }, + { + "epoch": 0.2785434000498012, + "grad_norm": 0.10138729214668274, + "learning_rate": 7.245716306063142e-05, + "loss": 0.2707113742828369, + "step": 64880 + }, + { + "epoch": 0.27858633213982126, + "grad_norm": 0.013255268335342407, + "learning_rate": 7.245285134051378e-05, + "loss": 0.03636242747306824, + "step": 64890 + }, + { + "epoch": 0.27862926422984124, + "grad_norm": 0.002024088054895401, + "learning_rate": 7.244853962039616e-05, + "loss": 0.14607818126678468, + "step": 64900 + }, + { + "epoch": 0.27867219631986123, + "grad_norm": 0.0680466890335083, + "learning_rate": 7.244422790027854e-05, + "loss": 0.2507662773132324, + "step": 64910 + }, + { + "epoch": 0.2787151284098813, + "grad_norm": 0.5838708281517029, + "learning_rate": 7.243991618016091e-05, + "loss": 0.3808858394622803, + "step": 64920 + }, + { + "epoch": 0.27875806049990126, + "grad_norm": 0.08791586011648178, + "learning_rate": 7.243560446004329e-05, + "loss": 0.31563124656677244, + "step": 64930 + }, + { + "epoch": 0.27880099258992125, + "grad_norm": 4.641262531280518, + "learning_rate": 7.243129273992567e-05, + "loss": 0.10856708288192748, + "step": 64940 + }, + { + "epoch": 0.2788439246799413, + "grad_norm": 3.5792038440704346, + "learning_rate": 7.242698101980805e-05, + "loss": 0.19974257946014404, + "step": 64950 + }, + { + "epoch": 0.2788868567699613, + "grad_norm": 0.45726874470710754, + "learning_rate": 7.242266929969042e-05, + "loss": 0.39138312339782716, + "step": 64960 + }, + { + "epoch": 0.27892978885998126, + "grad_norm": 26.862560272216797, + "learning_rate": 7.241835757957279e-05, + "loss": 0.31828885078430175, + "step": 64970 + }, + { + "epoch": 0.2789727209500013, + "grad_norm": 2.756909132003784, + "learning_rate": 7.241404585945516e-05, + "loss": 0.2442950963973999, + "step": 64980 + }, + { + "epoch": 0.2790156530400213, + "grad_norm": 0.04665480926632881, + "learning_rate": 7.240973413933754e-05, + "loss": 0.3142120838165283, + "step": 64990 + }, + { + "epoch": 0.2790585851300413, + "grad_norm": 1.2649515867233276, + "learning_rate": 7.240542241921993e-05, + "loss": 0.3911418914794922, + "step": 65000 + }, + { + "epoch": 0.2790585851300413, + "eval_loss": 0.4333057403564453, + "eval_runtime": 27.4187, + "eval_samples_per_second": 3.647, + "eval_steps_per_second": 3.647, + "step": 65000 + }, + { + "epoch": 0.2791015172200613, + "grad_norm": 2.0587477684020996, + "learning_rate": 7.240111069910231e-05, + "loss": 0.3389150142669678, + "step": 65010 + }, + { + "epoch": 0.2791444493100813, + "grad_norm": 3.1368260383605957, + "learning_rate": 7.239679897898469e-05, + "loss": 0.2532327651977539, + "step": 65020 + }, + { + "epoch": 0.2791873814001013, + "grad_norm": 0.43125808238983154, + "learning_rate": 7.239248725886706e-05, + "loss": 0.1379924535751343, + "step": 65030 + }, + { + "epoch": 0.27923031349012134, + "grad_norm": 1.2250033617019653, + "learning_rate": 7.238817553874944e-05, + "loss": 0.17046927213668822, + "step": 65040 + }, + { + "epoch": 0.2792732455801413, + "grad_norm": 0.06205086410045624, + "learning_rate": 7.23838638186318e-05, + "loss": 0.1221045970916748, + "step": 65050 + }, + { + "epoch": 0.2793161776701613, + "grad_norm": 4.098349571228027, + "learning_rate": 7.237955209851418e-05, + "loss": 0.350847053527832, + "step": 65060 + }, + { + "epoch": 0.27935910976018136, + "grad_norm": 0.5678528547286987, + "learning_rate": 7.237524037839656e-05, + "loss": 0.3726787567138672, + "step": 65070 + }, + { + "epoch": 0.27940204185020134, + "grad_norm": 0.13252170383930206, + "learning_rate": 7.237092865827894e-05, + "loss": 0.2217256784439087, + "step": 65080 + }, + { + "epoch": 0.27944497394022133, + "grad_norm": 15.361329078674316, + "learning_rate": 7.236661693816132e-05, + "loss": 0.2631575345993042, + "step": 65090 + }, + { + "epoch": 0.2794879060302414, + "grad_norm": 0.052907731384038925, + "learning_rate": 7.236230521804369e-05, + "loss": 0.06527657508850097, + "step": 65100 + }, + { + "epoch": 0.27953083812026136, + "grad_norm": 0.037164174020290375, + "learning_rate": 7.235799349792607e-05, + "loss": 0.25826058387756345, + "step": 65110 + }, + { + "epoch": 0.2795737702102814, + "grad_norm": 0.03161971643567085, + "learning_rate": 7.235368177780845e-05, + "loss": 0.21099441051483153, + "step": 65120 + }, + { + "epoch": 0.2796167023003014, + "grad_norm": 1.0425679683685303, + "learning_rate": 7.234937005769082e-05, + "loss": 0.203094482421875, + "step": 65130 + }, + { + "epoch": 0.2796596343903214, + "grad_norm": 0.013792422600090504, + "learning_rate": 7.234505833757319e-05, + "loss": 0.38997640609741213, + "step": 65140 + }, + { + "epoch": 0.2797025664803414, + "grad_norm": 0.2896706759929657, + "learning_rate": 7.234074661745557e-05, + "loss": 0.09263712167739868, + "step": 65150 + }, + { + "epoch": 0.2797454985703614, + "grad_norm": 4.325671672821045, + "learning_rate": 7.233643489733794e-05, + "loss": 0.42386541366577146, + "step": 65160 + }, + { + "epoch": 0.2797884306603814, + "grad_norm": 0.24621273577213287, + "learning_rate": 7.233212317722032e-05, + "loss": 0.16344807147979737, + "step": 65170 + }, + { + "epoch": 0.27983136275040144, + "grad_norm": 1.3230857849121094, + "learning_rate": 7.23278114571027e-05, + "loss": 0.010197050869464874, + "step": 65180 + }, + { + "epoch": 0.2798742948404214, + "grad_norm": 0.006933805998414755, + "learning_rate": 7.232349973698508e-05, + "loss": 0.22311086654663087, + "step": 65190 + }, + { + "epoch": 0.2799172269304414, + "grad_norm": 0.058281030505895615, + "learning_rate": 7.231918801686745e-05, + "loss": 0.2943753719329834, + "step": 65200 + }, + { + "epoch": 0.27996015902046145, + "grad_norm": 3.4062752723693848, + "learning_rate": 7.231487629674983e-05, + "loss": 0.3292430877685547, + "step": 65210 + }, + { + "epoch": 0.28000309111048144, + "grad_norm": 0.09572996944189072, + "learning_rate": 7.231056457663221e-05, + "loss": 0.21628963947296143, + "step": 65220 + }, + { + "epoch": 0.2800460232005014, + "grad_norm": 1.1184502840042114, + "learning_rate": 7.230625285651458e-05, + "loss": 0.23993027210235596, + "step": 65230 + }, + { + "epoch": 0.28008895529052147, + "grad_norm": 0.04423844814300537, + "learning_rate": 7.230194113639696e-05, + "loss": 0.24691359996795653, + "step": 65240 + }, + { + "epoch": 0.28013188738054146, + "grad_norm": 0.10323113948106766, + "learning_rate": 7.229762941627934e-05, + "loss": 0.17943016290664673, + "step": 65250 + }, + { + "epoch": 0.28017481947056144, + "grad_norm": 0.00937010906636715, + "learning_rate": 7.229331769616172e-05, + "loss": 0.3377179384231567, + "step": 65260 + }, + { + "epoch": 0.2802177515605815, + "grad_norm": 0.0016765515320003033, + "learning_rate": 7.22890059760441e-05, + "loss": 0.32412896156311033, + "step": 65270 + }, + { + "epoch": 0.28026068365060147, + "grad_norm": 0.14862895011901855, + "learning_rate": 7.228469425592647e-05, + "loss": 0.23623664379119874, + "step": 65280 + }, + { + "epoch": 0.28030361574062146, + "grad_norm": 1.2214627265930176, + "learning_rate": 7.228038253580885e-05, + "loss": 0.23559386730194093, + "step": 65290 + }, + { + "epoch": 0.2803465478306415, + "grad_norm": 1.4163013696670532, + "learning_rate": 7.227607081569121e-05, + "loss": 0.10575599670410156, + "step": 65300 + }, + { + "epoch": 0.2803894799206615, + "grad_norm": 0.042076561599969864, + "learning_rate": 7.227175909557359e-05, + "loss": 0.2040954351425171, + "step": 65310 + }, + { + "epoch": 0.28043241201068153, + "grad_norm": 0.006046614143997431, + "learning_rate": 7.226744737545597e-05, + "loss": 0.2361140012741089, + "step": 65320 + }, + { + "epoch": 0.2804753441007015, + "grad_norm": 2.3810617923736572, + "learning_rate": 7.226313565533834e-05, + "loss": 0.20760879516601563, + "step": 65330 + }, + { + "epoch": 0.2805182761907215, + "grad_norm": 0.013670995831489563, + "learning_rate": 7.225882393522072e-05, + "loss": 0.4020346164703369, + "step": 65340 + }, + { + "epoch": 0.28056120828074155, + "grad_norm": 0.6690922379493713, + "learning_rate": 7.22545122151031e-05, + "loss": 0.29248411655426027, + "step": 65350 + }, + { + "epoch": 0.28060414037076153, + "grad_norm": 1.0356889963150024, + "learning_rate": 7.225020049498548e-05, + "loss": 0.41963953971862794, + "step": 65360 + }, + { + "epoch": 0.2806470724607815, + "grad_norm": 0.17851421236991882, + "learning_rate": 7.224588877486785e-05, + "loss": 0.2683266639709473, + "step": 65370 + }, + { + "epoch": 0.28069000455080156, + "grad_norm": 0.002332353964447975, + "learning_rate": 7.224157705475022e-05, + "loss": 0.1336469292640686, + "step": 65380 + }, + { + "epoch": 0.28073293664082155, + "grad_norm": 1.1590781211853027, + "learning_rate": 7.22372653346326e-05, + "loss": 0.5141021251678467, + "step": 65390 + }, + { + "epoch": 0.28077586873084154, + "grad_norm": 0.9057397842407227, + "learning_rate": 7.223295361451497e-05, + "loss": 0.19209452867507934, + "step": 65400 + }, + { + "epoch": 0.2808188008208616, + "grad_norm": 0.00840451568365097, + "learning_rate": 7.222864189439735e-05, + "loss": 0.1584153175354004, + "step": 65410 + }, + { + "epoch": 0.28086173291088157, + "grad_norm": 2.096689224243164, + "learning_rate": 7.222433017427973e-05, + "loss": 0.26500711441040037, + "step": 65420 + }, + { + "epoch": 0.28090466500090155, + "grad_norm": 0.008882677182555199, + "learning_rate": 7.22200184541621e-05, + "loss": 0.28088667392730715, + "step": 65430 + }, + { + "epoch": 0.2809475970909216, + "grad_norm": 0.6018815636634827, + "learning_rate": 7.221570673404448e-05, + "loss": 0.1276005268096924, + "step": 65440 + }, + { + "epoch": 0.2809905291809416, + "grad_norm": 5.839265823364258, + "learning_rate": 7.221139501392686e-05, + "loss": 0.2710998773574829, + "step": 65450 + }, + { + "epoch": 0.28103346127096157, + "grad_norm": 1.5453418493270874, + "learning_rate": 7.220708329380924e-05, + "loss": 0.3031306266784668, + "step": 65460 + }, + { + "epoch": 0.2810763933609816, + "grad_norm": 0.0062354025430977345, + "learning_rate": 7.220277157369161e-05, + "loss": 0.13311941623687745, + "step": 65470 + }, + { + "epoch": 0.2811193254510016, + "grad_norm": 0.000264729984337464, + "learning_rate": 7.219845985357399e-05, + "loss": 0.2866328954696655, + "step": 65480 + }, + { + "epoch": 0.2811622575410216, + "grad_norm": 0.12876111268997192, + "learning_rate": 7.219414813345637e-05, + "loss": 0.21349844932556153, + "step": 65490 + }, + { + "epoch": 0.28120518963104163, + "grad_norm": 0.243903249502182, + "learning_rate": 7.218983641333875e-05, + "loss": 0.15504903793334962, + "step": 65500 + }, + { + "epoch": 0.2812481217210616, + "grad_norm": 0.019903426989912987, + "learning_rate": 7.218552469322112e-05, + "loss": 0.19683150053024293, + "step": 65510 + }, + { + "epoch": 0.2812910538110816, + "grad_norm": 0.027894608676433563, + "learning_rate": 7.21812129731035e-05, + "loss": 0.15850646495819093, + "step": 65520 + }, + { + "epoch": 0.28133398590110165, + "grad_norm": 0.2418050765991211, + "learning_rate": 7.217690125298588e-05, + "loss": 0.2152719497680664, + "step": 65530 + }, + { + "epoch": 0.28137691799112163, + "grad_norm": 0.0038344464264810085, + "learning_rate": 7.217258953286826e-05, + "loss": 0.2340897798538208, + "step": 65540 + }, + { + "epoch": 0.2814198500811417, + "grad_norm": 1.888227939605713, + "learning_rate": 7.216827781275062e-05, + "loss": 0.18425893783569336, + "step": 65550 + }, + { + "epoch": 0.28146278217116166, + "grad_norm": 0.010996916331350803, + "learning_rate": 7.2163966092633e-05, + "loss": 0.13371667861938477, + "step": 65560 + }, + { + "epoch": 0.28150571426118165, + "grad_norm": 0.4016614258289337, + "learning_rate": 7.215965437251537e-05, + "loss": 0.3497531652450562, + "step": 65570 + }, + { + "epoch": 0.2815486463512017, + "grad_norm": 0.10766914486885071, + "learning_rate": 7.215534265239775e-05, + "loss": 0.0871292769908905, + "step": 65580 + }, + { + "epoch": 0.2815915784412217, + "grad_norm": 1.6982859373092651, + "learning_rate": 7.215103093228013e-05, + "loss": 0.38111917972564696, + "step": 65590 + }, + { + "epoch": 0.28163451053124167, + "grad_norm": 0.00048144563334062696, + "learning_rate": 7.21467192121625e-05, + "loss": 0.02695021629333496, + "step": 65600 + }, + { + "epoch": 0.2816774426212617, + "grad_norm": 0.06240568682551384, + "learning_rate": 7.214240749204488e-05, + "loss": 0.1083644151687622, + "step": 65610 + }, + { + "epoch": 0.2817203747112817, + "grad_norm": 0.0006455762195400894, + "learning_rate": 7.213809577192726e-05, + "loss": 0.011432316899299622, + "step": 65620 + }, + { + "epoch": 0.2817633068013017, + "grad_norm": 0.010076041333377361, + "learning_rate": 7.213378405180962e-05, + "loss": 0.1570556640625, + "step": 65630 + }, + { + "epoch": 0.2818062388913217, + "grad_norm": 1.5382864475250244, + "learning_rate": 7.2129472331692e-05, + "loss": 0.21324005126953124, + "step": 65640 + }, + { + "epoch": 0.2818491709813417, + "grad_norm": 1.173854947090149, + "learning_rate": 7.212516061157438e-05, + "loss": 0.18811701536178588, + "step": 65650 + }, + { + "epoch": 0.2818921030713617, + "grad_norm": 0.16060331463813782, + "learning_rate": 7.212084889145676e-05, + "loss": 0.25907483100891116, + "step": 65660 + }, + { + "epoch": 0.28193503516138174, + "grad_norm": 0.09696793556213379, + "learning_rate": 7.211653717133913e-05, + "loss": 0.5335704803466796, + "step": 65670 + }, + { + "epoch": 0.28197796725140173, + "grad_norm": 3.50130558013916, + "learning_rate": 7.211222545122151e-05, + "loss": 0.32176215648651124, + "step": 65680 + }, + { + "epoch": 0.2820208993414217, + "grad_norm": 0.09307514131069183, + "learning_rate": 7.210791373110389e-05, + "loss": 0.37757627964019774, + "step": 65690 + }, + { + "epoch": 0.28206383143144176, + "grad_norm": 0.23182889819145203, + "learning_rate": 7.210360201098627e-05, + "loss": 0.30493721961975095, + "step": 65700 + }, + { + "epoch": 0.28210676352146175, + "grad_norm": 0.06591078639030457, + "learning_rate": 7.209929029086864e-05, + "loss": 0.20234324932098388, + "step": 65710 + }, + { + "epoch": 0.28214969561148173, + "grad_norm": 0.02225450985133648, + "learning_rate": 7.209497857075102e-05, + "loss": 0.35156002044677737, + "step": 65720 + }, + { + "epoch": 0.2821926277015018, + "grad_norm": 1.354718804359436, + "learning_rate": 7.20906668506334e-05, + "loss": 0.1738152265548706, + "step": 65730 + }, + { + "epoch": 0.28223555979152176, + "grad_norm": 3.91957426071167, + "learning_rate": 7.208635513051577e-05, + "loss": 0.3712831974029541, + "step": 65740 + }, + { + "epoch": 0.2822784918815418, + "grad_norm": 0.14563311636447906, + "learning_rate": 7.208204341039815e-05, + "loss": 0.19410089254379273, + "step": 65750 + }, + { + "epoch": 0.2823214239715618, + "grad_norm": 0.3047499358654022, + "learning_rate": 7.207773169028053e-05, + "loss": 0.25509042739868165, + "step": 65760 + }, + { + "epoch": 0.2823643560615818, + "grad_norm": 1.5158109664916992, + "learning_rate": 7.207341997016291e-05, + "loss": 0.09974059462547302, + "step": 65770 + }, + { + "epoch": 0.2824072881516018, + "grad_norm": 0.09712890535593033, + "learning_rate": 7.206910825004528e-05, + "loss": 0.15192004442214965, + "step": 65780 + }, + { + "epoch": 0.2824502202416218, + "grad_norm": 1.342818021774292, + "learning_rate": 7.206479652992765e-05, + "loss": 0.19996538162231445, + "step": 65790 + }, + { + "epoch": 0.2824931523316418, + "grad_norm": 1.0634775161743164, + "learning_rate": 7.206048480981003e-05, + "loss": 0.310014009475708, + "step": 65800 + }, + { + "epoch": 0.28253608442166184, + "grad_norm": 4.057530879974365, + "learning_rate": 7.20561730896924e-05, + "loss": 0.40732660293579104, + "step": 65810 + }, + { + "epoch": 0.2825790165116818, + "grad_norm": 1.2841671705245972, + "learning_rate": 7.205186136957478e-05, + "loss": 0.30788025856018064, + "step": 65820 + }, + { + "epoch": 0.2826219486017018, + "grad_norm": 0.039597101509571075, + "learning_rate": 7.204754964945716e-05, + "loss": 0.3503837823867798, + "step": 65830 + }, + { + "epoch": 0.28266488069172185, + "grad_norm": 3.8208327293395996, + "learning_rate": 7.204323792933953e-05, + "loss": 0.07890766263008117, + "step": 65840 + }, + { + "epoch": 0.28270781278174184, + "grad_norm": 0.004489063750952482, + "learning_rate": 7.203892620922191e-05, + "loss": 0.23929812908172607, + "step": 65850 + }, + { + "epoch": 0.2827507448717618, + "grad_norm": 0.13525481522083282, + "learning_rate": 7.203461448910429e-05, + "loss": 0.1935684561729431, + "step": 65860 + }, + { + "epoch": 0.28279367696178187, + "grad_norm": 0.10572947561740875, + "learning_rate": 7.203030276898667e-05, + "loss": 0.45615386962890625, + "step": 65870 + }, + { + "epoch": 0.28283660905180186, + "grad_norm": 10.497499465942383, + "learning_rate": 7.202599104886903e-05, + "loss": 0.22629842758178711, + "step": 65880 + }, + { + "epoch": 0.28287954114182184, + "grad_norm": 1.8166733980178833, + "learning_rate": 7.202167932875141e-05, + "loss": 0.37073335647583006, + "step": 65890 + }, + { + "epoch": 0.2829224732318419, + "grad_norm": 1.114739179611206, + "learning_rate": 7.201736760863379e-05, + "loss": 0.16271429061889647, + "step": 65900 + }, + { + "epoch": 0.2829654053218619, + "grad_norm": 3.9710497856140137, + "learning_rate": 7.201305588851616e-05, + "loss": 0.17485971450805665, + "step": 65910 + }, + { + "epoch": 0.28300833741188186, + "grad_norm": 2.7692909240722656, + "learning_rate": 7.200874416839854e-05, + "loss": 0.3034187078475952, + "step": 65920 + }, + { + "epoch": 0.2830512695019019, + "grad_norm": 0.754224419593811, + "learning_rate": 7.200443244828092e-05, + "loss": 0.39126248359680177, + "step": 65930 + }, + { + "epoch": 0.2830942015919219, + "grad_norm": 2.726775884628296, + "learning_rate": 7.20001207281633e-05, + "loss": 0.3949368953704834, + "step": 65940 + }, + { + "epoch": 0.2831371336819419, + "grad_norm": 0.03907603397965431, + "learning_rate": 7.199580900804567e-05, + "loss": 0.3045235872268677, + "step": 65950 + }, + { + "epoch": 0.2831800657719619, + "grad_norm": 0.15216980874538422, + "learning_rate": 7.199149728792805e-05, + "loss": 0.24437999725341797, + "step": 65960 + }, + { + "epoch": 0.2832229978619819, + "grad_norm": 0.5572879314422607, + "learning_rate": 7.198718556781043e-05, + "loss": 0.21077256202697753, + "step": 65970 + }, + { + "epoch": 0.28326592995200195, + "grad_norm": 2.963178873062134, + "learning_rate": 7.19828738476928e-05, + "loss": 0.451598072052002, + "step": 65980 + }, + { + "epoch": 0.28330886204202194, + "grad_norm": 0.026027904823422432, + "learning_rate": 7.197856212757518e-05, + "loss": 0.2929235935211182, + "step": 65990 + }, + { + "epoch": 0.2833517941320419, + "grad_norm": 0.835507333278656, + "learning_rate": 7.197425040745756e-05, + "loss": 0.36316940784454343, + "step": 66000 + }, + { + "epoch": 0.2833517941320419, + "eval_loss": 0.44882506132125854, + "eval_runtime": 27.4625, + "eval_samples_per_second": 3.641, + "eval_steps_per_second": 3.641, + "step": 66000 + }, + { + "epoch": 0.28339472622206197, + "grad_norm": 0.22148838639259338, + "learning_rate": 7.196993868733994e-05, + "loss": 0.2025505542755127, + "step": 66010 + }, + { + "epoch": 0.28343765831208195, + "grad_norm": 0.04498464986681938, + "learning_rate": 7.196562696722231e-05, + "loss": 0.09973494410514831, + "step": 66020 + }, + { + "epoch": 0.28348059040210194, + "grad_norm": 0.04255395382642746, + "learning_rate": 7.196131524710469e-05, + "loss": 0.27003774642944334, + "step": 66030 + }, + { + "epoch": 0.283523522492122, + "grad_norm": 1.8950724601745605, + "learning_rate": 7.195700352698705e-05, + "loss": 0.15587079524993896, + "step": 66040 + }, + { + "epoch": 0.28356645458214197, + "grad_norm": 1.7096226215362549, + "learning_rate": 7.195269180686943e-05, + "loss": 0.1669179916381836, + "step": 66050 + }, + { + "epoch": 0.28360938667216196, + "grad_norm": 2.672077178955078, + "learning_rate": 7.194838008675181e-05, + "loss": 0.29187939167022703, + "step": 66060 + }, + { + "epoch": 0.283652318762182, + "grad_norm": 0.11574865877628326, + "learning_rate": 7.194406836663419e-05, + "loss": 0.2783296346664429, + "step": 66070 + }, + { + "epoch": 0.283695250852202, + "grad_norm": 0.25796473026275635, + "learning_rate": 7.193975664651656e-05, + "loss": 0.2895475149154663, + "step": 66080 + }, + { + "epoch": 0.283738182942222, + "grad_norm": 0.04511742293834686, + "learning_rate": 7.193544492639894e-05, + "loss": 0.12669681310653685, + "step": 66090 + }, + { + "epoch": 0.283781115032242, + "grad_norm": 0.29782024025917053, + "learning_rate": 7.193113320628132e-05, + "loss": 0.13074166774749757, + "step": 66100 + }, + { + "epoch": 0.283824047122262, + "grad_norm": 0.03458978608250618, + "learning_rate": 7.19268214861637e-05, + "loss": 0.27550182342529295, + "step": 66110 + }, + { + "epoch": 0.283866979212282, + "grad_norm": 1.4848647117614746, + "learning_rate": 7.192250976604606e-05, + "loss": 0.21677308082580565, + "step": 66120 + }, + { + "epoch": 0.28390991130230203, + "grad_norm": 0.0015100378077477217, + "learning_rate": 7.191819804592844e-05, + "loss": 0.291225266456604, + "step": 66130 + }, + { + "epoch": 0.283952843392322, + "grad_norm": 0.002405498642474413, + "learning_rate": 7.191388632581081e-05, + "loss": 0.2970550060272217, + "step": 66140 + }, + { + "epoch": 0.283995775482342, + "grad_norm": 2.132266044616699, + "learning_rate": 7.190957460569319e-05, + "loss": 0.2530239105224609, + "step": 66150 + }, + { + "epoch": 0.28403870757236205, + "grad_norm": 0.8479958772659302, + "learning_rate": 7.190526288557557e-05, + "loss": 0.14648141860961914, + "step": 66160 + }, + { + "epoch": 0.28408163966238204, + "grad_norm": 1.6402133703231812, + "learning_rate": 7.190095116545795e-05, + "loss": 0.26374919414520265, + "step": 66170 + }, + { + "epoch": 0.2841245717524021, + "grad_norm": 0.11759825795888901, + "learning_rate": 7.189663944534032e-05, + "loss": 0.25805234909057617, + "step": 66180 + }, + { + "epoch": 0.28416750384242206, + "grad_norm": 0.018354952335357666, + "learning_rate": 7.189232772522271e-05, + "loss": 0.3803222417831421, + "step": 66190 + }, + { + "epoch": 0.28421043593244205, + "grad_norm": 3.1105453968048096, + "learning_rate": 7.188801600510509e-05, + "loss": 0.35515432357788085, + "step": 66200 + }, + { + "epoch": 0.2842533680224621, + "grad_norm": 0.3357342481613159, + "learning_rate": 7.188370428498746e-05, + "loss": 0.06009315848350525, + "step": 66210 + }, + { + "epoch": 0.2842963001124821, + "grad_norm": 0.053794365376234055, + "learning_rate": 7.187939256486983e-05, + "loss": 0.13056585788726807, + "step": 66220 + }, + { + "epoch": 0.28433923220250207, + "grad_norm": 0.7919422388076782, + "learning_rate": 7.187508084475221e-05, + "loss": 0.1529833197593689, + "step": 66230 + }, + { + "epoch": 0.2843821642925221, + "grad_norm": 1.3930153846740723, + "learning_rate": 7.187076912463459e-05, + "loss": 0.230507230758667, + "step": 66240 + }, + { + "epoch": 0.2844250963825421, + "grad_norm": 0.05847423896193504, + "learning_rate": 7.186645740451697e-05, + "loss": 0.03660984933376312, + "step": 66250 + }, + { + "epoch": 0.2844680284725621, + "grad_norm": 2.146456718444824, + "learning_rate": 7.186214568439934e-05, + "loss": 0.10765037536621094, + "step": 66260 + }, + { + "epoch": 0.2845109605625821, + "grad_norm": 1.673628807067871, + "learning_rate": 7.185783396428172e-05, + "loss": 0.3041319131851196, + "step": 66270 + }, + { + "epoch": 0.2845538926526021, + "grad_norm": 0.039230331778526306, + "learning_rate": 7.18535222441641e-05, + "loss": 0.210577654838562, + "step": 66280 + }, + { + "epoch": 0.2845968247426221, + "grad_norm": 0.006200359668582678, + "learning_rate": 7.184921052404646e-05, + "loss": 0.23379092216491698, + "step": 66290 + }, + { + "epoch": 0.28463975683264214, + "grad_norm": 1.6170132160186768, + "learning_rate": 7.184489880392884e-05, + "loss": 0.2834723949432373, + "step": 66300 + }, + { + "epoch": 0.28468268892266213, + "grad_norm": 1.9372655153274536, + "learning_rate": 7.184058708381122e-05, + "loss": 0.31765921115875245, + "step": 66310 + }, + { + "epoch": 0.2847256210126821, + "grad_norm": 2.323282241821289, + "learning_rate": 7.183627536369359e-05, + "loss": 0.2632134199142456, + "step": 66320 + }, + { + "epoch": 0.28476855310270216, + "grad_norm": 8.279928207397461, + "learning_rate": 7.183196364357597e-05, + "loss": 0.383182692527771, + "step": 66330 + }, + { + "epoch": 0.28481148519272215, + "grad_norm": 1.6048333644866943, + "learning_rate": 7.182765192345835e-05, + "loss": 0.09258521795272827, + "step": 66340 + }, + { + "epoch": 0.28485441728274213, + "grad_norm": 3.5633950233459473, + "learning_rate": 7.182334020334072e-05, + "loss": 0.3062487363815308, + "step": 66350 + }, + { + "epoch": 0.2848973493727622, + "grad_norm": 0.0061171273700892925, + "learning_rate": 7.18190284832231e-05, + "loss": 0.29340639114379885, + "step": 66360 + }, + { + "epoch": 0.28494028146278216, + "grad_norm": 0.019328560680150986, + "learning_rate": 7.181471676310547e-05, + "loss": 0.40865530967712405, + "step": 66370 + }, + { + "epoch": 0.28498321355280215, + "grad_norm": 1.3364512920379639, + "learning_rate": 7.181040504298784e-05, + "loss": 0.3623195171356201, + "step": 66380 + }, + { + "epoch": 0.2850261456428222, + "grad_norm": 0.02536897547543049, + "learning_rate": 7.180609332287022e-05, + "loss": 0.24478592872619628, + "step": 66390 + }, + { + "epoch": 0.2850690777328422, + "grad_norm": 0.009088271297514439, + "learning_rate": 7.18017816027526e-05, + "loss": 0.1288319706916809, + "step": 66400 + }, + { + "epoch": 0.2851120098228622, + "grad_norm": 0.004414925817400217, + "learning_rate": 7.179746988263499e-05, + "loss": 0.1057388424873352, + "step": 66410 + }, + { + "epoch": 0.2851549419128822, + "grad_norm": 5.123507499694824, + "learning_rate": 7.179315816251737e-05, + "loss": 0.37679688930511473, + "step": 66420 + }, + { + "epoch": 0.2851978740029022, + "grad_norm": 0.06788595765829086, + "learning_rate": 7.178884644239974e-05, + "loss": 0.09671139717102051, + "step": 66430 + }, + { + "epoch": 0.28524080609292224, + "grad_norm": 0.06418441236019135, + "learning_rate": 7.178453472228212e-05, + "loss": 0.09671286940574646, + "step": 66440 + }, + { + "epoch": 0.2852837381829422, + "grad_norm": 3.502713680267334, + "learning_rate": 7.178022300216448e-05, + "loss": 0.2851149559020996, + "step": 66450 + }, + { + "epoch": 0.2853266702729622, + "grad_norm": 0.14976099133491516, + "learning_rate": 7.177591128204686e-05, + "loss": 0.17432072162628173, + "step": 66460 + }, + { + "epoch": 0.28536960236298226, + "grad_norm": 0.05505011975765228, + "learning_rate": 7.177159956192924e-05, + "loss": 0.02504686117172241, + "step": 66470 + }, + { + "epoch": 0.28541253445300224, + "grad_norm": 2.7165024280548096, + "learning_rate": 7.176728784181162e-05, + "loss": 0.3118079662322998, + "step": 66480 + }, + { + "epoch": 0.28545546654302223, + "grad_norm": 0.00147499970626086, + "learning_rate": 7.1762976121694e-05, + "loss": 0.2843587398529053, + "step": 66490 + }, + { + "epoch": 0.28549839863304227, + "grad_norm": 0.2036382257938385, + "learning_rate": 7.175866440157637e-05, + "loss": 0.03493208289146423, + "step": 66500 + }, + { + "epoch": 0.28554133072306226, + "grad_norm": 5.055944442749023, + "learning_rate": 7.175435268145875e-05, + "loss": 0.17728381156921386, + "step": 66510 + }, + { + "epoch": 0.28558426281308225, + "grad_norm": 1.751897931098938, + "learning_rate": 7.175004096134113e-05, + "loss": 0.30946590900421145, + "step": 66520 + }, + { + "epoch": 0.2856271949031023, + "grad_norm": 0.10786402225494385, + "learning_rate": 7.174572924122349e-05, + "loss": 0.07910744547843933, + "step": 66530 + }, + { + "epoch": 0.2856701269931223, + "grad_norm": 2.8931570053100586, + "learning_rate": 7.174141752110587e-05, + "loss": 0.37531414031982424, + "step": 66540 + }, + { + "epoch": 0.28571305908314226, + "grad_norm": 0.021041302010416985, + "learning_rate": 7.173710580098824e-05, + "loss": 0.25701830387115476, + "step": 66550 + }, + { + "epoch": 0.2857559911731623, + "grad_norm": 0.1704130321741104, + "learning_rate": 7.173279408087062e-05, + "loss": 0.33431432247161863, + "step": 66560 + }, + { + "epoch": 0.2857989232631823, + "grad_norm": 0.004376660101115704, + "learning_rate": 7.1728482360753e-05, + "loss": 0.1233670949935913, + "step": 66570 + }, + { + "epoch": 0.2858418553532023, + "grad_norm": 21.381446838378906, + "learning_rate": 7.172417064063538e-05, + "loss": 0.27247138023376466, + "step": 66580 + }, + { + "epoch": 0.2858847874432223, + "grad_norm": 3.642610549926758, + "learning_rate": 7.171985892051775e-05, + "loss": 0.46688222885131836, + "step": 66590 + }, + { + "epoch": 0.2859277195332423, + "grad_norm": 1.3897994756698608, + "learning_rate": 7.171554720040013e-05, + "loss": 0.2915907621383667, + "step": 66600 + }, + { + "epoch": 0.28597065162326235, + "grad_norm": 0.07208789139986038, + "learning_rate": 7.171123548028251e-05, + "loss": 0.1163141369819641, + "step": 66610 + }, + { + "epoch": 0.28601358371328234, + "grad_norm": 0.042390916496515274, + "learning_rate": 7.170692376016487e-05, + "loss": 0.13545933961868287, + "step": 66620 + }, + { + "epoch": 0.2860565158033023, + "grad_norm": 2.161851644515991, + "learning_rate": 7.170261204004726e-05, + "loss": 0.14295434951782227, + "step": 66630 + }, + { + "epoch": 0.28609944789332237, + "grad_norm": 1.8208708763122559, + "learning_rate": 7.169830031992964e-05, + "loss": 0.25981476306915285, + "step": 66640 + }, + { + "epoch": 0.28614237998334235, + "grad_norm": 0.023029552772641182, + "learning_rate": 7.169398859981202e-05, + "loss": 0.2782176971435547, + "step": 66650 + }, + { + "epoch": 0.28618531207336234, + "grad_norm": 4.7402167320251465, + "learning_rate": 7.16896768796944e-05, + "loss": 0.3271470546722412, + "step": 66660 + }, + { + "epoch": 0.2862282441633824, + "grad_norm": 0.027461495250463486, + "learning_rate": 7.168536515957677e-05, + "loss": 0.13882352113723756, + "step": 66670 + }, + { + "epoch": 0.28627117625340237, + "grad_norm": 9.312287330627441, + "learning_rate": 7.168105343945915e-05, + "loss": 0.33524518013000487, + "step": 66680 + }, + { + "epoch": 0.28631410834342236, + "grad_norm": 0.040560901165008545, + "learning_rate": 7.167674171934153e-05, + "loss": 0.030652105808258057, + "step": 66690 + }, + { + "epoch": 0.2863570404334424, + "grad_norm": 0.009825235232710838, + "learning_rate": 7.167242999922389e-05, + "loss": 0.09193103313446045, + "step": 66700 + }, + { + "epoch": 0.2863999725234624, + "grad_norm": 0.24824531376361847, + "learning_rate": 7.166811827910627e-05, + "loss": 0.16448047161102294, + "step": 66710 + }, + { + "epoch": 0.2864429046134824, + "grad_norm": 1.160897970199585, + "learning_rate": 7.166380655898865e-05, + "loss": 0.29864816665649413, + "step": 66720 + }, + { + "epoch": 0.2864858367035024, + "grad_norm": 0.026330584660172462, + "learning_rate": 7.165949483887102e-05, + "loss": 0.07965230941772461, + "step": 66730 + }, + { + "epoch": 0.2865287687935224, + "grad_norm": 7.835456371307373, + "learning_rate": 7.16551831187534e-05, + "loss": 0.21117513179779052, + "step": 66740 + }, + { + "epoch": 0.2865717008835424, + "grad_norm": 6.6683030128479, + "learning_rate": 7.165087139863578e-05, + "loss": 0.23535704612731934, + "step": 66750 + }, + { + "epoch": 0.28661463297356243, + "grad_norm": 2.4860293865203857, + "learning_rate": 7.164655967851816e-05, + "loss": 0.12097716331481934, + "step": 66760 + }, + { + "epoch": 0.2866575650635824, + "grad_norm": 0.5875166058540344, + "learning_rate": 7.164224795840053e-05, + "loss": 0.11552761793136597, + "step": 66770 + }, + { + "epoch": 0.2867004971536024, + "grad_norm": 0.6846771240234375, + "learning_rate": 7.16379362382829e-05, + "loss": 0.37979416847229003, + "step": 66780 + }, + { + "epoch": 0.28674342924362245, + "grad_norm": 0.0027324643451720476, + "learning_rate": 7.163362451816527e-05, + "loss": 0.22684979438781738, + "step": 66790 + }, + { + "epoch": 0.28678636133364244, + "grad_norm": 6.466740608215332, + "learning_rate": 7.162931279804765e-05, + "loss": 0.17428145408630372, + "step": 66800 + }, + { + "epoch": 0.2868292934236624, + "grad_norm": 0.005279346369206905, + "learning_rate": 7.162500107793003e-05, + "loss": 0.305089545249939, + "step": 66810 + }, + { + "epoch": 0.28687222551368247, + "grad_norm": 0.0043772244825959206, + "learning_rate": 7.16206893578124e-05, + "loss": 0.07529987692832947, + "step": 66820 + }, + { + "epoch": 0.28691515760370245, + "grad_norm": 0.015128378756344318, + "learning_rate": 7.161637763769478e-05, + "loss": 0.1512007474899292, + "step": 66830 + }, + { + "epoch": 0.2869580896937225, + "grad_norm": 0.033646125346422195, + "learning_rate": 7.161206591757716e-05, + "loss": 0.2316761016845703, + "step": 66840 + }, + { + "epoch": 0.2870010217837425, + "grad_norm": 0.038412321358919144, + "learning_rate": 7.160775419745954e-05, + "loss": 0.2079862356185913, + "step": 66850 + }, + { + "epoch": 0.28704395387376247, + "grad_norm": 41.8834228515625, + "learning_rate": 7.160344247734192e-05, + "loss": 0.1129407525062561, + "step": 66860 + }, + { + "epoch": 0.2870868859637825, + "grad_norm": 0.0051442538388073444, + "learning_rate": 7.159913075722429e-05, + "loss": 0.12028219699859619, + "step": 66870 + }, + { + "epoch": 0.2871298180538025, + "grad_norm": 0.014405815862119198, + "learning_rate": 7.159481903710667e-05, + "loss": 0.1231608271598816, + "step": 66880 + }, + { + "epoch": 0.2871727501438225, + "grad_norm": 0.27976086735725403, + "learning_rate": 7.159050731698905e-05, + "loss": 0.23082361221313477, + "step": 66890 + }, + { + "epoch": 0.28721568223384253, + "grad_norm": 0.2852309048175812, + "learning_rate": 7.158619559687142e-05, + "loss": 0.44276866912841795, + "step": 66900 + }, + { + "epoch": 0.2872586143238625, + "grad_norm": 0.0018568943487480283, + "learning_rate": 7.15818838767538e-05, + "loss": 0.09268001317977906, + "step": 66910 + }, + { + "epoch": 0.2873015464138825, + "grad_norm": 0.0033980561420321465, + "learning_rate": 7.157757215663618e-05, + "loss": 0.06650593280792236, + "step": 66920 + }, + { + "epoch": 0.28734447850390255, + "grad_norm": 0.44758087396621704, + "learning_rate": 7.157326043651856e-05, + "loss": 0.1356669306755066, + "step": 66930 + }, + { + "epoch": 0.28738741059392253, + "grad_norm": 0.2766318619251251, + "learning_rate": 7.156894871640093e-05, + "loss": 0.22032556533813477, + "step": 66940 + }, + { + "epoch": 0.2874303426839425, + "grad_norm": 0.01758783869445324, + "learning_rate": 7.15646369962833e-05, + "loss": 0.20557994842529298, + "step": 66950 + }, + { + "epoch": 0.28747327477396256, + "grad_norm": 0.0034948191605508327, + "learning_rate": 7.156032527616568e-05, + "loss": 0.27143542766571044, + "step": 66960 + }, + { + "epoch": 0.28751620686398255, + "grad_norm": 2.4819626808166504, + "learning_rate": 7.155601355604805e-05, + "loss": 0.4705145835876465, + "step": 66970 + }, + { + "epoch": 0.28755913895400254, + "grad_norm": 5.097228050231934, + "learning_rate": 7.155170183593043e-05, + "loss": 0.39361963272094724, + "step": 66980 + }, + { + "epoch": 0.2876020710440226, + "grad_norm": 0.016346966847777367, + "learning_rate": 7.154739011581281e-05, + "loss": 0.2513259410858154, + "step": 66990 + }, + { + "epoch": 0.28764500313404257, + "grad_norm": 0.09284532070159912, + "learning_rate": 7.154307839569518e-05, + "loss": 0.2665305852890015, + "step": 67000 + }, + { + "epoch": 0.28764500313404257, + "eval_loss": 0.43117061257362366, + "eval_runtime": 27.4525, + "eval_samples_per_second": 3.643, + "eval_steps_per_second": 3.643, + "step": 67000 + }, + { + "epoch": 0.28768793522406255, + "grad_norm": 0.4193928837776184, + "learning_rate": 7.153876667557756e-05, + "loss": 0.2339691400527954, + "step": 67010 + }, + { + "epoch": 0.2877308673140826, + "grad_norm": 0.05756061151623726, + "learning_rate": 7.153445495545994e-05, + "loss": 0.16777576208114625, + "step": 67020 + }, + { + "epoch": 0.2877737994041026, + "grad_norm": 0.1018446758389473, + "learning_rate": 7.15301432353423e-05, + "loss": 0.22809865474700927, + "step": 67030 + }, + { + "epoch": 0.2878167314941226, + "grad_norm": 0.7277085185050964, + "learning_rate": 7.152583151522468e-05, + "loss": 0.22245891094207765, + "step": 67040 + }, + { + "epoch": 0.2878596635841426, + "grad_norm": 0.005714256316423416, + "learning_rate": 7.152151979510706e-05, + "loss": 0.029804161190986632, + "step": 67050 + }, + { + "epoch": 0.2879025956741626, + "grad_norm": 0.5130506753921509, + "learning_rate": 7.151720807498943e-05, + "loss": 0.18485331535339355, + "step": 67060 + }, + { + "epoch": 0.28794552776418264, + "grad_norm": 0.007058766670525074, + "learning_rate": 7.151289635487181e-05, + "loss": 0.10065820217132568, + "step": 67070 + }, + { + "epoch": 0.2879884598542026, + "grad_norm": 0.005129760131239891, + "learning_rate": 7.150858463475419e-05, + "loss": 0.3468196153640747, + "step": 67080 + }, + { + "epoch": 0.2880313919442226, + "grad_norm": 0.015246695838868618, + "learning_rate": 7.150427291463657e-05, + "loss": 0.3642794847488403, + "step": 67090 + }, + { + "epoch": 0.28807432403424266, + "grad_norm": 0.2805442810058594, + "learning_rate": 7.149996119451894e-05, + "loss": 0.17592545747756957, + "step": 67100 + }, + { + "epoch": 0.28811725612426264, + "grad_norm": 3.7581307888031006, + "learning_rate": 7.149564947440132e-05, + "loss": 0.20889370441436766, + "step": 67110 + }, + { + "epoch": 0.28816018821428263, + "grad_norm": 0.5101104974746704, + "learning_rate": 7.14913377542837e-05, + "loss": 0.25342817306518556, + "step": 67120 + }, + { + "epoch": 0.2882031203043027, + "grad_norm": 0.0816846638917923, + "learning_rate": 7.148702603416608e-05, + "loss": 0.27064502239227295, + "step": 67130 + }, + { + "epoch": 0.28824605239432266, + "grad_norm": 0.0570346862077713, + "learning_rate": 7.148271431404845e-05, + "loss": 0.18118938207626342, + "step": 67140 + }, + { + "epoch": 0.28828898448434265, + "grad_norm": 0.31971192359924316, + "learning_rate": 7.147840259393083e-05, + "loss": 0.15357050895690919, + "step": 67150 + }, + { + "epoch": 0.2883319165743627, + "grad_norm": 0.07988704741001129, + "learning_rate": 7.147409087381321e-05, + "loss": 0.24975745677947997, + "step": 67160 + }, + { + "epoch": 0.2883748486643827, + "grad_norm": 0.0052708168514072895, + "learning_rate": 7.146977915369559e-05, + "loss": 0.092576003074646, + "step": 67170 + }, + { + "epoch": 0.28841778075440266, + "grad_norm": 0.08117439597845078, + "learning_rate": 7.146546743357796e-05, + "loss": 0.12661879062652587, + "step": 67180 + }, + { + "epoch": 0.2884607128444227, + "grad_norm": 5.074268341064453, + "learning_rate": 7.146115571346033e-05, + "loss": 0.1526340961456299, + "step": 67190 + }, + { + "epoch": 0.2885036449344427, + "grad_norm": 0.05365569517016411, + "learning_rate": 7.14568439933427e-05, + "loss": 0.143519389629364, + "step": 67200 + }, + { + "epoch": 0.2885465770244627, + "grad_norm": 3.570014238357544, + "learning_rate": 7.145253227322508e-05, + "loss": 0.17729175090789795, + "step": 67210 + }, + { + "epoch": 0.2885895091144827, + "grad_norm": 2.256596088409424, + "learning_rate": 7.144822055310746e-05, + "loss": 0.31090943813323973, + "step": 67220 + }, + { + "epoch": 0.2886324412045027, + "grad_norm": 0.00617602001875639, + "learning_rate": 7.144390883298984e-05, + "loss": 0.38187849521636963, + "step": 67230 + }, + { + "epoch": 0.2886753732945227, + "grad_norm": 0.03439110144972801, + "learning_rate": 7.143959711287221e-05, + "loss": 0.40892391204833983, + "step": 67240 + }, + { + "epoch": 0.28871830538454274, + "grad_norm": 1.2373721599578857, + "learning_rate": 7.143528539275459e-05, + "loss": 0.0434207022190094, + "step": 67250 + }, + { + "epoch": 0.2887612374745627, + "grad_norm": 0.04851150885224342, + "learning_rate": 7.143097367263697e-05, + "loss": 0.10299129486083984, + "step": 67260 + }, + { + "epoch": 0.28880416956458277, + "grad_norm": 0.47567617893218994, + "learning_rate": 7.142666195251935e-05, + "loss": 0.40652313232421877, + "step": 67270 + }, + { + "epoch": 0.28884710165460276, + "grad_norm": 0.44068795442581177, + "learning_rate": 7.142235023240171e-05, + "loss": 0.01274416446685791, + "step": 67280 + }, + { + "epoch": 0.28889003374462274, + "grad_norm": 1.2657181024551392, + "learning_rate": 7.141803851228409e-05, + "loss": 0.2137005090713501, + "step": 67290 + }, + { + "epoch": 0.2889329658346428, + "grad_norm": 3.1350607872009277, + "learning_rate": 7.141372679216646e-05, + "loss": 0.25659914016723634, + "step": 67300 + }, + { + "epoch": 0.2889758979246628, + "grad_norm": 2.780965566635132, + "learning_rate": 7.140941507204884e-05, + "loss": 0.3482322692871094, + "step": 67310 + }, + { + "epoch": 0.28901883001468276, + "grad_norm": 1.2729345560073853, + "learning_rate": 7.140510335193122e-05, + "loss": 0.11825804710388184, + "step": 67320 + }, + { + "epoch": 0.2890617621047028, + "grad_norm": 0.5161279439926147, + "learning_rate": 7.14007916318136e-05, + "loss": 0.3750854253768921, + "step": 67330 + }, + { + "epoch": 0.2891046941947228, + "grad_norm": 1.1493275165557861, + "learning_rate": 7.139647991169597e-05, + "loss": 0.2764685392379761, + "step": 67340 + }, + { + "epoch": 0.2891476262847428, + "grad_norm": 6.515742778778076, + "learning_rate": 7.139216819157835e-05, + "loss": 0.249403977394104, + "step": 67350 + }, + { + "epoch": 0.2891905583747628, + "grad_norm": 11.825860977172852, + "learning_rate": 7.138785647146073e-05, + "loss": 0.28220572471618655, + "step": 67360 + }, + { + "epoch": 0.2892334904647828, + "grad_norm": 3.0165984630584717, + "learning_rate": 7.13835447513431e-05, + "loss": 0.3849741697311401, + "step": 67370 + }, + { + "epoch": 0.2892764225548028, + "grad_norm": 0.16258086264133453, + "learning_rate": 7.137923303122548e-05, + "loss": 0.1100645899772644, + "step": 67380 + }, + { + "epoch": 0.28931935464482283, + "grad_norm": 0.044878143817186356, + "learning_rate": 7.137492131110786e-05, + "loss": 0.21178960800170898, + "step": 67390 + }, + { + "epoch": 0.2893622867348428, + "grad_norm": 0.0038589336909353733, + "learning_rate": 7.137060959099024e-05, + "loss": 0.2345569133758545, + "step": 67400 + }, + { + "epoch": 0.2894052188248628, + "grad_norm": 0.0768958106637001, + "learning_rate": 7.136629787087261e-05, + "loss": 0.21593918800354003, + "step": 67410 + }, + { + "epoch": 0.28944815091488285, + "grad_norm": 7.253081798553467, + "learning_rate": 7.136198615075499e-05, + "loss": 0.4909097194671631, + "step": 67420 + }, + { + "epoch": 0.28949108300490284, + "grad_norm": 0.04475796967744827, + "learning_rate": 7.135767443063737e-05, + "loss": 0.4545122623443604, + "step": 67430 + }, + { + "epoch": 0.2895340150949228, + "grad_norm": 0.08378562331199646, + "learning_rate": 7.135336271051973e-05, + "loss": 0.1959518313407898, + "step": 67440 + }, + { + "epoch": 0.28957694718494287, + "grad_norm": 0.05304625630378723, + "learning_rate": 7.134905099040211e-05, + "loss": 0.08540345430374145, + "step": 67450 + }, + { + "epoch": 0.28961987927496285, + "grad_norm": 0.02249019965529442, + "learning_rate": 7.134473927028449e-05, + "loss": 0.2787889003753662, + "step": 67460 + }, + { + "epoch": 0.2896628113649829, + "grad_norm": 0.8801037669181824, + "learning_rate": 7.134042755016687e-05, + "loss": 0.4670060157775879, + "step": 67470 + }, + { + "epoch": 0.2897057434550029, + "grad_norm": 1.267013430595398, + "learning_rate": 7.133611583004924e-05, + "loss": 0.13036876916885376, + "step": 67480 + }, + { + "epoch": 0.28974867554502287, + "grad_norm": 0.3037998378276825, + "learning_rate": 7.133180410993162e-05, + "loss": 0.10819134712219239, + "step": 67490 + }, + { + "epoch": 0.2897916076350429, + "grad_norm": 0.7789495587348938, + "learning_rate": 7.1327492389814e-05, + "loss": 0.17645723819732667, + "step": 67500 + }, + { + "epoch": 0.2898345397250629, + "grad_norm": 0.044124431908130646, + "learning_rate": 7.132318066969637e-05, + "loss": 0.0688193678855896, + "step": 67510 + }, + { + "epoch": 0.2898774718150829, + "grad_norm": 0.00780621450394392, + "learning_rate": 7.131886894957874e-05, + "loss": 0.3849108457565308, + "step": 67520 + }, + { + "epoch": 0.28992040390510293, + "grad_norm": 0.6088647842407227, + "learning_rate": 7.131455722946112e-05, + "loss": 0.1863906502723694, + "step": 67530 + }, + { + "epoch": 0.2899633359951229, + "grad_norm": 0.7508590817451477, + "learning_rate": 7.13102455093435e-05, + "loss": 0.2708500146865845, + "step": 67540 + }, + { + "epoch": 0.2900062680851429, + "grad_norm": 3.738001823425293, + "learning_rate": 7.130593378922587e-05, + "loss": 0.130331552028656, + "step": 67550 + }, + { + "epoch": 0.29004920017516295, + "grad_norm": 4.640090465545654, + "learning_rate": 7.130162206910825e-05, + "loss": 0.2554504871368408, + "step": 67560 + }, + { + "epoch": 0.29009213226518293, + "grad_norm": 0.12082592397928238, + "learning_rate": 7.129731034899063e-05, + "loss": 0.1677726149559021, + "step": 67570 + }, + { + "epoch": 0.2901350643552029, + "grad_norm": 0.016347909346222878, + "learning_rate": 7.1292998628873e-05, + "loss": 0.17969557046890258, + "step": 67580 + }, + { + "epoch": 0.29017799644522296, + "grad_norm": 0.012287224642932415, + "learning_rate": 7.128868690875538e-05, + "loss": 0.1445378541946411, + "step": 67590 + }, + { + "epoch": 0.29022092853524295, + "grad_norm": 0.002170619321987033, + "learning_rate": 7.128437518863776e-05, + "loss": 0.1796630859375, + "step": 67600 + }, + { + "epoch": 0.29026386062526294, + "grad_norm": 0.3804681897163391, + "learning_rate": 7.128006346852013e-05, + "loss": 0.3209116220474243, + "step": 67610 + }, + { + "epoch": 0.290306792715283, + "grad_norm": 1.3779412508010864, + "learning_rate": 7.127575174840251e-05, + "loss": 0.33678340911865234, + "step": 67620 + }, + { + "epoch": 0.29034972480530297, + "grad_norm": 3.7422566413879395, + "learning_rate": 7.127144002828489e-05, + "loss": 0.1122518539428711, + "step": 67630 + }, + { + "epoch": 0.29039265689532295, + "grad_norm": 0.008284986019134521, + "learning_rate": 7.126712830816727e-05, + "loss": 0.3207037687301636, + "step": 67640 + }, + { + "epoch": 0.290435588985343, + "grad_norm": 0.4197596609592438, + "learning_rate": 7.126281658804964e-05, + "loss": 0.2883680105209351, + "step": 67650 + }, + { + "epoch": 0.290478521075363, + "grad_norm": 0.02145151048898697, + "learning_rate": 7.125850486793202e-05, + "loss": 0.16351989507675171, + "step": 67660 + }, + { + "epoch": 0.29052145316538297, + "grad_norm": 0.7493966221809387, + "learning_rate": 7.12541931478144e-05, + "loss": 0.30967581272125244, + "step": 67670 + }, + { + "epoch": 0.290564385255403, + "grad_norm": 0.016078148037195206, + "learning_rate": 7.124988142769678e-05, + "loss": 0.13175559043884277, + "step": 67680 + }, + { + "epoch": 0.290607317345423, + "grad_norm": 0.4908279776573181, + "learning_rate": 7.124556970757914e-05, + "loss": 0.02809107005596161, + "step": 67690 + }, + { + "epoch": 0.29065024943544304, + "grad_norm": 0.9280158281326294, + "learning_rate": 7.124125798746152e-05, + "loss": 0.33881824016571044, + "step": 67700 + }, + { + "epoch": 0.29069318152546303, + "grad_norm": 1.8869291543960571, + "learning_rate": 7.12369462673439e-05, + "loss": 0.2556769847869873, + "step": 67710 + }, + { + "epoch": 0.290736113615483, + "grad_norm": 0.4702579975128174, + "learning_rate": 7.123263454722627e-05, + "loss": 0.13894178867340087, + "step": 67720 + }, + { + "epoch": 0.29077904570550306, + "grad_norm": 0.002194383880123496, + "learning_rate": 7.122832282710865e-05, + "loss": 0.35664730072021483, + "step": 67730 + }, + { + "epoch": 0.29082197779552305, + "grad_norm": 0.21256765723228455, + "learning_rate": 7.122401110699103e-05, + "loss": 0.10273618698120117, + "step": 67740 + }, + { + "epoch": 0.29086490988554303, + "grad_norm": 0.020599033683538437, + "learning_rate": 7.12196993868734e-05, + "loss": 0.12157354354858399, + "step": 67750 + }, + { + "epoch": 0.2909078419755631, + "grad_norm": 2.6698665618896484, + "learning_rate": 7.121538766675578e-05, + "loss": 0.19090875387191772, + "step": 67760 + }, + { + "epoch": 0.29095077406558306, + "grad_norm": 0.03797188401222229, + "learning_rate": 7.121107594663815e-05, + "loss": 0.1711806058883667, + "step": 67770 + }, + { + "epoch": 0.29099370615560305, + "grad_norm": 2.388545513153076, + "learning_rate": 7.120676422652052e-05, + "loss": 0.26054646968841555, + "step": 67780 + }, + { + "epoch": 0.2910366382456231, + "grad_norm": 0.009794231504201889, + "learning_rate": 7.12024525064029e-05, + "loss": 0.4301589488983154, + "step": 67790 + }, + { + "epoch": 0.2910795703356431, + "grad_norm": 2.32464337348938, + "learning_rate": 7.119814078628528e-05, + "loss": 0.22652745246887207, + "step": 67800 + }, + { + "epoch": 0.29112250242566307, + "grad_norm": 3.0685410499572754, + "learning_rate": 7.119382906616765e-05, + "loss": 0.08744192123413086, + "step": 67810 + }, + { + "epoch": 0.2911654345156831, + "grad_norm": 0.011893724091351032, + "learning_rate": 7.118951734605005e-05, + "loss": 0.13607373237609863, + "step": 67820 + }, + { + "epoch": 0.2912083666057031, + "grad_norm": 1.304750680923462, + "learning_rate": 7.118520562593242e-05, + "loss": 0.2696163892745972, + "step": 67830 + }, + { + "epoch": 0.2912512986957231, + "grad_norm": 0.005757071543484926, + "learning_rate": 7.11808939058148e-05, + "loss": 0.10221372842788697, + "step": 67840 + }, + { + "epoch": 0.2912942307857431, + "grad_norm": 1.33371102809906, + "learning_rate": 7.117658218569716e-05, + "loss": 0.36364994049072263, + "step": 67850 + }, + { + "epoch": 0.2913371628757631, + "grad_norm": 3.079373598098755, + "learning_rate": 7.117227046557954e-05, + "loss": 0.36846587657928465, + "step": 67860 + }, + { + "epoch": 0.2913800949657831, + "grad_norm": 5.180749416351318, + "learning_rate": 7.116795874546192e-05, + "loss": 0.4598280906677246, + "step": 67870 + }, + { + "epoch": 0.29142302705580314, + "grad_norm": 3.1711230278015137, + "learning_rate": 7.11636470253443e-05, + "loss": 0.3365186214447021, + "step": 67880 + }, + { + "epoch": 0.29146595914582313, + "grad_norm": 0.05353054031729698, + "learning_rate": 7.115933530522667e-05, + "loss": 0.20130722522735595, + "step": 67890 + }, + { + "epoch": 0.29150889123584317, + "grad_norm": 0.003241224680095911, + "learning_rate": 7.115502358510905e-05, + "loss": 0.28517961502075195, + "step": 67900 + }, + { + "epoch": 0.29155182332586316, + "grad_norm": 0.6127050518989563, + "learning_rate": 7.115071186499143e-05, + "loss": 0.47597332000732423, + "step": 67910 + }, + { + "epoch": 0.29159475541588314, + "grad_norm": 0.01519143395125866, + "learning_rate": 7.11464001448738e-05, + "loss": 0.11472551822662354, + "step": 67920 + }, + { + "epoch": 0.2916376875059032, + "grad_norm": 1.374168038368225, + "learning_rate": 7.114208842475617e-05, + "loss": 0.22478947639465333, + "step": 67930 + }, + { + "epoch": 0.2916806195959232, + "grad_norm": 0.489167183637619, + "learning_rate": 7.113777670463855e-05, + "loss": 0.3274041175842285, + "step": 67940 + }, + { + "epoch": 0.29172355168594316, + "grad_norm": 0.012719135731458664, + "learning_rate": 7.113346498452092e-05, + "loss": 0.1357038736343384, + "step": 67950 + }, + { + "epoch": 0.2917664837759632, + "grad_norm": 0.001968483906239271, + "learning_rate": 7.11291532644033e-05, + "loss": 0.13717812299728394, + "step": 67960 + }, + { + "epoch": 0.2918094158659832, + "grad_norm": 0.003384147770702839, + "learning_rate": 7.112484154428568e-05, + "loss": 0.2852107763290405, + "step": 67970 + }, + { + "epoch": 0.2918523479560032, + "grad_norm": 0.6869329810142517, + "learning_rate": 7.112052982416806e-05, + "loss": 0.26191256046295164, + "step": 67980 + }, + { + "epoch": 0.2918952800460232, + "grad_norm": 0.020543674007058144, + "learning_rate": 7.111621810405043e-05, + "loss": 0.0620194137096405, + "step": 67990 + }, + { + "epoch": 0.2919382121360432, + "grad_norm": 4.142035007476807, + "learning_rate": 7.111190638393281e-05, + "loss": 0.20071187019348144, + "step": 68000 + }, + { + "epoch": 0.2919382121360432, + "eval_loss": 0.43928349018096924, + "eval_runtime": 27.3939, + "eval_samples_per_second": 3.65, + "eval_steps_per_second": 3.65, + "step": 68000 + }, + { + "epoch": 0.2919811442260632, + "grad_norm": 2.2618143558502197, + "learning_rate": 7.110759466381519e-05, + "loss": 0.45159592628479006, + "step": 68010 + }, + { + "epoch": 0.29202407631608324, + "grad_norm": 0.001515640295110643, + "learning_rate": 7.110328294369755e-05, + "loss": 0.14594651460647584, + "step": 68020 + }, + { + "epoch": 0.2920670084061032, + "grad_norm": 0.8493977189064026, + "learning_rate": 7.109897122357993e-05, + "loss": 0.2396256685256958, + "step": 68030 + }, + { + "epoch": 0.2921099404961232, + "grad_norm": 4.17466926574707, + "learning_rate": 7.109465950346232e-05, + "loss": 0.18733444213867187, + "step": 68040 + }, + { + "epoch": 0.29215287258614325, + "grad_norm": 3.635678291320801, + "learning_rate": 7.10903477833447e-05, + "loss": 0.20183224678039552, + "step": 68050 + }, + { + "epoch": 0.29219580467616324, + "grad_norm": 1.21640145778656, + "learning_rate": 7.108603606322707e-05, + "loss": 0.22180395126342772, + "step": 68060 + }, + { + "epoch": 0.2922387367661832, + "grad_norm": 2.761414051055908, + "learning_rate": 7.108172434310945e-05, + "loss": 0.44273886680603025, + "step": 68070 + }, + { + "epoch": 0.29228166885620327, + "grad_norm": 0.0016735456883907318, + "learning_rate": 7.107741262299183e-05, + "loss": 0.15880162715911866, + "step": 68080 + }, + { + "epoch": 0.29232460094622326, + "grad_norm": 1.637378215789795, + "learning_rate": 7.10731009028742e-05, + "loss": 0.31997501850128174, + "step": 68090 + }, + { + "epoch": 0.29236753303624324, + "grad_norm": 0.04662550240755081, + "learning_rate": 7.106878918275657e-05, + "loss": 0.3928894758224487, + "step": 68100 + }, + { + "epoch": 0.2924104651262633, + "grad_norm": 4.416837692260742, + "learning_rate": 7.106447746263895e-05, + "loss": 0.49892539978027345, + "step": 68110 + }, + { + "epoch": 0.2924533972162833, + "grad_norm": 0.03326794505119324, + "learning_rate": 7.106016574252132e-05, + "loss": 0.15234217643737794, + "step": 68120 + }, + { + "epoch": 0.2924963293063033, + "grad_norm": 1.8150595426559448, + "learning_rate": 7.10558540224037e-05, + "loss": 0.2633387327194214, + "step": 68130 + }, + { + "epoch": 0.2925392613963233, + "grad_norm": 0.07998789101839066, + "learning_rate": 7.105154230228608e-05, + "loss": 0.2532381772994995, + "step": 68140 + }, + { + "epoch": 0.2925821934863433, + "grad_norm": 0.01822078414261341, + "learning_rate": 7.104723058216846e-05, + "loss": 0.11664667129516601, + "step": 68150 + }, + { + "epoch": 0.29262512557636333, + "grad_norm": 0.9801276922225952, + "learning_rate": 7.104291886205083e-05, + "loss": 0.1693058967590332, + "step": 68160 + }, + { + "epoch": 0.2926680576663833, + "grad_norm": 0.011544733308255672, + "learning_rate": 7.103860714193321e-05, + "loss": 0.09046794772148133, + "step": 68170 + }, + { + "epoch": 0.2927109897564033, + "grad_norm": 0.043812867254018784, + "learning_rate": 7.103429542181558e-05, + "loss": 0.12290755510330201, + "step": 68180 + }, + { + "epoch": 0.29275392184642335, + "grad_norm": 0.06413479149341583, + "learning_rate": 7.102998370169795e-05, + "loss": 0.293648362159729, + "step": 68190 + }, + { + "epoch": 0.29279685393644334, + "grad_norm": 0.049631934612989426, + "learning_rate": 7.102567198158033e-05, + "loss": 0.3742121458053589, + "step": 68200 + }, + { + "epoch": 0.2928397860264633, + "grad_norm": 0.16051065921783447, + "learning_rate": 7.102136026146271e-05, + "loss": 0.31449887752532957, + "step": 68210 + }, + { + "epoch": 0.29288271811648336, + "grad_norm": 6.327853679656982, + "learning_rate": 7.101704854134508e-05, + "loss": 0.3112722158432007, + "step": 68220 + }, + { + "epoch": 0.29292565020650335, + "grad_norm": 0.05760200321674347, + "learning_rate": 7.101273682122746e-05, + "loss": 0.09027496576309205, + "step": 68230 + }, + { + "epoch": 0.29296858229652334, + "grad_norm": 0.029430439695715904, + "learning_rate": 7.100842510110984e-05, + "loss": 0.2163465738296509, + "step": 68240 + }, + { + "epoch": 0.2930115143865434, + "grad_norm": 1.0040699243545532, + "learning_rate": 7.100411338099222e-05, + "loss": 0.4775634765625, + "step": 68250 + }, + { + "epoch": 0.29305444647656337, + "grad_norm": 1.0060920715332031, + "learning_rate": 7.09998016608746e-05, + "loss": 0.3704617977142334, + "step": 68260 + }, + { + "epoch": 0.29309737856658336, + "grad_norm": 1.6154614686965942, + "learning_rate": 7.099548994075697e-05, + "loss": 0.12538934946060182, + "step": 68270 + }, + { + "epoch": 0.2931403106566034, + "grad_norm": 0.7863723039627075, + "learning_rate": 7.099117822063935e-05, + "loss": 0.14734208583831787, + "step": 68280 + }, + { + "epoch": 0.2931832427466234, + "grad_norm": 0.03183925896883011, + "learning_rate": 7.098686650052173e-05, + "loss": 0.3232817888259888, + "step": 68290 + }, + { + "epoch": 0.29322617483664337, + "grad_norm": 0.5198999643325806, + "learning_rate": 7.09825547804041e-05, + "loss": 0.11065690517425537, + "step": 68300 + }, + { + "epoch": 0.2932691069266634, + "grad_norm": 0.624901294708252, + "learning_rate": 7.097824306028648e-05, + "loss": 0.4588596343994141, + "step": 68310 + }, + { + "epoch": 0.2933120390166834, + "grad_norm": 0.07914484292268753, + "learning_rate": 7.097393134016886e-05, + "loss": 0.055303043127059935, + "step": 68320 + }, + { + "epoch": 0.29335497110670344, + "grad_norm": 0.6766675114631653, + "learning_rate": 7.096961962005124e-05, + "loss": 0.15219202041625976, + "step": 68330 + }, + { + "epoch": 0.29339790319672343, + "grad_norm": 0.02503044158220291, + "learning_rate": 7.09653078999336e-05, + "loss": 0.19463895559310912, + "step": 68340 + }, + { + "epoch": 0.2934408352867434, + "grad_norm": 0.24121232330799103, + "learning_rate": 7.096099617981598e-05, + "loss": 0.16598477363586425, + "step": 68350 + }, + { + "epoch": 0.29348376737676346, + "grad_norm": 0.007208712864667177, + "learning_rate": 7.095668445969835e-05, + "loss": 0.16176241636276245, + "step": 68360 + }, + { + "epoch": 0.29352669946678345, + "grad_norm": 0.0008645054767839611, + "learning_rate": 7.095237273958073e-05, + "loss": 0.09948625564575195, + "step": 68370 + }, + { + "epoch": 0.29356963155680343, + "grad_norm": 2.0432870388031006, + "learning_rate": 7.094806101946311e-05, + "loss": 0.462565279006958, + "step": 68380 + }, + { + "epoch": 0.2936125636468235, + "grad_norm": 0.0818505734205246, + "learning_rate": 7.094374929934549e-05, + "loss": 0.344118595123291, + "step": 68390 + }, + { + "epoch": 0.29365549573684346, + "grad_norm": 0.09661252796649933, + "learning_rate": 7.093943757922786e-05, + "loss": 0.04238423705101013, + "step": 68400 + }, + { + "epoch": 0.29369842782686345, + "grad_norm": 0.04637506231665611, + "learning_rate": 7.093512585911024e-05, + "loss": 0.11444630622863769, + "step": 68410 + }, + { + "epoch": 0.2937413599168835, + "grad_norm": 0.03868831321597099, + "learning_rate": 7.093081413899262e-05, + "loss": 0.19466127157211305, + "step": 68420 + }, + { + "epoch": 0.2937842920069035, + "grad_norm": 1.6524996757507324, + "learning_rate": 7.092650241887498e-05, + "loss": 0.39613971710205076, + "step": 68430 + }, + { + "epoch": 0.29382722409692347, + "grad_norm": 0.06135096400976181, + "learning_rate": 7.092219069875736e-05, + "loss": 0.2066957712173462, + "step": 68440 + }, + { + "epoch": 0.2938701561869435, + "grad_norm": 0.0012592696584761143, + "learning_rate": 7.091787897863974e-05, + "loss": 0.06958057880401611, + "step": 68450 + }, + { + "epoch": 0.2939130882769635, + "grad_norm": 1.4627882242202759, + "learning_rate": 7.091356725852211e-05, + "loss": 0.26883270740509035, + "step": 68460 + }, + { + "epoch": 0.2939560203669835, + "grad_norm": 0.07310977578163147, + "learning_rate": 7.090925553840449e-05, + "loss": 0.15684688091278076, + "step": 68470 + }, + { + "epoch": 0.2939989524570035, + "grad_norm": 3.431220531463623, + "learning_rate": 7.090494381828687e-05, + "loss": 0.34428939819335935, + "step": 68480 + }, + { + "epoch": 0.2940418845470235, + "grad_norm": 5.300132751464844, + "learning_rate": 7.090063209816925e-05, + "loss": 0.22276744842529297, + "step": 68490 + }, + { + "epoch": 0.2940848166370435, + "grad_norm": 0.12124665081501007, + "learning_rate": 7.089632037805162e-05, + "loss": 0.1893091917037964, + "step": 68500 + }, + { + "epoch": 0.29412774872706354, + "grad_norm": 0.025070803239941597, + "learning_rate": 7.0892008657934e-05, + "loss": 0.24306130409240723, + "step": 68510 + }, + { + "epoch": 0.29417068081708353, + "grad_norm": 5.163175106048584, + "learning_rate": 7.088769693781638e-05, + "loss": 0.26326632499694824, + "step": 68520 + }, + { + "epoch": 0.2942136129071035, + "grad_norm": 0.02456679567694664, + "learning_rate": 7.088338521769876e-05, + "loss": 0.22474491596221924, + "step": 68530 + }, + { + "epoch": 0.29425654499712356, + "grad_norm": 1.350258708000183, + "learning_rate": 7.087907349758113e-05, + "loss": 0.18058979511260986, + "step": 68540 + }, + { + "epoch": 0.29429947708714355, + "grad_norm": 2.1283679008483887, + "learning_rate": 7.087476177746351e-05, + "loss": 0.4500577449798584, + "step": 68550 + }, + { + "epoch": 0.2943424091771636, + "grad_norm": 2.402839183807373, + "learning_rate": 7.087045005734589e-05, + "loss": 0.25330147743225095, + "step": 68560 + }, + { + "epoch": 0.2943853412671836, + "grad_norm": 0.008127194829285145, + "learning_rate": 7.086613833722826e-05, + "loss": 0.1757526159286499, + "step": 68570 + }, + { + "epoch": 0.29442827335720356, + "grad_norm": 1.464109182357788, + "learning_rate": 7.086182661711064e-05, + "loss": 0.18842368125915526, + "step": 68580 + }, + { + "epoch": 0.2944712054472236, + "grad_norm": 0.08519778400659561, + "learning_rate": 7.0857514896993e-05, + "loss": 0.309142017364502, + "step": 68590 + }, + { + "epoch": 0.2945141375372436, + "grad_norm": 0.08478078246116638, + "learning_rate": 7.085320317687538e-05, + "loss": 0.18999476432800294, + "step": 68600 + }, + { + "epoch": 0.2945570696272636, + "grad_norm": 0.05416850745677948, + "learning_rate": 7.084889145675776e-05, + "loss": 0.2712838649749756, + "step": 68610 + }, + { + "epoch": 0.2946000017172836, + "grad_norm": 0.4658629596233368, + "learning_rate": 7.084457973664014e-05, + "loss": 0.11947870254516602, + "step": 68620 + }, + { + "epoch": 0.2946429338073036, + "grad_norm": 0.005190864205360413, + "learning_rate": 7.084026801652252e-05, + "loss": 0.15439292192459106, + "step": 68630 + }, + { + "epoch": 0.2946858658973236, + "grad_norm": 1.495048999786377, + "learning_rate": 7.083595629640489e-05, + "loss": 0.1966106414794922, + "step": 68640 + }, + { + "epoch": 0.29472879798734364, + "grad_norm": 0.19685158133506775, + "learning_rate": 7.083164457628727e-05, + "loss": 0.09233436584472657, + "step": 68650 + }, + { + "epoch": 0.2947717300773636, + "grad_norm": 0.1266477257013321, + "learning_rate": 7.082733285616965e-05, + "loss": 0.17769633531570433, + "step": 68660 + }, + { + "epoch": 0.2948146621673836, + "grad_norm": 0.002195771085098386, + "learning_rate": 7.082302113605201e-05, + "loss": 0.13989298343658446, + "step": 68670 + }, + { + "epoch": 0.29485759425740365, + "grad_norm": 0.7707890868186951, + "learning_rate": 7.081870941593439e-05, + "loss": 0.05584725141525269, + "step": 68680 + }, + { + "epoch": 0.29490052634742364, + "grad_norm": 0.5744310617446899, + "learning_rate": 7.081439769581677e-05, + "loss": 0.3359058141708374, + "step": 68690 + }, + { + "epoch": 0.29494345843744363, + "grad_norm": 1.2864564657211304, + "learning_rate": 7.081008597569914e-05, + "loss": 0.19174413681030272, + "step": 68700 + }, + { + "epoch": 0.29498639052746367, + "grad_norm": 0.052966032177209854, + "learning_rate": 7.080577425558152e-05, + "loss": 0.12695059776306153, + "step": 68710 + }, + { + "epoch": 0.29502932261748366, + "grad_norm": 1.6851404905319214, + "learning_rate": 7.08014625354639e-05, + "loss": 0.29085845947265626, + "step": 68720 + }, + { + "epoch": 0.29507225470750365, + "grad_norm": 3.938408851623535, + "learning_rate": 7.079715081534628e-05, + "loss": 0.31063499450683596, + "step": 68730 + }, + { + "epoch": 0.2951151867975237, + "grad_norm": 1.0033713579177856, + "learning_rate": 7.079283909522865e-05, + "loss": 0.3633733749389648, + "step": 68740 + }, + { + "epoch": 0.2951581188875437, + "grad_norm": 0.31211256980895996, + "learning_rate": 7.078852737511103e-05, + "loss": 0.1458314061164856, + "step": 68750 + }, + { + "epoch": 0.2952010509775637, + "grad_norm": 0.009130554273724556, + "learning_rate": 7.078421565499341e-05, + "loss": 0.36089942455291746, + "step": 68760 + }, + { + "epoch": 0.2952439830675837, + "grad_norm": 1.35874605178833, + "learning_rate": 7.077990393487578e-05, + "loss": 0.10489203929901122, + "step": 68770 + }, + { + "epoch": 0.2952869151576037, + "grad_norm": 7.000607490539551, + "learning_rate": 7.077559221475816e-05, + "loss": 0.3951005458831787, + "step": 68780 + }, + { + "epoch": 0.29532984724762373, + "grad_norm": 0.020006684586405754, + "learning_rate": 7.077128049464054e-05, + "loss": 0.3116251230239868, + "step": 68790 + }, + { + "epoch": 0.2953727793376437, + "grad_norm": 0.32888463139533997, + "learning_rate": 7.076696877452292e-05, + "loss": 0.1497477889060974, + "step": 68800 + }, + { + "epoch": 0.2954157114276637, + "grad_norm": 5.472551345825195, + "learning_rate": 7.07626570544053e-05, + "loss": 0.2952433109283447, + "step": 68810 + }, + { + "epoch": 0.29545864351768375, + "grad_norm": 0.719142496585846, + "learning_rate": 7.075834533428767e-05, + "loss": 0.28906295299530027, + "step": 68820 + }, + { + "epoch": 0.29550157560770374, + "grad_norm": 0.2726041376590729, + "learning_rate": 7.075403361417005e-05, + "loss": 0.19550033807754516, + "step": 68830 + }, + { + "epoch": 0.2955445076977237, + "grad_norm": 1.4349033832550049, + "learning_rate": 7.074972189405241e-05, + "loss": 0.15906682014465331, + "step": 68840 + }, + { + "epoch": 0.29558743978774377, + "grad_norm": 0.06669703871011734, + "learning_rate": 7.074541017393479e-05, + "loss": 0.21837499141693115, + "step": 68850 + }, + { + "epoch": 0.29563037187776375, + "grad_norm": 2.129164934158325, + "learning_rate": 7.074109845381717e-05, + "loss": 0.04467359185218811, + "step": 68860 + }, + { + "epoch": 0.29567330396778374, + "grad_norm": 1.0307523012161255, + "learning_rate": 7.073678673369954e-05, + "loss": 0.19184517860412598, + "step": 68870 + }, + { + "epoch": 0.2957162360578038, + "grad_norm": 2.2985711097717285, + "learning_rate": 7.073247501358192e-05, + "loss": 0.09638724327087403, + "step": 68880 + }, + { + "epoch": 0.29575916814782377, + "grad_norm": 1.2666059732437134, + "learning_rate": 7.07281632934643e-05, + "loss": 0.16064202785491943, + "step": 68890 + }, + { + "epoch": 0.29580210023784376, + "grad_norm": 2.663952350616455, + "learning_rate": 7.072385157334668e-05, + "loss": 0.21061155796051026, + "step": 68900 + }, + { + "epoch": 0.2958450323278638, + "grad_norm": 5.3934645652771, + "learning_rate": 7.071953985322905e-05, + "loss": 0.20032162666320802, + "step": 68910 + }, + { + "epoch": 0.2958879644178838, + "grad_norm": 1.9548537731170654, + "learning_rate": 7.071522813311142e-05, + "loss": 0.3333185911178589, + "step": 68920 + }, + { + "epoch": 0.2959308965079038, + "grad_norm": 5.220151901245117, + "learning_rate": 7.07109164129938e-05, + "loss": 0.32145798206329346, + "step": 68930 + }, + { + "epoch": 0.2959738285979238, + "grad_norm": 1.7756158113479614, + "learning_rate": 7.070660469287617e-05, + "loss": 0.3678144931793213, + "step": 68940 + }, + { + "epoch": 0.2960167606879438, + "grad_norm": 0.0175294429063797, + "learning_rate": 7.070229297275855e-05, + "loss": 0.1472090721130371, + "step": 68950 + }, + { + "epoch": 0.2960596927779638, + "grad_norm": 0.5595688819885254, + "learning_rate": 7.069798125264093e-05, + "loss": 0.4218907833099365, + "step": 68960 + }, + { + "epoch": 0.29610262486798383, + "grad_norm": 0.03466328606009483, + "learning_rate": 7.06936695325233e-05, + "loss": 0.28534204959869386, + "step": 68970 + }, + { + "epoch": 0.2961455569580038, + "grad_norm": 0.37167027592658997, + "learning_rate": 7.068935781240568e-05, + "loss": 0.20634891986846923, + "step": 68980 + }, + { + "epoch": 0.29618848904802386, + "grad_norm": 2.8753232955932617, + "learning_rate": 7.068504609228806e-05, + "loss": 0.13908033370971679, + "step": 68990 + }, + { + "epoch": 0.29623142113804385, + "grad_norm": 0.08938822895288467, + "learning_rate": 7.068073437217044e-05, + "loss": 0.11308764219284058, + "step": 69000 + }, + { + "epoch": 0.29623142113804385, + "eval_loss": 0.43790289759635925, + "eval_runtime": 27.3935, + "eval_samples_per_second": 3.65, + "eval_steps_per_second": 3.65, + "step": 69000 + }, + { + "epoch": 0.29627435322806384, + "grad_norm": 1.1804087162017822, + "learning_rate": 7.067642265205281e-05, + "loss": 0.22331843376159669, + "step": 69010 + }, + { + "epoch": 0.2963172853180839, + "grad_norm": 3.558773994445801, + "learning_rate": 7.067211093193519e-05, + "loss": 0.26021552085876465, + "step": 69020 + }, + { + "epoch": 0.29636021740810387, + "grad_norm": 0.1038755252957344, + "learning_rate": 7.066779921181757e-05, + "loss": 0.17431793212890626, + "step": 69030 + }, + { + "epoch": 0.29640314949812385, + "grad_norm": 0.0383944995701313, + "learning_rate": 7.066348749169995e-05, + "loss": 0.05176795721054077, + "step": 69040 + }, + { + "epoch": 0.2964460815881439, + "grad_norm": 0.09961086511611938, + "learning_rate": 7.065917577158232e-05, + "loss": 0.190877366065979, + "step": 69050 + }, + { + "epoch": 0.2964890136781639, + "grad_norm": 0.02250022441148758, + "learning_rate": 7.06548640514647e-05, + "loss": 0.2138050079345703, + "step": 69060 + }, + { + "epoch": 0.29653194576818387, + "grad_norm": 0.122157022356987, + "learning_rate": 7.065055233134708e-05, + "loss": 0.30884108543395994, + "step": 69070 + }, + { + "epoch": 0.2965748778582039, + "grad_norm": 6.911462783813477, + "learning_rate": 7.064624061122945e-05, + "loss": 0.09625717401504516, + "step": 69080 + }, + { + "epoch": 0.2966178099482239, + "grad_norm": 0.8818526268005371, + "learning_rate": 7.064192889111182e-05, + "loss": 0.09233015179634094, + "step": 69090 + }, + { + "epoch": 0.2966607420382439, + "grad_norm": 1.978317141532898, + "learning_rate": 7.06376171709942e-05, + "loss": 0.12204514741897583, + "step": 69100 + }, + { + "epoch": 0.29670367412826393, + "grad_norm": 0.002704236889258027, + "learning_rate": 7.063330545087657e-05, + "loss": 0.3270133018493652, + "step": 69110 + }, + { + "epoch": 0.2967466062182839, + "grad_norm": 3.32967209815979, + "learning_rate": 7.062899373075895e-05, + "loss": 0.32077441215515134, + "step": 69120 + }, + { + "epoch": 0.2967895383083039, + "grad_norm": 2.797989845275879, + "learning_rate": 7.062468201064133e-05, + "loss": 0.14576636552810668, + "step": 69130 + }, + { + "epoch": 0.29683247039832394, + "grad_norm": 0.3258748948574066, + "learning_rate": 7.06203702905237e-05, + "loss": 0.3708260774612427, + "step": 69140 + }, + { + "epoch": 0.29687540248834393, + "grad_norm": 0.020132817327976227, + "learning_rate": 7.061605857040608e-05, + "loss": 0.17735791206359863, + "step": 69150 + }, + { + "epoch": 0.2969183345783639, + "grad_norm": 0.07214616239070892, + "learning_rate": 7.061174685028846e-05, + "loss": 0.31556928157806396, + "step": 69160 + }, + { + "epoch": 0.29696126666838396, + "grad_norm": 0.4991401731967926, + "learning_rate": 7.060743513017082e-05, + "loss": 0.12665189504623414, + "step": 69170 + }, + { + "epoch": 0.29700419875840395, + "grad_norm": 0.6309173703193665, + "learning_rate": 7.06031234100532e-05, + "loss": 0.27704677581787107, + "step": 69180 + }, + { + "epoch": 0.297047130848424, + "grad_norm": 2.7611091136932373, + "learning_rate": 7.059881168993558e-05, + "loss": 0.17989909648895264, + "step": 69190 + }, + { + "epoch": 0.297090062938444, + "grad_norm": 0.04773523658514023, + "learning_rate": 7.059449996981796e-05, + "loss": 0.13966834545135498, + "step": 69200 + }, + { + "epoch": 0.29713299502846396, + "grad_norm": 8.029800415039062, + "learning_rate": 7.059018824970033e-05, + "loss": 0.2252267599105835, + "step": 69210 + }, + { + "epoch": 0.297175927118484, + "grad_norm": 38.85501480102539, + "learning_rate": 7.058587652958271e-05, + "loss": 0.1387203812599182, + "step": 69220 + }, + { + "epoch": 0.297218859208504, + "grad_norm": 3.258969783782959, + "learning_rate": 7.05815648094651e-05, + "loss": 0.1611212134361267, + "step": 69230 + }, + { + "epoch": 0.297261791298524, + "grad_norm": 1.5080939531326294, + "learning_rate": 7.057725308934748e-05, + "loss": 0.25683202743530276, + "step": 69240 + }, + { + "epoch": 0.297304723388544, + "grad_norm": 0.06847647577524185, + "learning_rate": 7.057294136922984e-05, + "loss": 0.14771102666854857, + "step": 69250 + }, + { + "epoch": 0.297347655478564, + "grad_norm": 0.033461250364780426, + "learning_rate": 7.056862964911222e-05, + "loss": 0.24316053390502929, + "step": 69260 + }, + { + "epoch": 0.297390587568584, + "grad_norm": 1.9887748956680298, + "learning_rate": 7.05643179289946e-05, + "loss": 0.4812785148620605, + "step": 69270 + }, + { + "epoch": 0.29743351965860404, + "grad_norm": 1.3982434272766113, + "learning_rate": 7.056000620887697e-05, + "loss": 0.2308140754699707, + "step": 69280 + }, + { + "epoch": 0.297476451748624, + "grad_norm": 0.10920906066894531, + "learning_rate": 7.055569448875935e-05, + "loss": 0.10632593631744384, + "step": 69290 + }, + { + "epoch": 0.297519383838644, + "grad_norm": 0.03187263384461403, + "learning_rate": 7.055138276864173e-05, + "loss": 0.24314732551574708, + "step": 69300 + }, + { + "epoch": 0.29756231592866406, + "grad_norm": 0.7231481671333313, + "learning_rate": 7.05470710485241e-05, + "loss": 0.1606484532356262, + "step": 69310 + }, + { + "epoch": 0.29760524801868404, + "grad_norm": 0.023674849420785904, + "learning_rate": 7.054275932840648e-05, + "loss": 0.28617658615112307, + "step": 69320 + }, + { + "epoch": 0.29764818010870403, + "grad_norm": 0.12634412944316864, + "learning_rate": 7.053844760828885e-05, + "loss": 0.168298602104187, + "step": 69330 + }, + { + "epoch": 0.2976911121987241, + "grad_norm": 0.1226951852440834, + "learning_rate": 7.053413588817123e-05, + "loss": 0.30879578590393064, + "step": 69340 + }, + { + "epoch": 0.29773404428874406, + "grad_norm": 0.12052540481090546, + "learning_rate": 7.05298241680536e-05, + "loss": 0.2331835985183716, + "step": 69350 + }, + { + "epoch": 0.29777697637876405, + "grad_norm": 3.8223581314086914, + "learning_rate": 7.052551244793598e-05, + "loss": 0.23448073863983154, + "step": 69360 + }, + { + "epoch": 0.2978199084687841, + "grad_norm": 0.14944325387477875, + "learning_rate": 7.052120072781836e-05, + "loss": 0.20286400318145753, + "step": 69370 + }, + { + "epoch": 0.2978628405588041, + "grad_norm": 0.1501256376504898, + "learning_rate": 7.051688900770073e-05, + "loss": 0.30396556854248047, + "step": 69380 + }, + { + "epoch": 0.29790577264882406, + "grad_norm": 3.373429775238037, + "learning_rate": 7.051257728758311e-05, + "loss": 0.27126803398132326, + "step": 69390 + }, + { + "epoch": 0.2979487047388441, + "grad_norm": 14.590777397155762, + "learning_rate": 7.050826556746549e-05, + "loss": 0.29711642265319826, + "step": 69400 + }, + { + "epoch": 0.2979916368288641, + "grad_norm": 2.8559489250183105, + "learning_rate": 7.050395384734785e-05, + "loss": 0.21114108562469483, + "step": 69410 + }, + { + "epoch": 0.29803456891888414, + "grad_norm": 0.48258069157600403, + "learning_rate": 7.049964212723023e-05, + "loss": 0.13180533647537232, + "step": 69420 + }, + { + "epoch": 0.2980775010089041, + "grad_norm": 0.030082767829298973, + "learning_rate": 7.049533040711261e-05, + "loss": 0.10944579839706421, + "step": 69430 + }, + { + "epoch": 0.2981204330989241, + "grad_norm": 0.02870088629424572, + "learning_rate": 7.049101868699499e-05, + "loss": 0.3478094577789307, + "step": 69440 + }, + { + "epoch": 0.29816336518894415, + "grad_norm": 2.597738027572632, + "learning_rate": 7.048670696687738e-05, + "loss": 0.23423190116882325, + "step": 69450 + }, + { + "epoch": 0.29820629727896414, + "grad_norm": 2.4963412284851074, + "learning_rate": 7.048239524675975e-05, + "loss": 0.17621252536773682, + "step": 69460 + }, + { + "epoch": 0.2982492293689841, + "grad_norm": 0.020082371309399605, + "learning_rate": 7.047808352664213e-05, + "loss": 0.253519868850708, + "step": 69470 + }, + { + "epoch": 0.29829216145900417, + "grad_norm": 0.0038464032113552094, + "learning_rate": 7.047377180652451e-05, + "loss": 0.2777001619338989, + "step": 69480 + }, + { + "epoch": 0.29833509354902416, + "grad_norm": 0.08930832892656326, + "learning_rate": 7.046946008640689e-05, + "loss": 0.12960487604141235, + "step": 69490 + }, + { + "epoch": 0.29837802563904414, + "grad_norm": 3.2200212478637695, + "learning_rate": 7.046514836628925e-05, + "loss": 0.2296536684036255, + "step": 69500 + }, + { + "epoch": 0.2984209577290642, + "grad_norm": 1.753504991531372, + "learning_rate": 7.046083664617163e-05, + "loss": 0.13932392597198487, + "step": 69510 + }, + { + "epoch": 0.29846388981908417, + "grad_norm": 2.262216091156006, + "learning_rate": 7.0456524926054e-05, + "loss": 0.2738852262496948, + "step": 69520 + }, + { + "epoch": 0.29850682190910416, + "grad_norm": 0.021889301016926765, + "learning_rate": 7.045221320593638e-05, + "loss": 0.23410627841949463, + "step": 69530 + }, + { + "epoch": 0.2985497539991242, + "grad_norm": 0.9385930299758911, + "learning_rate": 7.044790148581876e-05, + "loss": 0.2929945468902588, + "step": 69540 + }, + { + "epoch": 0.2985926860891442, + "grad_norm": 0.025515921413898468, + "learning_rate": 7.044358976570114e-05, + "loss": 0.1091389536857605, + "step": 69550 + }, + { + "epoch": 0.2986356181791642, + "grad_norm": 1.3053462505340576, + "learning_rate": 7.043927804558351e-05, + "loss": 0.37116103172302245, + "step": 69560 + }, + { + "epoch": 0.2986785502691842, + "grad_norm": 1.738684058189392, + "learning_rate": 7.043496632546589e-05, + "loss": 0.30579180717468263, + "step": 69570 + }, + { + "epoch": 0.2987214823592042, + "grad_norm": 0.06269654631614685, + "learning_rate": 7.043065460534825e-05, + "loss": 0.17386873960494995, + "step": 69580 + }, + { + "epoch": 0.2987644144492242, + "grad_norm": 4.997331142425537, + "learning_rate": 7.042634288523063e-05, + "loss": 0.3920506715774536, + "step": 69590 + }, + { + "epoch": 0.29880734653924423, + "grad_norm": 0.6335716843605042, + "learning_rate": 7.042203116511301e-05, + "loss": 0.16370421648025513, + "step": 69600 + }, + { + "epoch": 0.2988502786292642, + "grad_norm": 0.012116850353777409, + "learning_rate": 7.041771944499539e-05, + "loss": 0.11120028495788574, + "step": 69610 + }, + { + "epoch": 0.29889321071928426, + "grad_norm": 0.7054433226585388, + "learning_rate": 7.041340772487776e-05, + "loss": 0.1680054783821106, + "step": 69620 + }, + { + "epoch": 0.29893614280930425, + "grad_norm": 0.049377284944057465, + "learning_rate": 7.040909600476014e-05, + "loss": 0.19020957946777345, + "step": 69630 + }, + { + "epoch": 0.29897907489932424, + "grad_norm": 0.00042023861897177994, + "learning_rate": 7.040478428464252e-05, + "loss": 0.177628231048584, + "step": 69640 + }, + { + "epoch": 0.2990220069893443, + "grad_norm": 0.062201373279094696, + "learning_rate": 7.04004725645249e-05, + "loss": 0.20424156188964843, + "step": 69650 + }, + { + "epoch": 0.29906493907936427, + "grad_norm": 0.0926150307059288, + "learning_rate": 7.039616084440726e-05, + "loss": 0.30764992237091066, + "step": 69660 + }, + { + "epoch": 0.29910787116938425, + "grad_norm": 0.006616492290049791, + "learning_rate": 7.039184912428965e-05, + "loss": 0.1089470624923706, + "step": 69670 + }, + { + "epoch": 0.2991508032594043, + "grad_norm": 1.036904215812683, + "learning_rate": 7.038753740417203e-05, + "loss": 0.23160901069641113, + "step": 69680 + }, + { + "epoch": 0.2991937353494243, + "grad_norm": 5.831925868988037, + "learning_rate": 7.03832256840544e-05, + "loss": 0.15994585752487184, + "step": 69690 + }, + { + "epoch": 0.29923666743944427, + "grad_norm": 0.011926480568945408, + "learning_rate": 7.037891396393678e-05, + "loss": 0.10858608484268188, + "step": 69700 + }, + { + "epoch": 0.2992795995294643, + "grad_norm": 2.732725143432617, + "learning_rate": 7.037460224381916e-05, + "loss": 0.327674412727356, + "step": 69710 + }, + { + "epoch": 0.2993225316194843, + "grad_norm": 0.10609839856624603, + "learning_rate": 7.037029052370154e-05, + "loss": 0.38145086765289304, + "step": 69720 + }, + { + "epoch": 0.2993654637095043, + "grad_norm": 1.7052667140960693, + "learning_rate": 7.036597880358391e-05, + "loss": 0.31780190467834474, + "step": 69730 + }, + { + "epoch": 0.29940839579952433, + "grad_norm": 0.3516540229320526, + "learning_rate": 7.036166708346628e-05, + "loss": 0.22891452312469482, + "step": 69740 + }, + { + "epoch": 0.2994513278895443, + "grad_norm": 6.445019721984863, + "learning_rate": 7.035735536334866e-05, + "loss": 0.1429811716079712, + "step": 69750 + }, + { + "epoch": 0.2994942599795643, + "grad_norm": 0.27430006861686707, + "learning_rate": 7.035304364323103e-05, + "loss": 0.15352680683135986, + "step": 69760 + }, + { + "epoch": 0.29953719206958435, + "grad_norm": 5.184157371520996, + "learning_rate": 7.034873192311341e-05, + "loss": 0.28881843090057374, + "step": 69770 + }, + { + "epoch": 0.29958012415960433, + "grad_norm": 0.6079696416854858, + "learning_rate": 7.034442020299579e-05, + "loss": 0.32554547786712645, + "step": 69780 + }, + { + "epoch": 0.2996230562496243, + "grad_norm": 1.9618726968765259, + "learning_rate": 7.034010848287817e-05, + "loss": 0.4187826156616211, + "step": 69790 + }, + { + "epoch": 0.29966598833964436, + "grad_norm": 1.6079884767532349, + "learning_rate": 7.033579676276054e-05, + "loss": 0.1687183380126953, + "step": 69800 + }, + { + "epoch": 0.29970892042966435, + "grad_norm": 2.7777047157287598, + "learning_rate": 7.033148504264292e-05, + "loss": 0.4267683506011963, + "step": 69810 + }, + { + "epoch": 0.29975185251968434, + "grad_norm": 1.4220722913742065, + "learning_rate": 7.03271733225253e-05, + "loss": 0.2763724088668823, + "step": 69820 + }, + { + "epoch": 0.2997947846097044, + "grad_norm": 0.5493017435073853, + "learning_rate": 7.032286160240766e-05, + "loss": 0.2672281265258789, + "step": 69830 + }, + { + "epoch": 0.29983771669972437, + "grad_norm": 0.06145060807466507, + "learning_rate": 7.031854988229004e-05, + "loss": 0.35633857250213624, + "step": 69840 + }, + { + "epoch": 0.2998806487897444, + "grad_norm": 0.10371090471744537, + "learning_rate": 7.031423816217242e-05, + "loss": 0.328914475440979, + "step": 69850 + }, + { + "epoch": 0.2999235808797644, + "grad_norm": 2.0575599670410156, + "learning_rate": 7.030992644205479e-05, + "loss": 0.49924526214599607, + "step": 69860 + }, + { + "epoch": 0.2999665129697844, + "grad_norm": 2.1705596446990967, + "learning_rate": 7.030561472193717e-05, + "loss": 0.24522743225097657, + "step": 69870 + }, + { + "epoch": 0.3000094450598044, + "grad_norm": 0.055285509675741196, + "learning_rate": 7.030130300181955e-05, + "loss": 0.15846848487854004, + "step": 69880 + }, + { + "epoch": 0.3000523771498244, + "grad_norm": 3.6779916286468506, + "learning_rate": 7.029699128170192e-05, + "loss": 0.29879536628723147, + "step": 69890 + }, + { + "epoch": 0.3000953092398444, + "grad_norm": 12.207128524780273, + "learning_rate": 7.02926795615843e-05, + "loss": 0.31245851516723633, + "step": 69900 + }, + { + "epoch": 0.30013824132986444, + "grad_norm": 0.012846590019762516, + "learning_rate": 7.028836784146668e-05, + "loss": 0.2062364101409912, + "step": 69910 + }, + { + "epoch": 0.30018117341988443, + "grad_norm": 0.05187515914440155, + "learning_rate": 7.028405612134906e-05, + "loss": 0.18940675258636475, + "step": 69920 + }, + { + "epoch": 0.3002241055099044, + "grad_norm": 1.3474045991897583, + "learning_rate": 7.027974440123143e-05, + "loss": 0.38222217559814453, + "step": 69930 + }, + { + "epoch": 0.30026703759992446, + "grad_norm": 0.007387023419141769, + "learning_rate": 7.027543268111381e-05, + "loss": 0.1886347770690918, + "step": 69940 + }, + { + "epoch": 0.30030996968994444, + "grad_norm": 0.03471825644373894, + "learning_rate": 7.027112096099619e-05, + "loss": 0.20559465885162354, + "step": 69950 + }, + { + "epoch": 0.30035290177996443, + "grad_norm": 0.9603984951972961, + "learning_rate": 7.026680924087857e-05, + "loss": 0.23707988262176513, + "step": 69960 + }, + { + "epoch": 0.3003958338699845, + "grad_norm": 0.7676080465316772, + "learning_rate": 7.026249752076094e-05, + "loss": 0.17447848320007325, + "step": 69970 + }, + { + "epoch": 0.30043876596000446, + "grad_norm": 0.2487894594669342, + "learning_rate": 7.025818580064332e-05, + "loss": 0.06069231033325195, + "step": 69980 + }, + { + "epoch": 0.30048169805002445, + "grad_norm": 1.2301409244537354, + "learning_rate": 7.025387408052568e-05, + "loss": 0.15381040573120117, + "step": 69990 + }, + { + "epoch": 0.3005246301400445, + "grad_norm": 1.970497727394104, + "learning_rate": 7.024956236040806e-05, + "loss": 0.1707751750946045, + "step": 70000 + }, + { + "epoch": 0.3005246301400445, + "eval_loss": 0.46616458892822266, + "eval_runtime": 27.4872, + "eval_samples_per_second": 3.638, + "eval_steps_per_second": 3.638, + "step": 70000 + }, + { + "epoch": 0.3005675622300645, + "grad_norm": 0.060369785875082016, + "learning_rate": 7.024525064029044e-05, + "loss": 0.07141480445861817, + "step": 70010 + }, + { + "epoch": 0.30061049432008446, + "grad_norm": 5.950823783874512, + "learning_rate": 7.024093892017282e-05, + "loss": 0.28537325859069823, + "step": 70020 + }, + { + "epoch": 0.3006534264101045, + "grad_norm": 0.11006926745176315, + "learning_rate": 7.02366272000552e-05, + "loss": 0.2802592754364014, + "step": 70030 + }, + { + "epoch": 0.3006963585001245, + "grad_norm": 0.1244291216135025, + "learning_rate": 7.023231547993757e-05, + "loss": 0.04770747721195221, + "step": 70040 + }, + { + "epoch": 0.30073929059014454, + "grad_norm": 0.04021500051021576, + "learning_rate": 7.022800375981995e-05, + "loss": 0.1004258632659912, + "step": 70050 + }, + { + "epoch": 0.3007822226801645, + "grad_norm": 2.83384108543396, + "learning_rate": 7.022369203970233e-05, + "loss": 0.2835664987564087, + "step": 70060 + }, + { + "epoch": 0.3008251547701845, + "grad_norm": 0.002115644747391343, + "learning_rate": 7.021938031958469e-05, + "loss": 0.3332578897476196, + "step": 70070 + }, + { + "epoch": 0.30086808686020455, + "grad_norm": 0.010867947712540627, + "learning_rate": 7.021506859946707e-05, + "loss": 0.22531168460845946, + "step": 70080 + }, + { + "epoch": 0.30091101895022454, + "grad_norm": 0.05015119910240173, + "learning_rate": 7.021075687934944e-05, + "loss": 0.05852286815643311, + "step": 70090 + }, + { + "epoch": 0.3009539510402445, + "grad_norm": 2.656311273574829, + "learning_rate": 7.020644515923182e-05, + "loss": 0.24830961227416992, + "step": 70100 + }, + { + "epoch": 0.30099688313026457, + "grad_norm": 1.9714524745941162, + "learning_rate": 7.02021334391142e-05, + "loss": 0.23079366683959962, + "step": 70110 + }, + { + "epoch": 0.30103981522028456, + "grad_norm": 0.1279212236404419, + "learning_rate": 7.019782171899658e-05, + "loss": 0.46315436363220214, + "step": 70120 + }, + { + "epoch": 0.30108274731030454, + "grad_norm": 2.974700450897217, + "learning_rate": 7.019350999887895e-05, + "loss": 0.3252775430679321, + "step": 70130 + }, + { + "epoch": 0.3011256794003246, + "grad_norm": 0.02138546295464039, + "learning_rate": 7.018919827876133e-05, + "loss": 0.2596855401992798, + "step": 70140 + }, + { + "epoch": 0.3011686114903446, + "grad_norm": 0.11534173041582108, + "learning_rate": 7.018488655864371e-05, + "loss": 0.149857234954834, + "step": 70150 + }, + { + "epoch": 0.30121154358036456, + "grad_norm": 23.620689392089844, + "learning_rate": 7.018057483852609e-05, + "loss": 0.26149740219116213, + "step": 70160 + }, + { + "epoch": 0.3012544756703846, + "grad_norm": 0.0409373976290226, + "learning_rate": 7.017626311840846e-05, + "loss": 0.09922696948051453, + "step": 70170 + }, + { + "epoch": 0.3012974077604046, + "grad_norm": 0.11758884787559509, + "learning_rate": 7.017195139829084e-05, + "loss": 0.14113497734069824, + "step": 70180 + }, + { + "epoch": 0.3013403398504246, + "grad_norm": 1.1054000854492188, + "learning_rate": 7.016763967817322e-05, + "loss": 0.04130702018737793, + "step": 70190 + }, + { + "epoch": 0.3013832719404446, + "grad_norm": 0.002341791521757841, + "learning_rate": 7.01633279580556e-05, + "loss": 0.1668557643890381, + "step": 70200 + }, + { + "epoch": 0.3014262040304646, + "grad_norm": 2.24662184715271, + "learning_rate": 7.015901623793797e-05, + "loss": 0.22402334213256836, + "step": 70210 + }, + { + "epoch": 0.3014691361204846, + "grad_norm": 0.0016270908527076244, + "learning_rate": 7.015470451782035e-05, + "loss": 0.20528647899627686, + "step": 70220 + }, + { + "epoch": 0.30151206821050464, + "grad_norm": 3.114729642868042, + "learning_rate": 7.015039279770273e-05, + "loss": 0.050827699899673465, + "step": 70230 + }, + { + "epoch": 0.3015550003005246, + "grad_norm": 0.05784850940108299, + "learning_rate": 7.014608107758509e-05, + "loss": 0.2820718288421631, + "step": 70240 + }, + { + "epoch": 0.3015979323905446, + "grad_norm": 2.5486347675323486, + "learning_rate": 7.014176935746747e-05, + "loss": 0.2247065305709839, + "step": 70250 + }, + { + "epoch": 0.30164086448056465, + "grad_norm": 0.03793482482433319, + "learning_rate": 7.013745763734985e-05, + "loss": 0.37115206718444826, + "step": 70260 + }, + { + "epoch": 0.30168379657058464, + "grad_norm": 2.88460373878479, + "learning_rate": 7.013314591723222e-05, + "loss": 0.31713476181030276, + "step": 70270 + }, + { + "epoch": 0.3017267286606047, + "grad_norm": 0.14875128865242004, + "learning_rate": 7.01288341971146e-05, + "loss": 0.14124822616577148, + "step": 70280 + }, + { + "epoch": 0.30176966075062467, + "grad_norm": 0.025722453370690346, + "learning_rate": 7.012452247699698e-05, + "loss": 0.4006799697875977, + "step": 70290 + }, + { + "epoch": 0.30181259284064466, + "grad_norm": 0.25134965777397156, + "learning_rate": 7.012021075687936e-05, + "loss": 0.24204401969909667, + "step": 70300 + }, + { + "epoch": 0.3018555249306647, + "grad_norm": 0.04924899339675903, + "learning_rate": 7.011589903676173e-05, + "loss": 0.13902003765106202, + "step": 70310 + }, + { + "epoch": 0.3018984570206847, + "grad_norm": 0.13278919458389282, + "learning_rate": 7.01115873166441e-05, + "loss": 0.24274742603302002, + "step": 70320 + }, + { + "epoch": 0.30194138911070467, + "grad_norm": 0.0036019114777445793, + "learning_rate": 7.010727559652647e-05, + "loss": 0.13215283155441285, + "step": 70330 + }, + { + "epoch": 0.3019843212007247, + "grad_norm": 0.4529847800731659, + "learning_rate": 7.010296387640885e-05, + "loss": 0.2583324909210205, + "step": 70340 + }, + { + "epoch": 0.3020272532907447, + "grad_norm": 23.820642471313477, + "learning_rate": 7.009865215629123e-05, + "loss": 0.20266332626342773, + "step": 70350 + }, + { + "epoch": 0.3020701853807647, + "grad_norm": 0.018192386254668236, + "learning_rate": 7.00943404361736e-05, + "loss": 0.2031033754348755, + "step": 70360 + }, + { + "epoch": 0.30211311747078473, + "grad_norm": 0.046048324555158615, + "learning_rate": 7.009002871605598e-05, + "loss": 0.1358464002609253, + "step": 70370 + }, + { + "epoch": 0.3021560495608047, + "grad_norm": 0.020841889083385468, + "learning_rate": 7.008571699593836e-05, + "loss": 0.057205605506896975, + "step": 70380 + }, + { + "epoch": 0.3021989816508247, + "grad_norm": 0.0114286495372653, + "learning_rate": 7.008140527582074e-05, + "loss": 0.2284595251083374, + "step": 70390 + }, + { + "epoch": 0.30224191374084475, + "grad_norm": 0.003460024017840624, + "learning_rate": 7.007709355570312e-05, + "loss": 0.18587533235549927, + "step": 70400 + }, + { + "epoch": 0.30228484583086473, + "grad_norm": 0.02072158455848694, + "learning_rate": 7.007278183558549e-05, + "loss": 0.14335600137710572, + "step": 70410 + }, + { + "epoch": 0.3023277779208847, + "grad_norm": 0.047993358224630356, + "learning_rate": 7.006847011546787e-05, + "loss": 0.21287600994110106, + "step": 70420 + }, + { + "epoch": 0.30237071001090476, + "grad_norm": 0.08274947106838226, + "learning_rate": 7.006415839535025e-05, + "loss": 0.2267000913619995, + "step": 70430 + }, + { + "epoch": 0.30241364210092475, + "grad_norm": 0.0346342995762825, + "learning_rate": 7.005984667523262e-05, + "loss": 0.19687675237655639, + "step": 70440 + }, + { + "epoch": 0.30245657419094474, + "grad_norm": 0.008197636343538761, + "learning_rate": 7.0055534955115e-05, + "loss": 0.29254143238067626, + "step": 70450 + }, + { + "epoch": 0.3024995062809648, + "grad_norm": 0.18406546115875244, + "learning_rate": 7.005122323499738e-05, + "loss": 0.1060758113861084, + "step": 70460 + }, + { + "epoch": 0.30254243837098477, + "grad_norm": 2.8977458477020264, + "learning_rate": 7.004691151487976e-05, + "loss": 0.2420039415359497, + "step": 70470 + }, + { + "epoch": 0.3025853704610048, + "grad_norm": 1.7334059476852417, + "learning_rate": 7.004259979476212e-05, + "loss": 0.13808833360671996, + "step": 70480 + }, + { + "epoch": 0.3026283025510248, + "grad_norm": 0.10556143522262573, + "learning_rate": 7.00382880746445e-05, + "loss": 0.15416749715805053, + "step": 70490 + }, + { + "epoch": 0.3026712346410448, + "grad_norm": 1.188407301902771, + "learning_rate": 7.003397635452688e-05, + "loss": 0.1022602915763855, + "step": 70500 + }, + { + "epoch": 0.3027141667310648, + "grad_norm": 0.012010055594146252, + "learning_rate": 7.002966463440925e-05, + "loss": 0.06482537388801575, + "step": 70510 + }, + { + "epoch": 0.3027570988210848, + "grad_norm": 0.19251011312007904, + "learning_rate": 7.002535291429163e-05, + "loss": 0.5247401714324951, + "step": 70520 + }, + { + "epoch": 0.3028000309111048, + "grad_norm": 0.18594901263713837, + "learning_rate": 7.002104119417401e-05, + "loss": 0.3951719760894775, + "step": 70530 + }, + { + "epoch": 0.30284296300112484, + "grad_norm": 2.7943243980407715, + "learning_rate": 7.001672947405638e-05, + "loss": 0.4492646217346191, + "step": 70540 + }, + { + "epoch": 0.30288589509114483, + "grad_norm": 0.014538971707224846, + "learning_rate": 7.001241775393876e-05, + "loss": 0.05542449355125427, + "step": 70550 + }, + { + "epoch": 0.3029288271811648, + "grad_norm": 1.1423956155776978, + "learning_rate": 7.000810603382114e-05, + "loss": 0.2628788948059082, + "step": 70560 + }, + { + "epoch": 0.30297175927118486, + "grad_norm": 2.0259108543395996, + "learning_rate": 7.00037943137035e-05, + "loss": 0.313212251663208, + "step": 70570 + }, + { + "epoch": 0.30301469136120485, + "grad_norm": 0.6860762238502502, + "learning_rate": 6.999948259358588e-05, + "loss": 0.31650581359863283, + "step": 70580 + }, + { + "epoch": 0.30305762345122483, + "grad_norm": 0.48423272371292114, + "learning_rate": 6.999517087346826e-05, + "loss": 0.5084094524383544, + "step": 70590 + }, + { + "epoch": 0.3031005555412449, + "grad_norm": 0.3925926387310028, + "learning_rate": 6.999085915335063e-05, + "loss": 0.2517316102981567, + "step": 70600 + }, + { + "epoch": 0.30314348763126486, + "grad_norm": 2.200504779815674, + "learning_rate": 6.998654743323301e-05, + "loss": 0.42492990493774413, + "step": 70610 + }, + { + "epoch": 0.30318641972128485, + "grad_norm": 0.0720943883061409, + "learning_rate": 6.998223571311539e-05, + "loss": 0.1834435820579529, + "step": 70620 + }, + { + "epoch": 0.3032293518113049, + "grad_norm": 0.38618120551109314, + "learning_rate": 6.997792399299777e-05, + "loss": 0.11539990901947021, + "step": 70630 + }, + { + "epoch": 0.3032722839013249, + "grad_norm": 0.1201120987534523, + "learning_rate": 6.997361227288016e-05, + "loss": 0.3394498348236084, + "step": 70640 + }, + { + "epoch": 0.30331521599134487, + "grad_norm": 1.1716912984848022, + "learning_rate": 6.996930055276252e-05, + "loss": 0.1997692346572876, + "step": 70650 + }, + { + "epoch": 0.3033581480813649, + "grad_norm": 0.014452880248427391, + "learning_rate": 6.99649888326449e-05, + "loss": 0.3649930953979492, + "step": 70660 + }, + { + "epoch": 0.3034010801713849, + "grad_norm": 0.08201514929533005, + "learning_rate": 6.996067711252728e-05, + "loss": 0.2555066108703613, + "step": 70670 + }, + { + "epoch": 0.3034440122614049, + "grad_norm": 0.1451413631439209, + "learning_rate": 6.995636539240965e-05, + "loss": 0.10937966108322143, + "step": 70680 + }, + { + "epoch": 0.3034869443514249, + "grad_norm": 5.239630222320557, + "learning_rate": 6.995205367229203e-05, + "loss": 0.3625665187835693, + "step": 70690 + }, + { + "epoch": 0.3035298764414449, + "grad_norm": 0.008332494646310806, + "learning_rate": 6.994774195217441e-05, + "loss": 0.2099602222442627, + "step": 70700 + }, + { + "epoch": 0.30357280853146495, + "grad_norm": 0.022745434194803238, + "learning_rate": 6.994343023205679e-05, + "loss": 0.10325167179107667, + "step": 70710 + }, + { + "epoch": 0.30361574062148494, + "grad_norm": 0.854358434677124, + "learning_rate": 6.993911851193916e-05, + "loss": 0.14770570993423462, + "step": 70720 + }, + { + "epoch": 0.30365867271150493, + "grad_norm": 0.2198524922132492, + "learning_rate": 6.993480679182153e-05, + "loss": 0.3121105432510376, + "step": 70730 + }, + { + "epoch": 0.30370160480152497, + "grad_norm": 0.21637657284736633, + "learning_rate": 6.99304950717039e-05, + "loss": 0.28340332508087157, + "step": 70740 + }, + { + "epoch": 0.30374453689154496, + "grad_norm": 1.856569766998291, + "learning_rate": 6.992618335158628e-05, + "loss": 0.2528532028198242, + "step": 70750 + }, + { + "epoch": 0.30378746898156495, + "grad_norm": 0.015768440440297127, + "learning_rate": 6.992187163146866e-05, + "loss": 0.3581262111663818, + "step": 70760 + }, + { + "epoch": 0.303830401071585, + "grad_norm": 1.2636868953704834, + "learning_rate": 6.991755991135104e-05, + "loss": 0.25124645233154297, + "step": 70770 + }, + { + "epoch": 0.303873333161605, + "grad_norm": 0.03816381096839905, + "learning_rate": 6.991324819123341e-05, + "loss": 0.13545433282852173, + "step": 70780 + }, + { + "epoch": 0.30391626525162496, + "grad_norm": 29.71138572692871, + "learning_rate": 6.990893647111579e-05, + "loss": 0.26535260677337646, + "step": 70790 + }, + { + "epoch": 0.303959197341645, + "grad_norm": 0.0020745396614074707, + "learning_rate": 6.990462475099817e-05, + "loss": 0.08012622594833374, + "step": 70800 + }, + { + "epoch": 0.304002129431665, + "grad_norm": 0.01548818126320839, + "learning_rate": 6.990031303088053e-05, + "loss": 0.22493376731872558, + "step": 70810 + }, + { + "epoch": 0.304045061521685, + "grad_norm": 0.6447784304618835, + "learning_rate": 6.989600131076291e-05, + "loss": 0.2580659627914429, + "step": 70820 + }, + { + "epoch": 0.304087993611705, + "grad_norm": 0.018228799104690552, + "learning_rate": 6.989168959064529e-05, + "loss": 0.08474425077438355, + "step": 70830 + }, + { + "epoch": 0.304130925701725, + "grad_norm": 1.223036527633667, + "learning_rate": 6.988737787052766e-05, + "loss": 0.5076069831848145, + "step": 70840 + }, + { + "epoch": 0.304173857791745, + "grad_norm": 67.93170166015625, + "learning_rate": 6.988306615041004e-05, + "loss": 0.23856496810913086, + "step": 70850 + }, + { + "epoch": 0.30421678988176504, + "grad_norm": 1.5760152339935303, + "learning_rate": 6.987875443029243e-05, + "loss": 0.3166388988494873, + "step": 70860 + }, + { + "epoch": 0.304259721971785, + "grad_norm": 0.09973370283842087, + "learning_rate": 6.987444271017481e-05, + "loss": 0.1548475742340088, + "step": 70870 + }, + { + "epoch": 0.304302654061805, + "grad_norm": 0.08377603441476822, + "learning_rate": 6.987013099005719e-05, + "loss": 0.3000709295272827, + "step": 70880 + }, + { + "epoch": 0.30434558615182505, + "grad_norm": 1.1292842626571655, + "learning_rate": 6.986581926993955e-05, + "loss": 0.2452075958251953, + "step": 70890 + }, + { + "epoch": 0.30438851824184504, + "grad_norm": 0.024154068902134895, + "learning_rate": 6.986150754982193e-05, + "loss": 0.12045891284942627, + "step": 70900 + }, + { + "epoch": 0.3044314503318651, + "grad_norm": 3.3469178676605225, + "learning_rate": 6.98571958297043e-05, + "loss": 0.2237870454788208, + "step": 70910 + }, + { + "epoch": 0.30447438242188507, + "grad_norm": 0.032343361526727676, + "learning_rate": 6.985288410958668e-05, + "loss": 0.15854912996292114, + "step": 70920 + }, + { + "epoch": 0.30451731451190506, + "grad_norm": 0.8938616514205933, + "learning_rate": 6.984857238946906e-05, + "loss": 0.29627814292907717, + "step": 70930 + }, + { + "epoch": 0.3045602466019251, + "grad_norm": 0.006615492049604654, + "learning_rate": 6.984426066935144e-05, + "loss": 0.10426685810089112, + "step": 70940 + }, + { + "epoch": 0.3046031786919451, + "grad_norm": 0.1046452522277832, + "learning_rate": 6.983994894923381e-05, + "loss": 0.23253164291381836, + "step": 70950 + }, + { + "epoch": 0.3046461107819651, + "grad_norm": 0.8799397349357605, + "learning_rate": 6.983563722911619e-05, + "loss": 0.1915224313735962, + "step": 70960 + }, + { + "epoch": 0.3046890428719851, + "grad_norm": 0.2595174312591553, + "learning_rate": 6.983132550899857e-05, + "loss": 0.12004296779632569, + "step": 70970 + }, + { + "epoch": 0.3047319749620051, + "grad_norm": 0.1250929832458496, + "learning_rate": 6.982701378888093e-05, + "loss": 0.2012336254119873, + "step": 70980 + }, + { + "epoch": 0.3047749070520251, + "grad_norm": 0.023574380204081535, + "learning_rate": 6.982270206876331e-05, + "loss": 0.2144022226333618, + "step": 70990 + }, + { + "epoch": 0.30481783914204513, + "grad_norm": 8.696722984313965, + "learning_rate": 6.981839034864569e-05, + "loss": 0.17126057147979737, + "step": 71000 + }, + { + "epoch": 0.30481783914204513, + "eval_loss": 0.4333540201187134, + "eval_runtime": 27.4472, + "eval_samples_per_second": 3.643, + "eval_steps_per_second": 3.643, + "step": 71000 + }, + { + "epoch": 0.3048607712320651, + "grad_norm": 0.08526608347892761, + "learning_rate": 6.981407862852807e-05, + "loss": 0.0943373441696167, + "step": 71010 + }, + { + "epoch": 0.3049037033220851, + "grad_norm": 0.00957313273102045, + "learning_rate": 6.980976690841044e-05, + "loss": 0.2425380229949951, + "step": 71020 + }, + { + "epoch": 0.30494663541210515, + "grad_norm": 0.13296842575073242, + "learning_rate": 6.980545518829282e-05, + "loss": 0.21628108024597167, + "step": 71030 + }, + { + "epoch": 0.30498956750212514, + "grad_norm": 0.4499169886112213, + "learning_rate": 6.98011434681752e-05, + "loss": 0.31735036373138426, + "step": 71040 + }, + { + "epoch": 0.3050324995921451, + "grad_norm": 2.7393083572387695, + "learning_rate": 6.979683174805757e-05, + "loss": 0.49162960052490234, + "step": 71050 + }, + { + "epoch": 0.30507543168216517, + "grad_norm": 17.325475692749023, + "learning_rate": 6.979252002793994e-05, + "loss": 0.2496135950088501, + "step": 71060 + }, + { + "epoch": 0.30511836377218515, + "grad_norm": 0.9037123918533325, + "learning_rate": 6.978820830782232e-05, + "loss": 0.2521227836608887, + "step": 71070 + }, + { + "epoch": 0.30516129586220514, + "grad_norm": 2.061171293258667, + "learning_rate": 6.97838965877047e-05, + "loss": 0.3289464235305786, + "step": 71080 + }, + { + "epoch": 0.3052042279522252, + "grad_norm": 0.059992242604494095, + "learning_rate": 6.977958486758708e-05, + "loss": 0.1082227110862732, + "step": 71090 + }, + { + "epoch": 0.30524716004224517, + "grad_norm": 0.16897283494472504, + "learning_rate": 6.977527314746946e-05, + "loss": 0.27176353931427, + "step": 71100 + }, + { + "epoch": 0.30529009213226516, + "grad_norm": 1.2346889972686768, + "learning_rate": 6.977096142735184e-05, + "loss": 0.3244923114776611, + "step": 71110 + }, + { + "epoch": 0.3053330242222852, + "grad_norm": 1.0333776473999023, + "learning_rate": 6.976664970723422e-05, + "loss": 0.28680243492126467, + "step": 71120 + }, + { + "epoch": 0.3053759563123052, + "grad_norm": 3.706778049468994, + "learning_rate": 6.97623379871166e-05, + "loss": 0.28829283714294435, + "step": 71130 + }, + { + "epoch": 0.30541888840232523, + "grad_norm": 0.7992908358573914, + "learning_rate": 6.975802626699896e-05, + "loss": 0.19528234004974365, + "step": 71140 + }, + { + "epoch": 0.3054618204923452, + "grad_norm": 0.07427681982517242, + "learning_rate": 6.975371454688133e-05, + "loss": 0.1444224238395691, + "step": 71150 + }, + { + "epoch": 0.3055047525823652, + "grad_norm": 0.055710192769765854, + "learning_rate": 6.974940282676371e-05, + "loss": 0.14568614959716797, + "step": 71160 + }, + { + "epoch": 0.30554768467238524, + "grad_norm": 0.008985698223114014, + "learning_rate": 6.974509110664609e-05, + "loss": 0.41927037239074705, + "step": 71170 + }, + { + "epoch": 0.30559061676240523, + "grad_norm": 0.0678943544626236, + "learning_rate": 6.974077938652847e-05, + "loss": 0.24876203536987304, + "step": 71180 + }, + { + "epoch": 0.3056335488524252, + "grad_norm": 0.27909696102142334, + "learning_rate": 6.973646766641084e-05, + "loss": 0.20138399600982665, + "step": 71190 + }, + { + "epoch": 0.30567648094244526, + "grad_norm": 0.06332625448703766, + "learning_rate": 6.973215594629322e-05, + "loss": 0.41253089904785156, + "step": 71200 + }, + { + "epoch": 0.30571941303246525, + "grad_norm": 3.1160600185394287, + "learning_rate": 6.97278442261756e-05, + "loss": 0.4033061027526855, + "step": 71210 + }, + { + "epoch": 0.30576234512248524, + "grad_norm": 0.02980274148285389, + "learning_rate": 6.972353250605796e-05, + "loss": 0.19519845247268677, + "step": 71220 + }, + { + "epoch": 0.3058052772125053, + "grad_norm": 0.04582500085234642, + "learning_rate": 6.971922078594034e-05, + "loss": 0.21372950077056885, + "step": 71230 + }, + { + "epoch": 0.30584820930252526, + "grad_norm": 1.7604162693023682, + "learning_rate": 6.971490906582272e-05, + "loss": 0.14104797840118408, + "step": 71240 + }, + { + "epoch": 0.30589114139254525, + "grad_norm": 2.9321041107177734, + "learning_rate": 6.97105973457051e-05, + "loss": 0.27418735027313235, + "step": 71250 + }, + { + "epoch": 0.3059340734825653, + "grad_norm": 0.4169948101043701, + "learning_rate": 6.970628562558747e-05, + "loss": 0.2097461462020874, + "step": 71260 + }, + { + "epoch": 0.3059770055725853, + "grad_norm": 0.19751910865306854, + "learning_rate": 6.970197390546985e-05, + "loss": 0.10453450679779053, + "step": 71270 + }, + { + "epoch": 0.30601993766260527, + "grad_norm": 5.6523637771606445, + "learning_rate": 6.969766218535223e-05, + "loss": 0.37703814506530764, + "step": 71280 + }, + { + "epoch": 0.3060628697526253, + "grad_norm": 0.0058932071551680565, + "learning_rate": 6.96933504652346e-05, + "loss": 0.11596277952194214, + "step": 71290 + }, + { + "epoch": 0.3061058018426453, + "grad_norm": 2.179844617843628, + "learning_rate": 6.968903874511698e-05, + "loss": 0.3431446075439453, + "step": 71300 + }, + { + "epoch": 0.3061487339326653, + "grad_norm": 4.3101043701171875, + "learning_rate": 6.968472702499936e-05, + "loss": 0.21665070056915284, + "step": 71310 + }, + { + "epoch": 0.3061916660226853, + "grad_norm": 0.08224528282880783, + "learning_rate": 6.968041530488174e-05, + "loss": 0.2836304664611816, + "step": 71320 + }, + { + "epoch": 0.3062345981127053, + "grad_norm": 1.7605150938034058, + "learning_rate": 6.967610358476411e-05, + "loss": 0.1909969210624695, + "step": 71330 + }, + { + "epoch": 0.30627753020272536, + "grad_norm": 0.15108336508274078, + "learning_rate": 6.967179186464649e-05, + "loss": 0.19276317358016967, + "step": 71340 + }, + { + "epoch": 0.30632046229274534, + "grad_norm": 0.17520533502101898, + "learning_rate": 6.966748014452887e-05, + "loss": 0.3102010011672974, + "step": 71350 + }, + { + "epoch": 0.30636339438276533, + "grad_norm": 0.004262985661625862, + "learning_rate": 6.966316842441125e-05, + "loss": 0.21410059928894043, + "step": 71360 + }, + { + "epoch": 0.3064063264727854, + "grad_norm": 0.07552611082792282, + "learning_rate": 6.965885670429362e-05, + "loss": 0.12926443815231323, + "step": 71370 + }, + { + "epoch": 0.30644925856280536, + "grad_norm": 0.0392976850271225, + "learning_rate": 6.9654544984176e-05, + "loss": 0.3050688743591309, + "step": 71380 + }, + { + "epoch": 0.30649219065282535, + "grad_norm": 0.03745187073945999, + "learning_rate": 6.965023326405836e-05, + "loss": 0.13380508422851561, + "step": 71390 + }, + { + "epoch": 0.3065351227428454, + "grad_norm": 6.481908798217773, + "learning_rate": 6.964592154394074e-05, + "loss": 0.3676483631134033, + "step": 71400 + }, + { + "epoch": 0.3065780548328654, + "grad_norm": 0.06287180632352829, + "learning_rate": 6.964160982382312e-05, + "loss": 0.20733683109283446, + "step": 71410 + }, + { + "epoch": 0.30662098692288536, + "grad_norm": 0.9534348249435425, + "learning_rate": 6.96372981037055e-05, + "loss": 0.3034853458404541, + "step": 71420 + }, + { + "epoch": 0.3066639190129054, + "grad_norm": 0.8343759179115295, + "learning_rate": 6.963298638358787e-05, + "loss": 0.18012170791625975, + "step": 71430 + }, + { + "epoch": 0.3067068511029254, + "grad_norm": 0.07756144553422928, + "learning_rate": 6.962867466347025e-05, + "loss": 0.29481561183929444, + "step": 71440 + }, + { + "epoch": 0.3067497831929454, + "grad_norm": 0.06801965832710266, + "learning_rate": 6.962436294335263e-05, + "loss": 0.3844615459442139, + "step": 71450 + }, + { + "epoch": 0.3067927152829654, + "grad_norm": 0.026445409283041954, + "learning_rate": 6.9620051223235e-05, + "loss": 0.33446853160858153, + "step": 71460 + }, + { + "epoch": 0.3068356473729854, + "grad_norm": 0.30800873041152954, + "learning_rate": 6.961573950311737e-05, + "loss": 0.18386055231094361, + "step": 71470 + }, + { + "epoch": 0.3068785794630054, + "grad_norm": 0.5849953889846802, + "learning_rate": 6.961142778299975e-05, + "loss": 0.1202467918395996, + "step": 71480 + }, + { + "epoch": 0.30692151155302544, + "grad_norm": 0.10062210261821747, + "learning_rate": 6.960711606288212e-05, + "loss": 0.23699476718902587, + "step": 71490 + }, + { + "epoch": 0.3069644436430454, + "grad_norm": 1.1067657470703125, + "learning_rate": 6.96028043427645e-05, + "loss": 0.2898426532745361, + "step": 71500 + }, + { + "epoch": 0.3070073757330654, + "grad_norm": 0.0480431467294693, + "learning_rate": 6.959849262264688e-05, + "loss": 0.18034435510635377, + "step": 71510 + }, + { + "epoch": 0.30705030782308546, + "grad_norm": 0.010232225991785526, + "learning_rate": 6.959418090252926e-05, + "loss": 0.20612285137176514, + "step": 71520 + }, + { + "epoch": 0.30709323991310544, + "grad_norm": 0.0968809723854065, + "learning_rate": 6.958986918241163e-05, + "loss": 0.20248787403106688, + "step": 71530 + }, + { + "epoch": 0.30713617200312543, + "grad_norm": 0.018053434789180756, + "learning_rate": 6.958555746229401e-05, + "loss": 0.34410016536712645, + "step": 71540 + }, + { + "epoch": 0.30717910409314547, + "grad_norm": 0.11472825706005096, + "learning_rate": 6.958124574217639e-05, + "loss": 0.283540153503418, + "step": 71550 + }, + { + "epoch": 0.30722203618316546, + "grad_norm": 0.10586521029472351, + "learning_rate": 6.957693402205876e-05, + "loss": 0.004880695044994355, + "step": 71560 + }, + { + "epoch": 0.3072649682731855, + "grad_norm": 4.804992198944092, + "learning_rate": 6.957262230194114e-05, + "loss": 0.2283937931060791, + "step": 71570 + }, + { + "epoch": 0.3073079003632055, + "grad_norm": 0.0098240552470088, + "learning_rate": 6.956831058182352e-05, + "loss": 0.21513535976409912, + "step": 71580 + }, + { + "epoch": 0.3073508324532255, + "grad_norm": 0.007225578185170889, + "learning_rate": 6.95639988617059e-05, + "loss": 0.1269293785095215, + "step": 71590 + }, + { + "epoch": 0.3073937645432455, + "grad_norm": 1.4325543642044067, + "learning_rate": 6.955968714158827e-05, + "loss": 0.23351283073425294, + "step": 71600 + }, + { + "epoch": 0.3074366966332655, + "grad_norm": 0.5364208221435547, + "learning_rate": 6.955537542147065e-05, + "loss": 0.1389673113822937, + "step": 71610 + }, + { + "epoch": 0.3074796287232855, + "grad_norm": 0.0047192987985908985, + "learning_rate": 6.955106370135303e-05, + "loss": 0.47339601516723634, + "step": 71620 + }, + { + "epoch": 0.30752256081330553, + "grad_norm": 0.027396438643336296, + "learning_rate": 6.95467519812354e-05, + "loss": 0.29778280258178713, + "step": 71630 + }, + { + "epoch": 0.3075654929033255, + "grad_norm": 0.6361259818077087, + "learning_rate": 6.954244026111777e-05, + "loss": 0.3184041500091553, + "step": 71640 + }, + { + "epoch": 0.3076084249933455, + "grad_norm": 0.19302690029144287, + "learning_rate": 6.953812854100015e-05, + "loss": 0.009601826965808868, + "step": 71650 + }, + { + "epoch": 0.30765135708336555, + "grad_norm": 2.3101580142974854, + "learning_rate": 6.953381682088252e-05, + "loss": 0.17634633779525757, + "step": 71660 + }, + { + "epoch": 0.30769428917338554, + "grad_norm": 0.10807690024375916, + "learning_rate": 6.95295051007649e-05, + "loss": 0.35539629459381106, + "step": 71670 + }, + { + "epoch": 0.3077372212634055, + "grad_norm": 0.008286534808576107, + "learning_rate": 6.952519338064728e-05, + "loss": 0.10938578844070435, + "step": 71680 + }, + { + "epoch": 0.30778015335342557, + "grad_norm": 0.8377420902252197, + "learning_rate": 6.952088166052966e-05, + "loss": 0.27280521392822266, + "step": 71690 + }, + { + "epoch": 0.30782308544344555, + "grad_norm": 5.454530715942383, + "learning_rate": 6.951656994041203e-05, + "loss": 0.4018850326538086, + "step": 71700 + }, + { + "epoch": 0.30786601753346554, + "grad_norm": 0.4904543161392212, + "learning_rate": 6.951225822029441e-05, + "loss": 0.391569995880127, + "step": 71710 + }, + { + "epoch": 0.3079089496234856, + "grad_norm": 0.6449229121208191, + "learning_rate": 6.950794650017678e-05, + "loss": 0.052797901630401614, + "step": 71720 + }, + { + "epoch": 0.30795188171350557, + "grad_norm": 2.045611619949341, + "learning_rate": 6.950363478005915e-05, + "loss": 0.2161909818649292, + "step": 71730 + }, + { + "epoch": 0.30799481380352556, + "grad_norm": 0.11193674057722092, + "learning_rate": 6.949932305994153e-05, + "loss": 0.2853853225708008, + "step": 71740 + }, + { + "epoch": 0.3080377458935456, + "grad_norm": 0.03395124524831772, + "learning_rate": 6.949501133982391e-05, + "loss": 0.38172454833984376, + "step": 71750 + }, + { + "epoch": 0.3080806779835656, + "grad_norm": 9.028738975524902, + "learning_rate": 6.949069961970628e-05, + "loss": 0.1702930212020874, + "step": 71760 + }, + { + "epoch": 0.30812361007358563, + "grad_norm": 0.923577606678009, + "learning_rate": 6.948638789958866e-05, + "loss": 0.2921849966049194, + "step": 71770 + }, + { + "epoch": 0.3081665421636056, + "grad_norm": 0.685820460319519, + "learning_rate": 6.948207617947104e-05, + "loss": 0.22313647270202636, + "step": 71780 + }, + { + "epoch": 0.3082094742536256, + "grad_norm": 3.264664888381958, + "learning_rate": 6.947776445935342e-05, + "loss": 0.05310268402099609, + "step": 71790 + }, + { + "epoch": 0.30825240634364565, + "grad_norm": 14.52664566040039, + "learning_rate": 6.94734527392358e-05, + "loss": 0.28851985931396484, + "step": 71800 + }, + { + "epoch": 0.30829533843366563, + "grad_norm": 13.558172225952148, + "learning_rate": 6.946914101911817e-05, + "loss": 0.054348152875900266, + "step": 71810 + }, + { + "epoch": 0.3083382705236856, + "grad_norm": 0.22968202829360962, + "learning_rate": 6.946482929900055e-05, + "loss": 0.23391096591949462, + "step": 71820 + }, + { + "epoch": 0.30838120261370566, + "grad_norm": 0.044967085123062134, + "learning_rate": 6.946051757888293e-05, + "loss": 0.35891730785369874, + "step": 71830 + }, + { + "epoch": 0.30842413470372565, + "grad_norm": 2.2250916957855225, + "learning_rate": 6.94562058587653e-05, + "loss": 0.28689022064208985, + "step": 71840 + }, + { + "epoch": 0.30846706679374564, + "grad_norm": 3.7705438137054443, + "learning_rate": 6.945189413864768e-05, + "loss": 0.3127246379852295, + "step": 71850 + }, + { + "epoch": 0.3085099988837657, + "grad_norm": 0.48120859265327454, + "learning_rate": 6.944758241853006e-05, + "loss": 0.17397620677947997, + "step": 71860 + }, + { + "epoch": 0.30855293097378567, + "grad_norm": 17.947019577026367, + "learning_rate": 6.944327069841244e-05, + "loss": 0.32586348056793213, + "step": 71870 + }, + { + "epoch": 0.30859586306380565, + "grad_norm": 1.6339031457901, + "learning_rate": 6.94389589782948e-05, + "loss": 0.3424349308013916, + "step": 71880 + }, + { + "epoch": 0.3086387951538257, + "grad_norm": 1.6345598697662354, + "learning_rate": 6.943464725817718e-05, + "loss": 0.07327746748924255, + "step": 71890 + }, + { + "epoch": 0.3086817272438457, + "grad_norm": 0.05315762758255005, + "learning_rate": 6.943033553805955e-05, + "loss": 0.2570945262908936, + "step": 71900 + }, + { + "epoch": 0.30872465933386567, + "grad_norm": 0.1542612761259079, + "learning_rate": 6.942602381794193e-05, + "loss": 0.34127881526947024, + "step": 71910 + }, + { + "epoch": 0.3087675914238857, + "grad_norm": 0.016485799103975296, + "learning_rate": 6.942171209782431e-05, + "loss": 0.2881777286529541, + "step": 71920 + }, + { + "epoch": 0.3088105235139057, + "grad_norm": 0.6435169577598572, + "learning_rate": 6.941740037770669e-05, + "loss": 0.2654039144515991, + "step": 71930 + }, + { + "epoch": 0.3088534556039257, + "grad_norm": 1.8059978485107422, + "learning_rate": 6.941308865758906e-05, + "loss": 0.27082371711730957, + "step": 71940 + }, + { + "epoch": 0.30889638769394573, + "grad_norm": 2.602182149887085, + "learning_rate": 6.940877693747144e-05, + "loss": 0.22784204483032228, + "step": 71950 + }, + { + "epoch": 0.3089393197839657, + "grad_norm": 2.098834991455078, + "learning_rate": 6.94044652173538e-05, + "loss": 0.32148375511169436, + "step": 71960 + }, + { + "epoch": 0.3089822518739857, + "grad_norm": 0.5187646150588989, + "learning_rate": 6.940015349723618e-05, + "loss": 0.15208892822265624, + "step": 71970 + }, + { + "epoch": 0.30902518396400575, + "grad_norm": 0.12590381503105164, + "learning_rate": 6.939584177711856e-05, + "loss": 0.15220378637313842, + "step": 71980 + }, + { + "epoch": 0.30906811605402573, + "grad_norm": 2.8593339920043945, + "learning_rate": 6.939153005700094e-05, + "loss": 0.668053674697876, + "step": 71990 + }, + { + "epoch": 0.3091110481440458, + "grad_norm": 0.256454735994339, + "learning_rate": 6.938721833688331e-05, + "loss": 0.17973003387451172, + "step": 72000 + }, + { + "epoch": 0.3091110481440458, + "eval_loss": 0.42271023988723755, + "eval_runtime": 27.4321, + "eval_samples_per_second": 3.645, + "eval_steps_per_second": 3.645, + "step": 72000 + }, + { + "epoch": 0.30915398023406576, + "grad_norm": 0.047602299600839615, + "learning_rate": 6.938290661676569e-05, + "loss": 0.2981420516967773, + "step": 72010 + }, + { + "epoch": 0.30919691232408575, + "grad_norm": 0.27392998337745667, + "learning_rate": 6.937859489664807e-05, + "loss": 0.25982108116149905, + "step": 72020 + }, + { + "epoch": 0.3092398444141058, + "grad_norm": 0.010714475996792316, + "learning_rate": 6.937428317653045e-05, + "loss": 0.28580350875854493, + "step": 72030 + }, + { + "epoch": 0.3092827765041258, + "grad_norm": 1.4530272483825684, + "learning_rate": 6.936997145641284e-05, + "loss": 0.1686078667640686, + "step": 72040 + }, + { + "epoch": 0.30932570859414577, + "grad_norm": 0.15697383880615234, + "learning_rate": 6.93656597362952e-05, + "loss": 0.10101045370101928, + "step": 72050 + }, + { + "epoch": 0.3093686406841658, + "grad_norm": 0.018088942393660545, + "learning_rate": 6.936134801617758e-05, + "loss": 0.16199415922164917, + "step": 72060 + }, + { + "epoch": 0.3094115727741858, + "grad_norm": 8.666268348693848, + "learning_rate": 6.935703629605996e-05, + "loss": 0.3642237424850464, + "step": 72070 + }, + { + "epoch": 0.3094545048642058, + "grad_norm": 0.11770039796829224, + "learning_rate": 6.935272457594233e-05, + "loss": 0.23574357032775878, + "step": 72080 + }, + { + "epoch": 0.3094974369542258, + "grad_norm": 0.0496184416115284, + "learning_rate": 6.934841285582471e-05, + "loss": 0.05372920632362366, + "step": 72090 + }, + { + "epoch": 0.3095403690442458, + "grad_norm": 29.345449447631836, + "learning_rate": 6.934410113570709e-05, + "loss": 0.2644599437713623, + "step": 72100 + }, + { + "epoch": 0.3095833011342658, + "grad_norm": 2.0647425651550293, + "learning_rate": 6.933978941558946e-05, + "loss": 0.16928220987319947, + "step": 72110 + }, + { + "epoch": 0.30962623322428584, + "grad_norm": 0.017716316506266594, + "learning_rate": 6.933547769547184e-05, + "loss": 0.2569295406341553, + "step": 72120 + }, + { + "epoch": 0.3096691653143058, + "grad_norm": 0.0034017576836049557, + "learning_rate": 6.93311659753542e-05, + "loss": 0.20230543613433838, + "step": 72130 + }, + { + "epoch": 0.3097120974043258, + "grad_norm": 0.9167028069496155, + "learning_rate": 6.932685425523658e-05, + "loss": 0.22303533554077148, + "step": 72140 + }, + { + "epoch": 0.30975502949434586, + "grad_norm": 1.8220903873443604, + "learning_rate": 6.932254253511896e-05, + "loss": 0.28549671173095703, + "step": 72150 + }, + { + "epoch": 0.30979796158436584, + "grad_norm": 0.004097741097211838, + "learning_rate": 6.931823081500134e-05, + "loss": 0.07687397599220276, + "step": 72160 + }, + { + "epoch": 0.30984089367438583, + "grad_norm": 0.12848952412605286, + "learning_rate": 6.931391909488372e-05, + "loss": 0.18670018911361694, + "step": 72170 + }, + { + "epoch": 0.3098838257644059, + "grad_norm": 1.0430200099945068, + "learning_rate": 6.930960737476609e-05, + "loss": 0.27053136825561525, + "step": 72180 + }, + { + "epoch": 0.30992675785442586, + "grad_norm": 1.271836757659912, + "learning_rate": 6.930529565464847e-05, + "loss": 0.28454022407531737, + "step": 72190 + }, + { + "epoch": 0.3099696899444459, + "grad_norm": 0.024914631620049477, + "learning_rate": 6.930098393453085e-05, + "loss": 0.15766094923019408, + "step": 72200 + }, + { + "epoch": 0.3100126220344659, + "grad_norm": 2.382819652557373, + "learning_rate": 6.929667221441321e-05, + "loss": 0.32537169456481935, + "step": 72210 + }, + { + "epoch": 0.3100555541244859, + "grad_norm": 0.07590825110673904, + "learning_rate": 6.929236049429559e-05, + "loss": 0.2822270393371582, + "step": 72220 + }, + { + "epoch": 0.3100984862145059, + "grad_norm": 2.425994873046875, + "learning_rate": 6.928804877417797e-05, + "loss": 0.207551908493042, + "step": 72230 + }, + { + "epoch": 0.3101414183045259, + "grad_norm": 0.014780079945921898, + "learning_rate": 6.928373705406034e-05, + "loss": 0.2179882287979126, + "step": 72240 + }, + { + "epoch": 0.3101843503945459, + "grad_norm": 0.013886822387576103, + "learning_rate": 6.927942533394272e-05, + "loss": 0.3754821062088013, + "step": 72250 + }, + { + "epoch": 0.31022728248456594, + "grad_norm": 0.035959966480731964, + "learning_rate": 6.927511361382511e-05, + "loss": 0.1514652729034424, + "step": 72260 + }, + { + "epoch": 0.3102702145745859, + "grad_norm": 0.0032398924231529236, + "learning_rate": 6.927080189370749e-05, + "loss": 0.1065395712852478, + "step": 72270 + }, + { + "epoch": 0.3103131466646059, + "grad_norm": 0.05704474821686745, + "learning_rate": 6.926649017358987e-05, + "loss": 0.051316624879837035, + "step": 72280 + }, + { + "epoch": 0.31035607875462595, + "grad_norm": 0.048175569623708725, + "learning_rate": 6.926217845347223e-05, + "loss": 0.3055478572845459, + "step": 72290 + }, + { + "epoch": 0.31039901084464594, + "grad_norm": 1.6316146850585938, + "learning_rate": 6.925786673335461e-05, + "loss": 0.10036553144454956, + "step": 72300 + }, + { + "epoch": 0.3104419429346659, + "grad_norm": 0.14009523391723633, + "learning_rate": 6.925355501323698e-05, + "loss": 0.21372523307800292, + "step": 72310 + }, + { + "epoch": 0.31048487502468597, + "grad_norm": 0.007102209143340588, + "learning_rate": 6.924924329311936e-05, + "loss": 0.1738092541694641, + "step": 72320 + }, + { + "epoch": 0.31052780711470596, + "grad_norm": 1.0038173198699951, + "learning_rate": 6.924493157300174e-05, + "loss": 0.2518671989440918, + "step": 72330 + }, + { + "epoch": 0.31057073920472594, + "grad_norm": 0.21360069513320923, + "learning_rate": 6.924061985288412e-05, + "loss": 0.15478349924087526, + "step": 72340 + }, + { + "epoch": 0.310613671294746, + "grad_norm": 0.0035239006392657757, + "learning_rate": 6.92363081327665e-05, + "loss": 0.1550774335861206, + "step": 72350 + }, + { + "epoch": 0.310656603384766, + "grad_norm": 0.20199644565582275, + "learning_rate": 6.923199641264887e-05, + "loss": 0.22670469284057618, + "step": 72360 + }, + { + "epoch": 0.31069953547478596, + "grad_norm": 0.1095641553401947, + "learning_rate": 6.922768469253125e-05, + "loss": 0.34234280586242677, + "step": 72370 + }, + { + "epoch": 0.310742467564806, + "grad_norm": 0.009776102378964424, + "learning_rate": 6.922337297241361e-05, + "loss": 0.20321106910705566, + "step": 72380 + }, + { + "epoch": 0.310785399654826, + "grad_norm": 0.00039248340181075037, + "learning_rate": 6.921906125229599e-05, + "loss": 0.2770804166793823, + "step": 72390 + }, + { + "epoch": 0.310828331744846, + "grad_norm": 6.714611530303955, + "learning_rate": 6.921474953217837e-05, + "loss": 0.3584035873413086, + "step": 72400 + }, + { + "epoch": 0.310871263834866, + "grad_norm": 0.2857927680015564, + "learning_rate": 6.921043781206074e-05, + "loss": 0.1929535150527954, + "step": 72410 + }, + { + "epoch": 0.310914195924886, + "grad_norm": 1.3031257390975952, + "learning_rate": 6.920612609194312e-05, + "loss": 0.20522711277008057, + "step": 72420 + }, + { + "epoch": 0.31095712801490605, + "grad_norm": 0.08546615391969681, + "learning_rate": 6.92018143718255e-05, + "loss": 0.02166993319988251, + "step": 72430 + }, + { + "epoch": 0.31100006010492603, + "grad_norm": 1.505024790763855, + "learning_rate": 6.919750265170788e-05, + "loss": 0.45034017562866213, + "step": 72440 + }, + { + "epoch": 0.311042992194946, + "grad_norm": 0.022659284994006157, + "learning_rate": 6.919319093159025e-05, + "loss": 0.1870231509208679, + "step": 72450 + }, + { + "epoch": 0.31108592428496606, + "grad_norm": 1.7719067335128784, + "learning_rate": 6.918887921147262e-05, + "loss": 0.2802696228027344, + "step": 72460 + }, + { + "epoch": 0.31112885637498605, + "grad_norm": 2.1437034606933594, + "learning_rate": 6.9184567491355e-05, + "loss": 0.332187294960022, + "step": 72470 + }, + { + "epoch": 0.31117178846500604, + "grad_norm": 4.503712177276611, + "learning_rate": 6.918025577123739e-05, + "loss": 0.2613640308380127, + "step": 72480 + }, + { + "epoch": 0.3112147205550261, + "grad_norm": 2.8580639362335205, + "learning_rate": 6.917594405111976e-05, + "loss": 0.42650775909423827, + "step": 72490 + }, + { + "epoch": 0.31125765264504607, + "grad_norm": 0.2938775420188904, + "learning_rate": 6.917163233100214e-05, + "loss": 0.008270031958818435, + "step": 72500 + }, + { + "epoch": 0.31130058473506605, + "grad_norm": 0.04436694458127022, + "learning_rate": 6.916732061088452e-05, + "loss": 0.48149800300598145, + "step": 72510 + }, + { + "epoch": 0.3113435168250861, + "grad_norm": 0.09820667654275894, + "learning_rate": 6.91630088907669e-05, + "loss": 0.1050878882408142, + "step": 72520 + }, + { + "epoch": 0.3113864489151061, + "grad_norm": 0.04176689684391022, + "learning_rate": 6.915869717064927e-05, + "loss": 0.3850319623947144, + "step": 72530 + }, + { + "epoch": 0.31142938100512607, + "grad_norm": 2.246731996536255, + "learning_rate": 6.915438545053164e-05, + "loss": 0.2246485948562622, + "step": 72540 + }, + { + "epoch": 0.3114723130951461, + "grad_norm": 0.03071710281074047, + "learning_rate": 6.915007373041401e-05, + "loss": 0.2287735939025879, + "step": 72550 + }, + { + "epoch": 0.3115152451851661, + "grad_norm": 1.2058755159378052, + "learning_rate": 6.914576201029639e-05, + "loss": 0.3163428783416748, + "step": 72560 + }, + { + "epoch": 0.3115581772751861, + "grad_norm": 1.6118654012680054, + "learning_rate": 6.914145029017877e-05, + "loss": 0.43007454872131345, + "step": 72570 + }, + { + "epoch": 0.31160110936520613, + "grad_norm": 0.9232771396636963, + "learning_rate": 6.913713857006115e-05, + "loss": 0.2806085586547852, + "step": 72580 + }, + { + "epoch": 0.3116440414552261, + "grad_norm": 0.06702742725610733, + "learning_rate": 6.913282684994352e-05, + "loss": 0.3380074739456177, + "step": 72590 + }, + { + "epoch": 0.3116869735452461, + "grad_norm": 2.0718653202056885, + "learning_rate": 6.91285151298259e-05, + "loss": 0.26555030345916747, + "step": 72600 + }, + { + "epoch": 0.31172990563526615, + "grad_norm": 2.719993829727173, + "learning_rate": 6.912420340970828e-05, + "loss": 0.304264760017395, + "step": 72610 + }, + { + "epoch": 0.31177283772528613, + "grad_norm": 1.0876257419586182, + "learning_rate": 6.911989168959064e-05, + "loss": 0.13385069370269775, + "step": 72620 + }, + { + "epoch": 0.3118157698153062, + "grad_norm": 4.189262866973877, + "learning_rate": 6.911557996947302e-05, + "loss": 0.43100762367248535, + "step": 72630 + }, + { + "epoch": 0.31185870190532616, + "grad_norm": 0.024808503687381744, + "learning_rate": 6.91112682493554e-05, + "loss": 0.4125965118408203, + "step": 72640 + }, + { + "epoch": 0.31190163399534615, + "grad_norm": 0.04348301887512207, + "learning_rate": 6.910695652923777e-05, + "loss": 0.2724820852279663, + "step": 72650 + }, + { + "epoch": 0.3119445660853662, + "grad_norm": 25.774370193481445, + "learning_rate": 6.910264480912015e-05, + "loss": 0.16499853134155273, + "step": 72660 + }, + { + "epoch": 0.3119874981753862, + "grad_norm": 0.05208896845579147, + "learning_rate": 6.909833308900253e-05, + "loss": 0.2335808277130127, + "step": 72670 + }, + { + "epoch": 0.31203043026540617, + "grad_norm": 1.2792284488677979, + "learning_rate": 6.90940213688849e-05, + "loss": 0.2558072566986084, + "step": 72680 + }, + { + "epoch": 0.3120733623554262, + "grad_norm": 0.013050318695604801, + "learning_rate": 6.908970964876728e-05, + "loss": 0.27600100040435793, + "step": 72690 + }, + { + "epoch": 0.3121162944454462, + "grad_norm": 0.13687624037265778, + "learning_rate": 6.908539792864966e-05, + "loss": 0.2716856002807617, + "step": 72700 + }, + { + "epoch": 0.3121592265354662, + "grad_norm": 3.646947145462036, + "learning_rate": 6.908108620853204e-05, + "loss": 0.3618535757064819, + "step": 72710 + }, + { + "epoch": 0.3122021586254862, + "grad_norm": 0.8492512106895447, + "learning_rate": 6.907677448841441e-05, + "loss": 0.1546394109725952, + "step": 72720 + }, + { + "epoch": 0.3122450907155062, + "grad_norm": 0.43165260553359985, + "learning_rate": 6.907246276829679e-05, + "loss": 0.08699611425399781, + "step": 72730 + }, + { + "epoch": 0.3122880228055262, + "grad_norm": 2.5769057273864746, + "learning_rate": 6.906815104817917e-05, + "loss": 0.41105146408081056, + "step": 72740 + }, + { + "epoch": 0.31233095489554624, + "grad_norm": 0.024652574211359024, + "learning_rate": 6.906383932806155e-05, + "loss": 0.14959990978240967, + "step": 72750 + }, + { + "epoch": 0.31237388698556623, + "grad_norm": 0.3725256621837616, + "learning_rate": 6.905952760794392e-05, + "loss": 0.24816575050354003, + "step": 72760 + }, + { + "epoch": 0.3124168190755862, + "grad_norm": 0.0020951335318386555, + "learning_rate": 6.90552158878263e-05, + "loss": 0.20660154819488524, + "step": 72770 + }, + { + "epoch": 0.31245975116560626, + "grad_norm": 0.001740924664773047, + "learning_rate": 6.905090416770868e-05, + "loss": 0.2389366626739502, + "step": 72780 + }, + { + "epoch": 0.31250268325562625, + "grad_norm": 0.007729761768132448, + "learning_rate": 6.904659244759104e-05, + "loss": 0.1722651243209839, + "step": 72790 + }, + { + "epoch": 0.31254561534564623, + "grad_norm": 3.117405891418457, + "learning_rate": 6.904228072747342e-05, + "loss": 0.5300877571105957, + "step": 72800 + }, + { + "epoch": 0.3125885474356663, + "grad_norm": 4.650268077850342, + "learning_rate": 6.90379690073558e-05, + "loss": 0.46052017211914065, + "step": 72810 + }, + { + "epoch": 0.31263147952568626, + "grad_norm": 0.14857514202594757, + "learning_rate": 6.903365728723817e-05, + "loss": 0.30590009689331055, + "step": 72820 + }, + { + "epoch": 0.31267441161570625, + "grad_norm": 0.005492858123034239, + "learning_rate": 6.902934556712055e-05, + "loss": 0.21419360637664794, + "step": 72830 + }, + { + "epoch": 0.3127173437057263, + "grad_norm": 0.028469033539295197, + "learning_rate": 6.902503384700293e-05, + "loss": 0.23694248199462892, + "step": 72840 + }, + { + "epoch": 0.3127602757957463, + "grad_norm": 14.499115943908691, + "learning_rate": 6.90207221268853e-05, + "loss": 0.20476045608520507, + "step": 72850 + }, + { + "epoch": 0.3128032078857663, + "grad_norm": 0.03953484818339348, + "learning_rate": 6.901641040676768e-05, + "loss": 0.04689075648784637, + "step": 72860 + }, + { + "epoch": 0.3128461399757863, + "grad_norm": 0.0067932335659861565, + "learning_rate": 6.901209868665005e-05, + "loss": 0.22675859928131104, + "step": 72870 + }, + { + "epoch": 0.3128890720658063, + "grad_norm": 0.010385886766016483, + "learning_rate": 6.900778696653243e-05, + "loss": 0.25794124603271484, + "step": 72880 + }, + { + "epoch": 0.31293200415582634, + "grad_norm": 4.429790496826172, + "learning_rate": 6.90034752464148e-05, + "loss": 0.2344876766204834, + "step": 72890 + }, + { + "epoch": 0.3129749362458463, + "grad_norm": 0.017854005098342896, + "learning_rate": 6.899916352629718e-05, + "loss": 0.18946893215179444, + "step": 72900 + }, + { + "epoch": 0.3130178683358663, + "grad_norm": 4.130064487457275, + "learning_rate": 6.899485180617956e-05, + "loss": 0.25511124134063723, + "step": 72910 + }, + { + "epoch": 0.31306080042588635, + "grad_norm": 2.605947732925415, + "learning_rate": 6.899054008606193e-05, + "loss": 0.17304036617279053, + "step": 72920 + }, + { + "epoch": 0.31310373251590634, + "grad_norm": 0.005477610044181347, + "learning_rate": 6.898622836594431e-05, + "loss": 0.3045832872390747, + "step": 72930 + }, + { + "epoch": 0.31314666460592633, + "grad_norm": 0.02059878222644329, + "learning_rate": 6.898191664582669e-05, + "loss": 0.3340297222137451, + "step": 72940 + }, + { + "epoch": 0.31318959669594637, + "grad_norm": 0.27422747015953064, + "learning_rate": 6.897760492570907e-05, + "loss": 0.19570144414901733, + "step": 72950 + }, + { + "epoch": 0.31323252878596636, + "grad_norm": 0.03176238015294075, + "learning_rate": 6.897329320559144e-05, + "loss": 0.5011603355407714, + "step": 72960 + }, + { + "epoch": 0.31327546087598634, + "grad_norm": 0.1306491494178772, + "learning_rate": 6.896898148547382e-05, + "loss": 0.1884814977645874, + "step": 72970 + }, + { + "epoch": 0.3133183929660064, + "grad_norm": 0.5676948428153992, + "learning_rate": 6.89646697653562e-05, + "loss": 0.22620975971221924, + "step": 72980 + }, + { + "epoch": 0.3133613250560264, + "grad_norm": 1.5327205657958984, + "learning_rate": 6.896035804523858e-05, + "loss": 0.3974642515182495, + "step": 72990 + }, + { + "epoch": 0.31340425714604636, + "grad_norm": 0.1722613275051117, + "learning_rate": 6.895604632512095e-05, + "loss": 0.348058819770813, + "step": 73000 + }, + { + "epoch": 0.31340425714604636, + "eval_loss": 0.4194343686103821, + "eval_runtime": 27.4782, + "eval_samples_per_second": 3.639, + "eval_steps_per_second": 3.639, + "step": 73000 + }, + { + "epoch": 0.3134471892360664, + "grad_norm": 0.05100061744451523, + "learning_rate": 6.895173460500333e-05, + "loss": 0.08028401732444763, + "step": 73010 + }, + { + "epoch": 0.3134901213260864, + "grad_norm": 0.04568387567996979, + "learning_rate": 6.894742288488571e-05, + "loss": 0.14936983585357666, + "step": 73020 + }, + { + "epoch": 0.3135330534161064, + "grad_norm": 0.04025249928236008, + "learning_rate": 6.894311116476807e-05, + "loss": 0.3439459323883057, + "step": 73030 + }, + { + "epoch": 0.3135759855061264, + "grad_norm": 0.29795241355895996, + "learning_rate": 6.893879944465045e-05, + "loss": 0.3217799425125122, + "step": 73040 + }, + { + "epoch": 0.3136189175961464, + "grad_norm": 0.019797299057245255, + "learning_rate": 6.893448772453283e-05, + "loss": 0.200309419631958, + "step": 73050 + }, + { + "epoch": 0.31366184968616645, + "grad_norm": 1.5113530158996582, + "learning_rate": 6.89301760044152e-05, + "loss": 0.34020171165466306, + "step": 73060 + }, + { + "epoch": 0.31370478177618644, + "grad_norm": 0.3040628731250763, + "learning_rate": 6.892586428429758e-05, + "loss": 0.11433427333831787, + "step": 73070 + }, + { + "epoch": 0.3137477138662064, + "grad_norm": 0.3089415729045868, + "learning_rate": 6.892155256417996e-05, + "loss": 0.19839117527008057, + "step": 73080 + }, + { + "epoch": 0.31379064595622647, + "grad_norm": 0.00015485839685425162, + "learning_rate": 6.891724084406234e-05, + "loss": 0.22629897594451903, + "step": 73090 + }, + { + "epoch": 0.31383357804624645, + "grad_norm": 1.1724250316619873, + "learning_rate": 6.891292912394471e-05, + "loss": 0.164984393119812, + "step": 73100 + }, + { + "epoch": 0.31387651013626644, + "grad_norm": 6.634596347808838, + "learning_rate": 6.890861740382709e-05, + "loss": 0.2887473821640015, + "step": 73110 + }, + { + "epoch": 0.3139194422262865, + "grad_norm": 0.07911381870508194, + "learning_rate": 6.890430568370945e-05, + "loss": 0.23019626140594482, + "step": 73120 + }, + { + "epoch": 0.31396237431630647, + "grad_norm": 0.08132822811603546, + "learning_rate": 6.889999396359183e-05, + "loss": 0.13312993049621583, + "step": 73130 + }, + { + "epoch": 0.31400530640632646, + "grad_norm": 6.580303192138672, + "learning_rate": 6.889568224347421e-05, + "loss": 0.3567745447158813, + "step": 73140 + }, + { + "epoch": 0.3140482384963465, + "grad_norm": 0.11964696645736694, + "learning_rate": 6.889137052335659e-05, + "loss": 0.09761974215507507, + "step": 73150 + }, + { + "epoch": 0.3140911705863665, + "grad_norm": 2.271083116531372, + "learning_rate": 6.888705880323896e-05, + "loss": 0.29648566246032715, + "step": 73160 + }, + { + "epoch": 0.3141341026763865, + "grad_norm": 1.732062578201294, + "learning_rate": 6.888274708312134e-05, + "loss": 0.15959405899047852, + "step": 73170 + }, + { + "epoch": 0.3141770347664065, + "grad_norm": 0.026476643979549408, + "learning_rate": 6.887843536300372e-05, + "loss": 0.10188188552856445, + "step": 73180 + }, + { + "epoch": 0.3142199668564265, + "grad_norm": 0.05823148041963577, + "learning_rate": 6.88741236428861e-05, + "loss": 0.0034772615879774095, + "step": 73190 + }, + { + "epoch": 0.3142628989464465, + "grad_norm": 1.547411561012268, + "learning_rate": 6.886981192276847e-05, + "loss": 0.20585508346557618, + "step": 73200 + }, + { + "epoch": 0.31430583103646653, + "grad_norm": 0.08510097116231918, + "learning_rate": 6.886550020265085e-05, + "loss": 0.23898713588714598, + "step": 73210 + }, + { + "epoch": 0.3143487631264865, + "grad_norm": 0.7906961441040039, + "learning_rate": 6.886118848253323e-05, + "loss": 0.20077693462371826, + "step": 73220 + }, + { + "epoch": 0.3143916952165065, + "grad_norm": 11.559499740600586, + "learning_rate": 6.88568767624156e-05, + "loss": 0.3527076721191406, + "step": 73230 + }, + { + "epoch": 0.31443462730652655, + "grad_norm": 0.05195503309369087, + "learning_rate": 6.885256504229798e-05, + "loss": 0.20611882209777832, + "step": 73240 + }, + { + "epoch": 0.31447755939654654, + "grad_norm": 0.00046401165309362113, + "learning_rate": 6.884825332218036e-05, + "loss": 0.15216498374938964, + "step": 73250 + }, + { + "epoch": 0.3145204914865665, + "grad_norm": 0.16569949686527252, + "learning_rate": 6.884394160206274e-05, + "loss": 0.33675241470336914, + "step": 73260 + }, + { + "epoch": 0.31456342357658656, + "grad_norm": 3.5443081855773926, + "learning_rate": 6.883962988194511e-05, + "loss": 0.3302395582199097, + "step": 73270 + }, + { + "epoch": 0.31460635566660655, + "grad_norm": 1.1270393133163452, + "learning_rate": 6.883531816182748e-05, + "loss": 0.3283742666244507, + "step": 73280 + }, + { + "epoch": 0.3146492877566266, + "grad_norm": 2.613358974456787, + "learning_rate": 6.883100644170986e-05, + "loss": 0.3033371210098267, + "step": 73290 + }, + { + "epoch": 0.3146922198466466, + "grad_norm": 0.4473738670349121, + "learning_rate": 6.882669472159223e-05, + "loss": 0.11812324523925781, + "step": 73300 + }, + { + "epoch": 0.31473515193666657, + "grad_norm": 1.625314474105835, + "learning_rate": 6.882238300147461e-05, + "loss": 0.03831766247749328, + "step": 73310 + }, + { + "epoch": 0.3147780840266866, + "grad_norm": 0.1999804824590683, + "learning_rate": 6.881807128135699e-05, + "loss": 0.2408984899520874, + "step": 73320 + }, + { + "epoch": 0.3148210161167066, + "grad_norm": 0.01782737672328949, + "learning_rate": 6.881375956123936e-05, + "loss": 0.2375476837158203, + "step": 73330 + }, + { + "epoch": 0.3148639482067266, + "grad_norm": 0.005965593736618757, + "learning_rate": 6.880944784112174e-05, + "loss": 0.22016007900238038, + "step": 73340 + }, + { + "epoch": 0.3149068802967466, + "grad_norm": 0.18683241307735443, + "learning_rate": 6.880513612100412e-05, + "loss": 0.34534764289855957, + "step": 73350 + }, + { + "epoch": 0.3149498123867666, + "grad_norm": 0.07245934754610062, + "learning_rate": 6.880082440088648e-05, + "loss": 0.20898077487945557, + "step": 73360 + }, + { + "epoch": 0.3149927444767866, + "grad_norm": 0.06859976798295975, + "learning_rate": 6.879651268076886e-05, + "loss": 0.32060167789459226, + "step": 73370 + }, + { + "epoch": 0.31503567656680664, + "grad_norm": 0.044097669422626495, + "learning_rate": 6.879220096065124e-05, + "loss": 0.5895505428314209, + "step": 73380 + }, + { + "epoch": 0.31507860865682663, + "grad_norm": 3.05902099609375, + "learning_rate": 6.878788924053362e-05, + "loss": 0.2975430488586426, + "step": 73390 + }, + { + "epoch": 0.3151215407468466, + "grad_norm": 2.7847342491149902, + "learning_rate": 6.878357752041599e-05, + "loss": 0.24648573398590087, + "step": 73400 + }, + { + "epoch": 0.31516447283686666, + "grad_norm": 0.6280997395515442, + "learning_rate": 6.877926580029837e-05, + "loss": 0.16343621015548707, + "step": 73410 + }, + { + "epoch": 0.31520740492688665, + "grad_norm": 14.133800506591797, + "learning_rate": 6.877495408018075e-05, + "loss": 0.20844955444335939, + "step": 73420 + }, + { + "epoch": 0.31525033701690663, + "grad_norm": 0.0016242277342826128, + "learning_rate": 6.877064236006312e-05, + "loss": 0.3425706148147583, + "step": 73430 + }, + { + "epoch": 0.3152932691069267, + "grad_norm": 0.28610384464263916, + "learning_rate": 6.87663306399455e-05, + "loss": 0.1372280716896057, + "step": 73440 + }, + { + "epoch": 0.31533620119694666, + "grad_norm": 1.4336791038513184, + "learning_rate": 6.876201891982788e-05, + "loss": 0.4349491596221924, + "step": 73450 + }, + { + "epoch": 0.31537913328696665, + "grad_norm": 0.7685583233833313, + "learning_rate": 6.875770719971026e-05, + "loss": 0.13812743425369262, + "step": 73460 + }, + { + "epoch": 0.3154220653769867, + "grad_norm": 0.8320184946060181, + "learning_rate": 6.875339547959263e-05, + "loss": 0.2647475004196167, + "step": 73470 + }, + { + "epoch": 0.3154649974670067, + "grad_norm": 11.539794921875, + "learning_rate": 6.874908375947501e-05, + "loss": 0.2847954273223877, + "step": 73480 + }, + { + "epoch": 0.3155079295570267, + "grad_norm": 2.478747606277466, + "learning_rate": 6.874477203935739e-05, + "loss": 0.47316579818725585, + "step": 73490 + }, + { + "epoch": 0.3155508616470467, + "grad_norm": 0.12313472479581833, + "learning_rate": 6.874046031923977e-05, + "loss": 0.12275233268737792, + "step": 73500 + }, + { + "epoch": 0.3155937937370667, + "grad_norm": 0.18190690875053406, + "learning_rate": 6.873614859912214e-05, + "loss": 0.34535109996795654, + "step": 73510 + }, + { + "epoch": 0.31563672582708674, + "grad_norm": 0.02219126932322979, + "learning_rate": 6.873183687900452e-05, + "loss": 0.3690331935882568, + "step": 73520 + }, + { + "epoch": 0.3156796579171067, + "grad_norm": 0.7718960642814636, + "learning_rate": 6.872752515888688e-05, + "loss": 0.1922664999961853, + "step": 73530 + }, + { + "epoch": 0.3157225900071267, + "grad_norm": 17.84370994567871, + "learning_rate": 6.872321343876926e-05, + "loss": 0.20998814105987548, + "step": 73540 + }, + { + "epoch": 0.31576552209714676, + "grad_norm": 4.637472629547119, + "learning_rate": 6.871890171865164e-05, + "loss": 0.3144806146621704, + "step": 73550 + }, + { + "epoch": 0.31580845418716674, + "grad_norm": 4.393991470336914, + "learning_rate": 6.871458999853402e-05, + "loss": 0.39880125522613524, + "step": 73560 + }, + { + "epoch": 0.31585138627718673, + "grad_norm": 0.05799943953752518, + "learning_rate": 6.87102782784164e-05, + "loss": 0.230562424659729, + "step": 73570 + }, + { + "epoch": 0.3158943183672068, + "grad_norm": 0.9074521064758301, + "learning_rate": 6.870596655829877e-05, + "loss": 0.08199453353881836, + "step": 73580 + }, + { + "epoch": 0.31593725045722676, + "grad_norm": 0.012529193423688412, + "learning_rate": 6.870165483818115e-05, + "loss": 0.15863250494003295, + "step": 73590 + }, + { + "epoch": 0.31598018254724675, + "grad_norm": 0.0102114612236619, + "learning_rate": 6.869734311806353e-05, + "loss": 0.1947621464729309, + "step": 73600 + }, + { + "epoch": 0.3160231146372668, + "grad_norm": 0.08841849118471146, + "learning_rate": 6.869303139794589e-05, + "loss": 0.14853737354278565, + "step": 73610 + }, + { + "epoch": 0.3160660467272868, + "grad_norm": 3.2766990661621094, + "learning_rate": 6.868871967782827e-05, + "loss": 0.11773378849029541, + "step": 73620 + }, + { + "epoch": 0.31610897881730676, + "grad_norm": 0.004059187136590481, + "learning_rate": 6.868440795771064e-05, + "loss": 0.22974061965942383, + "step": 73630 + }, + { + "epoch": 0.3161519109073268, + "grad_norm": 0.0031375617254525423, + "learning_rate": 6.868009623759302e-05, + "loss": 0.22927279472351075, + "step": 73640 + }, + { + "epoch": 0.3161948429973468, + "grad_norm": 5.625444412231445, + "learning_rate": 6.86757845174754e-05, + "loss": 0.4188673496246338, + "step": 73650 + }, + { + "epoch": 0.3162377750873668, + "grad_norm": 0.02534160204231739, + "learning_rate": 6.867147279735778e-05, + "loss": 0.2420274257659912, + "step": 73660 + }, + { + "epoch": 0.3162807071773868, + "grad_norm": 1.61578369140625, + "learning_rate": 6.866716107724017e-05, + "loss": 0.2600194692611694, + "step": 73670 + }, + { + "epoch": 0.3163236392674068, + "grad_norm": 0.0764639675617218, + "learning_rate": 6.866284935712254e-05, + "loss": 0.38288352489471433, + "step": 73680 + }, + { + "epoch": 0.3163665713574268, + "grad_norm": 0.19459417462348938, + "learning_rate": 6.865853763700491e-05, + "loss": 0.06477647423744201, + "step": 73690 + }, + { + "epoch": 0.31640950344744684, + "grad_norm": 0.07276225835084915, + "learning_rate": 6.865422591688729e-05, + "loss": 0.0916989028453827, + "step": 73700 + }, + { + "epoch": 0.3164524355374668, + "grad_norm": 2.2095179557800293, + "learning_rate": 6.864991419676966e-05, + "loss": 0.3283759832382202, + "step": 73710 + }, + { + "epoch": 0.31649536762748687, + "grad_norm": 0.13781090080738068, + "learning_rate": 6.864560247665204e-05, + "loss": 0.34083831310272217, + "step": 73720 + }, + { + "epoch": 0.31653829971750685, + "grad_norm": 3.043226718902588, + "learning_rate": 6.864129075653442e-05, + "loss": 0.22454166412353516, + "step": 73730 + }, + { + "epoch": 0.31658123180752684, + "grad_norm": 0.4412704408168793, + "learning_rate": 6.86369790364168e-05, + "loss": 0.11458765268325806, + "step": 73740 + }, + { + "epoch": 0.3166241638975469, + "grad_norm": 4.447032451629639, + "learning_rate": 6.863266731629917e-05, + "loss": 0.3687690496444702, + "step": 73750 + }, + { + "epoch": 0.31666709598756687, + "grad_norm": 0.015125907026231289, + "learning_rate": 6.862835559618155e-05, + "loss": 0.08853949904441834, + "step": 73760 + }, + { + "epoch": 0.31671002807758686, + "grad_norm": 3.1621484756469727, + "learning_rate": 6.862404387606391e-05, + "loss": 0.46956467628479004, + "step": 73770 + }, + { + "epoch": 0.3167529601676069, + "grad_norm": 2.4286065101623535, + "learning_rate": 6.861973215594629e-05, + "loss": 0.15351874828338624, + "step": 73780 + }, + { + "epoch": 0.3167958922576269, + "grad_norm": 3.618520736694336, + "learning_rate": 6.861542043582867e-05, + "loss": 0.20479018688201905, + "step": 73790 + }, + { + "epoch": 0.3168388243476469, + "grad_norm": 0.027950339019298553, + "learning_rate": 6.861110871571105e-05, + "loss": 0.26309866905212403, + "step": 73800 + }, + { + "epoch": 0.3168817564376669, + "grad_norm": 0.4110260605812073, + "learning_rate": 6.860679699559342e-05, + "loss": 0.14416571855545043, + "step": 73810 + }, + { + "epoch": 0.3169246885276869, + "grad_norm": 4.954768657684326, + "learning_rate": 6.86024852754758e-05, + "loss": 0.43427104949951173, + "step": 73820 + }, + { + "epoch": 0.3169676206177069, + "grad_norm": 0.0031284948345273733, + "learning_rate": 6.859817355535818e-05, + "loss": 0.07026217579841613, + "step": 73830 + }, + { + "epoch": 0.31701055270772693, + "grad_norm": 0.07524044066667557, + "learning_rate": 6.859386183524056e-05, + "loss": 0.03691762983798981, + "step": 73840 + }, + { + "epoch": 0.3170534847977469, + "grad_norm": 0.018756557255983353, + "learning_rate": 6.858955011512293e-05, + "loss": 0.28873724937438966, + "step": 73850 + }, + { + "epoch": 0.3170964168877669, + "grad_norm": 0.36779269576072693, + "learning_rate": 6.85852383950053e-05, + "loss": 0.3000300884246826, + "step": 73860 + }, + { + "epoch": 0.31713934897778695, + "grad_norm": 2.9124794006347656, + "learning_rate": 6.858092667488767e-05, + "loss": 0.29564919471740725, + "step": 73870 + }, + { + "epoch": 0.31718228106780694, + "grad_norm": 1.56826651096344, + "learning_rate": 6.857661495477005e-05, + "loss": 0.11024869680404663, + "step": 73880 + }, + { + "epoch": 0.3172252131578269, + "grad_norm": 2.3062291145324707, + "learning_rate": 6.857230323465244e-05, + "loss": 0.26093406677246095, + "step": 73890 + }, + { + "epoch": 0.31726814524784697, + "grad_norm": 0.014564376324415207, + "learning_rate": 6.856799151453482e-05, + "loss": 0.5606659412384033, + "step": 73900 + }, + { + "epoch": 0.31731107733786695, + "grad_norm": 0.31643006205558777, + "learning_rate": 6.85636797944172e-05, + "loss": 0.3782673358917236, + "step": 73910 + }, + { + "epoch": 0.317354009427887, + "grad_norm": 2.9848568439483643, + "learning_rate": 6.855936807429957e-05, + "loss": 0.43987135887145995, + "step": 73920 + }, + { + "epoch": 0.317396941517907, + "grad_norm": 2.8447110652923584, + "learning_rate": 6.855505635418195e-05, + "loss": 0.4308504581451416, + "step": 73930 + }, + { + "epoch": 0.31743987360792697, + "grad_norm": 0.21271449327468872, + "learning_rate": 6.855074463406432e-05, + "loss": 0.31659440994262694, + "step": 73940 + }, + { + "epoch": 0.317482805697947, + "grad_norm": 0.08096141368150711, + "learning_rate": 6.854643291394669e-05, + "loss": 0.18132725954055787, + "step": 73950 + }, + { + "epoch": 0.317525737787967, + "grad_norm": 0.6696780323982239, + "learning_rate": 6.854212119382907e-05, + "loss": 0.05133354067802429, + "step": 73960 + }, + { + "epoch": 0.317568669877987, + "grad_norm": 1.4350579977035522, + "learning_rate": 6.853780947371145e-05, + "loss": 0.30485239028930666, + "step": 73970 + }, + { + "epoch": 0.31761160196800703, + "grad_norm": 2.1010446548461914, + "learning_rate": 6.853349775359382e-05, + "loss": 0.1809307336807251, + "step": 73980 + }, + { + "epoch": 0.317654534058027, + "grad_norm": 2.349207878112793, + "learning_rate": 6.85291860334762e-05, + "loss": 0.5290529251098632, + "step": 73990 + }, + { + "epoch": 0.317697466148047, + "grad_norm": 0.06218874827027321, + "learning_rate": 6.852487431335858e-05, + "loss": 0.13957602977752687, + "step": 74000 + }, + { + "epoch": 0.317697466148047, + "eval_loss": 0.4288139045238495, + "eval_runtime": 27.4541, + "eval_samples_per_second": 3.642, + "eval_steps_per_second": 3.642, + "step": 74000 + }, + { + "epoch": 0.31774039823806705, + "grad_norm": 22.351932525634766, + "learning_rate": 6.852056259324096e-05, + "loss": 0.08331415057182312, + "step": 74010 + }, + { + "epoch": 0.31778333032808703, + "grad_norm": 0.6534700989723206, + "learning_rate": 6.851625087312332e-05, + "loss": 0.21025574207305908, + "step": 74020 + }, + { + "epoch": 0.317826262418107, + "grad_norm": 2.180680990219116, + "learning_rate": 6.85119391530057e-05, + "loss": 0.2656111478805542, + "step": 74030 + }, + { + "epoch": 0.31786919450812706, + "grad_norm": 0.5378935933113098, + "learning_rate": 6.850762743288807e-05, + "loss": 0.0576973021030426, + "step": 74040 + }, + { + "epoch": 0.31791212659814705, + "grad_norm": 0.13711658120155334, + "learning_rate": 6.850331571277045e-05, + "loss": 0.15462011098861694, + "step": 74050 + }, + { + "epoch": 0.31795505868816704, + "grad_norm": 0.42226970195770264, + "learning_rate": 6.849900399265283e-05, + "loss": 0.36159141063690187, + "step": 74060 + }, + { + "epoch": 0.3179979907781871, + "grad_norm": 0.34076371788978577, + "learning_rate": 6.849469227253521e-05, + "loss": 0.06672542691230773, + "step": 74070 + }, + { + "epoch": 0.31804092286820707, + "grad_norm": 0.08435351401567459, + "learning_rate": 6.849038055241758e-05, + "loss": 0.2677072763442993, + "step": 74080 + }, + { + "epoch": 0.31808385495822705, + "grad_norm": 1.4236209392547607, + "learning_rate": 6.848606883229996e-05, + "loss": 0.21868863105773925, + "step": 74090 + }, + { + "epoch": 0.3181267870482471, + "grad_norm": 2.8784008026123047, + "learning_rate": 6.848175711218233e-05, + "loss": 0.21964583396911622, + "step": 74100 + }, + { + "epoch": 0.3181697191382671, + "grad_norm": 1.7891312837600708, + "learning_rate": 6.847744539206472e-05, + "loss": 0.23284506797790527, + "step": 74110 + }, + { + "epoch": 0.31821265122828707, + "grad_norm": 0.48343974351882935, + "learning_rate": 6.84731336719471e-05, + "loss": 0.16025389432907106, + "step": 74120 + }, + { + "epoch": 0.3182555833183071, + "grad_norm": 4.8825531005859375, + "learning_rate": 6.846882195182947e-05, + "loss": 0.2873832225799561, + "step": 74130 + }, + { + "epoch": 0.3182985154083271, + "grad_norm": 2.3623721599578857, + "learning_rate": 6.846451023171185e-05, + "loss": 0.14504092931747437, + "step": 74140 + }, + { + "epoch": 0.31834144749834714, + "grad_norm": 6.830047607421875, + "learning_rate": 6.846019851159423e-05, + "loss": 0.266190767288208, + "step": 74150 + }, + { + "epoch": 0.31838437958836713, + "grad_norm": 1.440556287765503, + "learning_rate": 6.84558867914766e-05, + "loss": 0.31242871284484863, + "step": 74160 + }, + { + "epoch": 0.3184273116783871, + "grad_norm": 1.8908369541168213, + "learning_rate": 6.845157507135898e-05, + "loss": 0.32432739734649657, + "step": 74170 + }, + { + "epoch": 0.31847024376840716, + "grad_norm": 0.2510699927806854, + "learning_rate": 6.844726335124136e-05, + "loss": 0.3110947847366333, + "step": 74180 + }, + { + "epoch": 0.31851317585842714, + "grad_norm": 0.12024804204702377, + "learning_rate": 6.844295163112372e-05, + "loss": 0.13613009452819824, + "step": 74190 + }, + { + "epoch": 0.31855610794844713, + "grad_norm": 1.5495117902755737, + "learning_rate": 6.84386399110061e-05, + "loss": 0.442257022857666, + "step": 74200 + }, + { + "epoch": 0.3185990400384672, + "grad_norm": 0.110983707010746, + "learning_rate": 6.843432819088848e-05, + "loss": 0.1462443709373474, + "step": 74210 + }, + { + "epoch": 0.31864197212848716, + "grad_norm": 3.793349504470825, + "learning_rate": 6.843001647077085e-05, + "loss": 0.2545795440673828, + "step": 74220 + }, + { + "epoch": 0.31868490421850715, + "grad_norm": 0.16117846965789795, + "learning_rate": 6.842570475065323e-05, + "loss": 0.15870895385742187, + "step": 74230 + }, + { + "epoch": 0.3187278363085272, + "grad_norm": 0.07271076738834381, + "learning_rate": 6.842139303053561e-05, + "loss": 0.12695436477661132, + "step": 74240 + }, + { + "epoch": 0.3187707683985472, + "grad_norm": 1.6894328594207764, + "learning_rate": 6.841708131041799e-05, + "loss": 0.18799281120300293, + "step": 74250 + }, + { + "epoch": 0.31881370048856716, + "grad_norm": 0.026538612321019173, + "learning_rate": 6.841276959030036e-05, + "loss": 0.25241885185241697, + "step": 74260 + }, + { + "epoch": 0.3188566325785872, + "grad_norm": 0.17398607730865479, + "learning_rate": 6.840845787018273e-05, + "loss": 0.12818098068237305, + "step": 74270 + }, + { + "epoch": 0.3188995646686072, + "grad_norm": 0.011165103875100613, + "learning_rate": 6.84041461500651e-05, + "loss": 0.005557307228446007, + "step": 74280 + }, + { + "epoch": 0.3189424967586272, + "grad_norm": 4.761146068572998, + "learning_rate": 6.839983442994748e-05, + "loss": 0.2648351192474365, + "step": 74290 + }, + { + "epoch": 0.3189854288486472, + "grad_norm": 0.0018934222171083093, + "learning_rate": 6.839552270982986e-05, + "loss": 0.24793648719787598, + "step": 74300 + }, + { + "epoch": 0.3190283609386672, + "grad_norm": 13.782504081726074, + "learning_rate": 6.839121098971224e-05, + "loss": 0.4354073524475098, + "step": 74310 + }, + { + "epoch": 0.3190712930286872, + "grad_norm": 5.460157871246338, + "learning_rate": 6.838689926959461e-05, + "loss": 0.2691751003265381, + "step": 74320 + }, + { + "epoch": 0.31911422511870724, + "grad_norm": 3.517686605453491, + "learning_rate": 6.838258754947699e-05, + "loss": 0.10529460906982421, + "step": 74330 + }, + { + "epoch": 0.3191571572087272, + "grad_norm": 0.017410220578312874, + "learning_rate": 6.837827582935937e-05, + "loss": 0.26190121173858644, + "step": 74340 + }, + { + "epoch": 0.31920008929874727, + "grad_norm": 0.043979302048683167, + "learning_rate": 6.837396410924175e-05, + "loss": 0.18496758937835694, + "step": 74350 + }, + { + "epoch": 0.31924302138876726, + "grad_norm": 0.17498759925365448, + "learning_rate": 6.836965238912412e-05, + "loss": 0.39980545043945315, + "step": 74360 + }, + { + "epoch": 0.31928595347878724, + "grad_norm": 0.07269234210252762, + "learning_rate": 6.83653406690065e-05, + "loss": 0.25225677490234377, + "step": 74370 + }, + { + "epoch": 0.3193288855688073, + "grad_norm": 1.516465425491333, + "learning_rate": 6.836102894888888e-05, + "loss": 0.09321829676628113, + "step": 74380 + }, + { + "epoch": 0.3193718176588273, + "grad_norm": 0.11955863237380981, + "learning_rate": 6.835671722877125e-05, + "loss": 0.054157298803329465, + "step": 74390 + }, + { + "epoch": 0.31941474974884726, + "grad_norm": 2.1415810585021973, + "learning_rate": 6.835240550865363e-05, + "loss": 0.19859232902526855, + "step": 74400 + }, + { + "epoch": 0.3194576818388673, + "grad_norm": 0.006748868618160486, + "learning_rate": 6.834809378853601e-05, + "loss": 0.34521052837371824, + "step": 74410 + }, + { + "epoch": 0.3195006139288873, + "grad_norm": 0.4633532166481018, + "learning_rate": 6.834378206841839e-05, + "loss": 0.2189234495162964, + "step": 74420 + }, + { + "epoch": 0.3195435460189073, + "grad_norm": 1.0470963716506958, + "learning_rate": 6.833947034830075e-05, + "loss": 0.5068300247192383, + "step": 74430 + }, + { + "epoch": 0.3195864781089273, + "grad_norm": 0.03167863190174103, + "learning_rate": 6.833515862818313e-05, + "loss": 0.06632805466651917, + "step": 74440 + }, + { + "epoch": 0.3196294101989473, + "grad_norm": 1.881988763809204, + "learning_rate": 6.83308469080655e-05, + "loss": 0.17728158235549926, + "step": 74450 + }, + { + "epoch": 0.3196723422889673, + "grad_norm": 0.2718351483345032, + "learning_rate": 6.832653518794788e-05, + "loss": 0.09898674488067627, + "step": 74460 + }, + { + "epoch": 0.31971527437898734, + "grad_norm": 0.0583646185696125, + "learning_rate": 6.832222346783026e-05, + "loss": 0.07836299538612365, + "step": 74470 + }, + { + "epoch": 0.3197582064690073, + "grad_norm": 0.01715688779950142, + "learning_rate": 6.831791174771264e-05, + "loss": 0.3526939392089844, + "step": 74480 + }, + { + "epoch": 0.3198011385590273, + "grad_norm": 0.007413911167532206, + "learning_rate": 6.831360002759501e-05, + "loss": 0.0661340057849884, + "step": 74490 + }, + { + "epoch": 0.31984407064904735, + "grad_norm": 0.28797730803489685, + "learning_rate": 6.830928830747739e-05, + "loss": 0.3759117603302002, + "step": 74500 + }, + { + "epoch": 0.31988700273906734, + "grad_norm": 4.893228054046631, + "learning_rate": 6.830497658735976e-05, + "loss": 0.26215925216674807, + "step": 74510 + }, + { + "epoch": 0.3199299348290873, + "grad_norm": 1.035056233406067, + "learning_rate": 6.830066486724213e-05, + "loss": 0.21864049434661864, + "step": 74520 + }, + { + "epoch": 0.31997286691910737, + "grad_norm": 2.0399975776672363, + "learning_rate": 6.829635314712451e-05, + "loss": 0.12469936609268188, + "step": 74530 + }, + { + "epoch": 0.32001579900912736, + "grad_norm": 0.0653163492679596, + "learning_rate": 6.829204142700689e-05, + "loss": 0.24412922859191893, + "step": 74540 + }, + { + "epoch": 0.32005873109914734, + "grad_norm": 0.26795297861099243, + "learning_rate": 6.828772970688927e-05, + "loss": 0.028508707880973816, + "step": 74550 + }, + { + "epoch": 0.3201016631891674, + "grad_norm": 0.033264756202697754, + "learning_rate": 6.828341798677164e-05, + "loss": 0.21181397438049315, + "step": 74560 + }, + { + "epoch": 0.32014459527918737, + "grad_norm": 0.015528388321399689, + "learning_rate": 6.827910626665402e-05, + "loss": 0.13729554414749146, + "step": 74570 + }, + { + "epoch": 0.3201875273692074, + "grad_norm": 0.007677197456359863, + "learning_rate": 6.82747945465364e-05, + "loss": 0.04282234907150269, + "step": 74580 + }, + { + "epoch": 0.3202304594592274, + "grad_norm": 0.007951623760163784, + "learning_rate": 6.827048282641877e-05, + "loss": 0.11191353797912598, + "step": 74590 + }, + { + "epoch": 0.3202733915492474, + "grad_norm": 0.7950018048286438, + "learning_rate": 6.826617110630115e-05, + "loss": 0.19527859687805177, + "step": 74600 + }, + { + "epoch": 0.32031632363926743, + "grad_norm": 0.007828062400221825, + "learning_rate": 6.826185938618353e-05, + "loss": 0.21099915504455566, + "step": 74610 + }, + { + "epoch": 0.3203592557292874, + "grad_norm": 1.7859470844268799, + "learning_rate": 6.82575476660659e-05, + "loss": 0.3485477685928345, + "step": 74620 + }, + { + "epoch": 0.3204021878193074, + "grad_norm": 0.11276675760746002, + "learning_rate": 6.825323594594828e-05, + "loss": 0.2156670331954956, + "step": 74630 + }, + { + "epoch": 0.32044511990932745, + "grad_norm": 0.005295217968523502, + "learning_rate": 6.824892422583066e-05, + "loss": 0.3430546760559082, + "step": 74640 + }, + { + "epoch": 0.32048805199934743, + "grad_norm": 0.006576932035386562, + "learning_rate": 6.824461250571304e-05, + "loss": 0.03515567183494568, + "step": 74650 + }, + { + "epoch": 0.3205309840893674, + "grad_norm": 3.4251933097839355, + "learning_rate": 6.824030078559542e-05, + "loss": 0.32103025913238525, + "step": 74660 + }, + { + "epoch": 0.32057391617938746, + "grad_norm": 0.833284854888916, + "learning_rate": 6.82359890654778e-05, + "loss": 0.10578702688217163, + "step": 74670 + }, + { + "epoch": 0.32061684826940745, + "grad_norm": 1.1916800737380981, + "learning_rate": 6.823167734536016e-05, + "loss": 0.5401986598968506, + "step": 74680 + }, + { + "epoch": 0.32065978035942744, + "grad_norm": 12.759496688842773, + "learning_rate": 6.822736562524253e-05, + "loss": 0.1452803134918213, + "step": 74690 + }, + { + "epoch": 0.3207027124494475, + "grad_norm": 0.20818258821964264, + "learning_rate": 6.822305390512491e-05, + "loss": 0.08625043630599975, + "step": 74700 + }, + { + "epoch": 0.32074564453946747, + "grad_norm": 0.002943378174677491, + "learning_rate": 6.821874218500729e-05, + "loss": 0.16818757057189943, + "step": 74710 + }, + { + "epoch": 0.32078857662948745, + "grad_norm": 0.009216200560331345, + "learning_rate": 6.821443046488967e-05, + "loss": 0.09053115248680114, + "step": 74720 + }, + { + "epoch": 0.3208315087195075, + "grad_norm": 1.6070560216903687, + "learning_rate": 6.821011874477204e-05, + "loss": 0.27480525970458985, + "step": 74730 + }, + { + "epoch": 0.3208744408095275, + "grad_norm": 0.4009722173213959, + "learning_rate": 6.820580702465442e-05, + "loss": 0.36229352951049804, + "step": 74740 + }, + { + "epoch": 0.32091737289954747, + "grad_norm": 0.006991582922637463, + "learning_rate": 6.82014953045368e-05, + "loss": 0.21590156555175782, + "step": 74750 + }, + { + "epoch": 0.3209603049895675, + "grad_norm": 3.9237425327301025, + "learning_rate": 6.819718358441916e-05, + "loss": 0.26285347938537595, + "step": 74760 + }, + { + "epoch": 0.3210032370795875, + "grad_norm": 0.0030905550811439753, + "learning_rate": 6.819287186430154e-05, + "loss": 0.20134003162384034, + "step": 74770 + }, + { + "epoch": 0.32104616916960754, + "grad_norm": 24.15610694885254, + "learning_rate": 6.818856014418392e-05, + "loss": 0.1851871132850647, + "step": 74780 + }, + { + "epoch": 0.32108910125962753, + "grad_norm": 2.26943039894104, + "learning_rate": 6.81842484240663e-05, + "loss": 0.17499132156372071, + "step": 74790 + }, + { + "epoch": 0.3211320333496475, + "grad_norm": 0.013470095582306385, + "learning_rate": 6.817993670394867e-05, + "loss": 0.24905192852020264, + "step": 74800 + }, + { + "epoch": 0.32117496543966756, + "grad_norm": 4.934391975402832, + "learning_rate": 6.817562498383105e-05, + "loss": 0.3353193521499634, + "step": 74810 + }, + { + "epoch": 0.32121789752968755, + "grad_norm": 1.404799222946167, + "learning_rate": 6.817131326371343e-05, + "loss": 0.20187389850616455, + "step": 74820 + }, + { + "epoch": 0.32126082961970753, + "grad_norm": 0.2565377652645111, + "learning_rate": 6.81670015435958e-05, + "loss": 0.03901310861110687, + "step": 74830 + }, + { + "epoch": 0.3213037617097276, + "grad_norm": 0.18820512294769287, + "learning_rate": 6.816268982347818e-05, + "loss": 0.07977944612503052, + "step": 74840 + }, + { + "epoch": 0.32134669379974756, + "grad_norm": 4.582075595855713, + "learning_rate": 6.815837810336056e-05, + "loss": 0.23041229248046874, + "step": 74850 + }, + { + "epoch": 0.32138962588976755, + "grad_norm": 0.007188515271991491, + "learning_rate": 6.815406638324294e-05, + "loss": 0.2832122564315796, + "step": 74860 + }, + { + "epoch": 0.3214325579797876, + "grad_norm": 1.4931919574737549, + "learning_rate": 6.814975466312531e-05, + "loss": 0.43709635734558105, + "step": 74870 + }, + { + "epoch": 0.3214754900698076, + "grad_norm": 1.0869759321212769, + "learning_rate": 6.814544294300769e-05, + "loss": 0.24327874183654785, + "step": 74880 + }, + { + "epoch": 0.32151842215982757, + "grad_norm": 1.867722988128662, + "learning_rate": 6.814113122289007e-05, + "loss": 0.23782784938812257, + "step": 74890 + }, + { + "epoch": 0.3215613542498476, + "grad_norm": 0.01522792037576437, + "learning_rate": 6.813681950277245e-05, + "loss": 0.28627400398254393, + "step": 74900 + }, + { + "epoch": 0.3216042863398676, + "grad_norm": 0.21601015329360962, + "learning_rate": 6.813250778265482e-05, + "loss": 0.3485018253326416, + "step": 74910 + }, + { + "epoch": 0.3216472184298876, + "grad_norm": 0.009931655600667, + "learning_rate": 6.81281960625372e-05, + "loss": 0.37991507053375245, + "step": 74920 + }, + { + "epoch": 0.3216901505199076, + "grad_norm": 0.8376566767692566, + "learning_rate": 6.812388434241956e-05, + "loss": 0.20397210121154785, + "step": 74930 + }, + { + "epoch": 0.3217330826099276, + "grad_norm": 0.018569491803646088, + "learning_rate": 6.811957262230194e-05, + "loss": 0.29088473320007324, + "step": 74940 + }, + { + "epoch": 0.3217760146999476, + "grad_norm": 0.08200386166572571, + "learning_rate": 6.811526090218432e-05, + "loss": 0.04362513422966004, + "step": 74950 + }, + { + "epoch": 0.32181894678996764, + "grad_norm": 1.1255528926849365, + "learning_rate": 6.81109491820667e-05, + "loss": 0.06557718515396119, + "step": 74960 + }, + { + "epoch": 0.32186187887998763, + "grad_norm": 0.5678053498268127, + "learning_rate": 6.810663746194907e-05, + "loss": 0.2532555818557739, + "step": 74970 + }, + { + "epoch": 0.3219048109700076, + "grad_norm": 0.052461788058280945, + "learning_rate": 6.810232574183145e-05, + "loss": 0.26459851264953616, + "step": 74980 + }, + { + "epoch": 0.32194774306002766, + "grad_norm": 0.7850907444953918, + "learning_rate": 6.809801402171383e-05, + "loss": 0.1933504819869995, + "step": 74990 + }, + { + "epoch": 0.32199067515004764, + "grad_norm": 0.07535672187805176, + "learning_rate": 6.80937023015962e-05, + "loss": 0.19619510173797608, + "step": 75000 + }, + { + "epoch": 0.32199067515004764, + "eval_loss": 0.42375648021698, + "eval_runtime": 27.4204, + "eval_samples_per_second": 3.647, + "eval_steps_per_second": 3.647, + "step": 75000 + }, + { + "epoch": 0.3220336072400677, + "grad_norm": 0.07737399637699127, + "learning_rate": 6.808939058147857e-05, + "loss": 0.24549593925476074, + "step": 75010 + }, + { + "epoch": 0.3220765393300877, + "grad_norm": 5.281481742858887, + "learning_rate": 6.808507886136095e-05, + "loss": 0.16492899656295776, + "step": 75020 + }, + { + "epoch": 0.32211947142010766, + "grad_norm": 1.1363037824630737, + "learning_rate": 6.808076714124332e-05, + "loss": 0.23363871574401857, + "step": 75030 + }, + { + "epoch": 0.3221624035101277, + "grad_norm": 0.8865850567817688, + "learning_rate": 6.80764554211257e-05, + "loss": 0.04637100994586944, + "step": 75040 + }, + { + "epoch": 0.3222053356001477, + "grad_norm": 0.05453188344836235, + "learning_rate": 6.807214370100808e-05, + "loss": 0.28700239658355714, + "step": 75050 + }, + { + "epoch": 0.3222482676901677, + "grad_norm": 0.06403063237667084, + "learning_rate": 6.806783198089046e-05, + "loss": 0.29975502490997313, + "step": 75060 + }, + { + "epoch": 0.3222911997801877, + "grad_norm": 0.08798815310001373, + "learning_rate": 6.806352026077283e-05, + "loss": 0.15733823776245118, + "step": 75070 + }, + { + "epoch": 0.3223341318702077, + "grad_norm": 1.5155143737792969, + "learning_rate": 6.805920854065522e-05, + "loss": 0.21812820434570312, + "step": 75080 + }, + { + "epoch": 0.3223770639602277, + "grad_norm": 0.12022317945957184, + "learning_rate": 6.805489682053759e-05, + "loss": 0.077181476354599, + "step": 75090 + }, + { + "epoch": 0.32241999605024774, + "grad_norm": 1.108117938041687, + "learning_rate": 6.805058510041996e-05, + "loss": 0.17434661388397216, + "step": 75100 + }, + { + "epoch": 0.3224629281402677, + "grad_norm": 0.009858007542788982, + "learning_rate": 6.804627338030234e-05, + "loss": 0.3324748039245605, + "step": 75110 + }, + { + "epoch": 0.3225058602302877, + "grad_norm": 0.001360285677947104, + "learning_rate": 6.804196166018472e-05, + "loss": 0.20198981761932372, + "step": 75120 + }, + { + "epoch": 0.32254879232030775, + "grad_norm": 0.0015368229942396283, + "learning_rate": 6.80376499400671e-05, + "loss": 0.18393850326538086, + "step": 75130 + }, + { + "epoch": 0.32259172441032774, + "grad_norm": 0.01634756661951542, + "learning_rate": 6.803333821994947e-05, + "loss": 0.19422999620437623, + "step": 75140 + }, + { + "epoch": 0.3226346565003477, + "grad_norm": 27.322790145874023, + "learning_rate": 6.802902649983185e-05, + "loss": 0.26277408599853513, + "step": 75150 + }, + { + "epoch": 0.32267758859036777, + "grad_norm": 0.08929922431707382, + "learning_rate": 6.802471477971423e-05, + "loss": 0.30891809463500974, + "step": 75160 + }, + { + "epoch": 0.32272052068038776, + "grad_norm": 0.030177028849720955, + "learning_rate": 6.802040305959659e-05, + "loss": 0.4126291275024414, + "step": 75170 + }, + { + "epoch": 0.32276345277040774, + "grad_norm": 0.8247568011283875, + "learning_rate": 6.801609133947897e-05, + "loss": 0.09560133814811707, + "step": 75180 + }, + { + "epoch": 0.3228063848604278, + "grad_norm": 0.13243091106414795, + "learning_rate": 6.801177961936135e-05, + "loss": 0.2584502696990967, + "step": 75190 + }, + { + "epoch": 0.3228493169504478, + "grad_norm": 0.01154093537479639, + "learning_rate": 6.800746789924372e-05, + "loss": 0.14872738122940063, + "step": 75200 + }, + { + "epoch": 0.32289224904046776, + "grad_norm": 0.01844400353729725, + "learning_rate": 6.80031561791261e-05, + "loss": 0.2512629747390747, + "step": 75210 + }, + { + "epoch": 0.3229351811304878, + "grad_norm": 4.3735833168029785, + "learning_rate": 6.799884445900848e-05, + "loss": 0.08456424474716187, + "step": 75220 + }, + { + "epoch": 0.3229781132205078, + "grad_norm": 3.748575210571289, + "learning_rate": 6.799453273889086e-05, + "loss": 0.4353503227233887, + "step": 75230 + }, + { + "epoch": 0.32302104531052783, + "grad_norm": 0.2333959937095642, + "learning_rate": 6.799022101877323e-05, + "loss": 0.08659371733665466, + "step": 75240 + }, + { + "epoch": 0.3230639774005478, + "grad_norm": 1.1548364162445068, + "learning_rate": 6.79859092986556e-05, + "loss": 0.29887216091156005, + "step": 75250 + }, + { + "epoch": 0.3231069094905678, + "grad_norm": 0.009117556735873222, + "learning_rate": 6.798159757853798e-05, + "loss": 0.155653178691864, + "step": 75260 + }, + { + "epoch": 0.32314984158058785, + "grad_norm": 0.19610357284545898, + "learning_rate": 6.797728585842035e-05, + "loss": 0.22091832160949706, + "step": 75270 + }, + { + "epoch": 0.32319277367060784, + "grad_norm": 0.062415748834609985, + "learning_rate": 6.797297413830273e-05, + "loss": 0.44664454460144043, + "step": 75280 + }, + { + "epoch": 0.3232357057606278, + "grad_norm": 0.06516305357217789, + "learning_rate": 6.796866241818511e-05, + "loss": 0.22703559398651124, + "step": 75290 + }, + { + "epoch": 0.32327863785064787, + "grad_norm": 0.10712216049432755, + "learning_rate": 6.79643506980675e-05, + "loss": 0.1619246482849121, + "step": 75300 + }, + { + "epoch": 0.32332156994066785, + "grad_norm": 0.02630920521914959, + "learning_rate": 6.796003897794988e-05, + "loss": 0.17304229736328125, + "step": 75310 + }, + { + "epoch": 0.32336450203068784, + "grad_norm": 0.2714967131614685, + "learning_rate": 6.795572725783225e-05, + "loss": 0.1583251476287842, + "step": 75320 + }, + { + "epoch": 0.3234074341207079, + "grad_norm": 6.2658796310424805, + "learning_rate": 6.795141553771463e-05, + "loss": 0.23793091773986816, + "step": 75330 + }, + { + "epoch": 0.32345036621072787, + "grad_norm": 0.015765508636832237, + "learning_rate": 6.7947103817597e-05, + "loss": 0.34331545829772947, + "step": 75340 + }, + { + "epoch": 0.32349329830074786, + "grad_norm": 1.6817232370376587, + "learning_rate": 6.794279209747937e-05, + "loss": 0.3233363151550293, + "step": 75350 + }, + { + "epoch": 0.3235362303907679, + "grad_norm": 0.00951626431196928, + "learning_rate": 6.793848037736175e-05, + "loss": 0.11996997594833374, + "step": 75360 + }, + { + "epoch": 0.3235791624807879, + "grad_norm": 0.21259069442749023, + "learning_rate": 6.793416865724413e-05, + "loss": 0.2515986442565918, + "step": 75370 + }, + { + "epoch": 0.32362209457080787, + "grad_norm": 0.036104682832956314, + "learning_rate": 6.79298569371265e-05, + "loss": 0.3075053930282593, + "step": 75380 + }, + { + "epoch": 0.3236650266608279, + "grad_norm": 0.05259181931614876, + "learning_rate": 6.792554521700888e-05, + "loss": 0.29380111694335936, + "step": 75390 + }, + { + "epoch": 0.3237079587508479, + "grad_norm": 0.09340565651655197, + "learning_rate": 6.792123349689126e-05, + "loss": 0.23864850997924805, + "step": 75400 + }, + { + "epoch": 0.3237508908408679, + "grad_norm": 0.0508541613817215, + "learning_rate": 6.791692177677364e-05, + "loss": 0.26605610847473143, + "step": 75410 + }, + { + "epoch": 0.32379382293088793, + "grad_norm": 0.4147844910621643, + "learning_rate": 6.7912610056656e-05, + "loss": 0.1429394006729126, + "step": 75420 + }, + { + "epoch": 0.3238367550209079, + "grad_norm": 2.3247790336608887, + "learning_rate": 6.790829833653838e-05, + "loss": 0.41593217849731445, + "step": 75430 + }, + { + "epoch": 0.32387968711092796, + "grad_norm": 0.4674864113330841, + "learning_rate": 6.790398661642075e-05, + "loss": 0.14860082864761354, + "step": 75440 + }, + { + "epoch": 0.32392261920094795, + "grad_norm": 2.115438938140869, + "learning_rate": 6.789967489630313e-05, + "loss": 0.21622676849365235, + "step": 75450 + }, + { + "epoch": 0.32396555129096793, + "grad_norm": 1.429050087928772, + "learning_rate": 6.789536317618551e-05, + "loss": 0.3115101337432861, + "step": 75460 + }, + { + "epoch": 0.324008483380988, + "grad_norm": 0.019108422100543976, + "learning_rate": 6.789105145606789e-05, + "loss": 0.29048006534576415, + "step": 75470 + }, + { + "epoch": 0.32405141547100796, + "grad_norm": 89.55338287353516, + "learning_rate": 6.788673973595026e-05, + "loss": 0.274165678024292, + "step": 75480 + }, + { + "epoch": 0.32409434756102795, + "grad_norm": 0.29647406935691833, + "learning_rate": 6.788242801583264e-05, + "loss": 0.23708763122558593, + "step": 75490 + }, + { + "epoch": 0.324137279651048, + "grad_norm": 2.5495612621307373, + "learning_rate": 6.7878116295715e-05, + "loss": 0.27585015296936033, + "step": 75500 + }, + { + "epoch": 0.324180211741068, + "grad_norm": 4.890408515930176, + "learning_rate": 6.787380457559738e-05, + "loss": 0.2675506830215454, + "step": 75510 + }, + { + "epoch": 0.32422314383108797, + "grad_norm": 5.719459056854248, + "learning_rate": 6.786949285547977e-05, + "loss": 0.25751235485076907, + "step": 75520 + }, + { + "epoch": 0.324266075921108, + "grad_norm": 0.04946063086390495, + "learning_rate": 6.786518113536215e-05, + "loss": 0.2648316860198975, + "step": 75530 + }, + { + "epoch": 0.324309008011128, + "grad_norm": 2.4120781421661377, + "learning_rate": 6.786086941524453e-05, + "loss": 0.16416383981704713, + "step": 75540 + }, + { + "epoch": 0.324351940101148, + "grad_norm": 2.5934016704559326, + "learning_rate": 6.78565576951269e-05, + "loss": 0.3151400566101074, + "step": 75550 + }, + { + "epoch": 0.324394872191168, + "grad_norm": 0.235334113240242, + "learning_rate": 6.785224597500928e-05, + "loss": 0.29945969581604004, + "step": 75560 + }, + { + "epoch": 0.324437804281188, + "grad_norm": 0.02608591318130493, + "learning_rate": 6.784793425489166e-05, + "loss": 0.1716364622116089, + "step": 75570 + }, + { + "epoch": 0.324480736371208, + "grad_norm": 6.218681335449219, + "learning_rate": 6.784362253477402e-05, + "loss": 0.20043642520904542, + "step": 75580 + }, + { + "epoch": 0.32452366846122804, + "grad_norm": 4.614294528961182, + "learning_rate": 6.78393108146564e-05, + "loss": 0.21403853893280028, + "step": 75590 + }, + { + "epoch": 0.32456660055124803, + "grad_norm": 0.23961031436920166, + "learning_rate": 6.783499909453878e-05, + "loss": 0.28221883773803713, + "step": 75600 + }, + { + "epoch": 0.324609532641268, + "grad_norm": 0.9346398115158081, + "learning_rate": 6.783068737442116e-05, + "loss": 0.3928943395614624, + "step": 75610 + }, + { + "epoch": 0.32465246473128806, + "grad_norm": 3.2818591594696045, + "learning_rate": 6.782637565430353e-05, + "loss": 0.27822265625, + "step": 75620 + }, + { + "epoch": 0.32469539682130805, + "grad_norm": 14.885111808776855, + "learning_rate": 6.782206393418591e-05, + "loss": 0.18708184957504273, + "step": 75630 + }, + { + "epoch": 0.32473832891132803, + "grad_norm": 0.09145841747522354, + "learning_rate": 6.781775221406829e-05, + "loss": 0.17132962942123414, + "step": 75640 + }, + { + "epoch": 0.3247812610013481, + "grad_norm": 0.312010258436203, + "learning_rate": 6.781344049395066e-05, + "loss": 0.22032701969146729, + "step": 75650 + }, + { + "epoch": 0.32482419309136806, + "grad_norm": 0.20743966102600098, + "learning_rate": 6.780912877383304e-05, + "loss": 0.32837064266204835, + "step": 75660 + }, + { + "epoch": 0.3248671251813881, + "grad_norm": 4.621267318725586, + "learning_rate": 6.78048170537154e-05, + "loss": 0.2122286558151245, + "step": 75670 + }, + { + "epoch": 0.3249100572714081, + "grad_norm": 0.028458695858716965, + "learning_rate": 6.780050533359778e-05, + "loss": 0.09199699759483337, + "step": 75680 + }, + { + "epoch": 0.3249529893614281, + "grad_norm": 0.6898798942565918, + "learning_rate": 6.779619361348016e-05, + "loss": 0.31503407955169677, + "step": 75690 + }, + { + "epoch": 0.3249959214514481, + "grad_norm": 1.5483331680297852, + "learning_rate": 6.779188189336254e-05, + "loss": 0.2990516901016235, + "step": 75700 + }, + { + "epoch": 0.3250388535414681, + "grad_norm": 1.5893640518188477, + "learning_rate": 6.778757017324492e-05, + "loss": 0.3719264268875122, + "step": 75710 + }, + { + "epoch": 0.3250817856314881, + "grad_norm": 0.2527483105659485, + "learning_rate": 6.778325845312729e-05, + "loss": 0.08039749264717103, + "step": 75720 + }, + { + "epoch": 0.32512471772150814, + "grad_norm": 0.07131931930780411, + "learning_rate": 6.777894673300967e-05, + "loss": 0.17600921392440796, + "step": 75730 + }, + { + "epoch": 0.3251676498115281, + "grad_norm": 0.007941215299069881, + "learning_rate": 6.777463501289205e-05, + "loss": 0.4083552360534668, + "step": 75740 + }, + { + "epoch": 0.3252105819015481, + "grad_norm": 28.28816795349121, + "learning_rate": 6.777032329277442e-05, + "loss": 0.1733398914337158, + "step": 75750 + }, + { + "epoch": 0.32525351399156816, + "grad_norm": 2.974475622177124, + "learning_rate": 6.77660115726568e-05, + "loss": 0.3968287229537964, + "step": 75760 + }, + { + "epoch": 0.32529644608158814, + "grad_norm": 5.3154072761535645, + "learning_rate": 6.776169985253918e-05, + "loss": 0.28736727237701415, + "step": 75770 + }, + { + "epoch": 0.32533937817160813, + "grad_norm": 0.040719084441661835, + "learning_rate": 6.775738813242156e-05, + "loss": 0.11909763813018799, + "step": 75780 + }, + { + "epoch": 0.32538231026162817, + "grad_norm": 2.0676820278167725, + "learning_rate": 6.775307641230393e-05, + "loss": 0.39493000507354736, + "step": 75790 + }, + { + "epoch": 0.32542524235164816, + "grad_norm": 0.23623089492321014, + "learning_rate": 6.774876469218631e-05, + "loss": 0.12532161474227904, + "step": 75800 + }, + { + "epoch": 0.32546817444166815, + "grad_norm": 3.196317672729492, + "learning_rate": 6.774445297206869e-05, + "loss": 0.4569605827331543, + "step": 75810 + }, + { + "epoch": 0.3255111065316882, + "grad_norm": 0.234226793050766, + "learning_rate": 6.774014125195107e-05, + "loss": 0.2106410503387451, + "step": 75820 + }, + { + "epoch": 0.3255540386217082, + "grad_norm": 3.3291547298431396, + "learning_rate": 6.773582953183343e-05, + "loss": 0.17373330593109132, + "step": 75830 + }, + { + "epoch": 0.32559697071172816, + "grad_norm": 6.981594085693359, + "learning_rate": 6.773151781171581e-05, + "loss": 0.32075550556182864, + "step": 75840 + }, + { + "epoch": 0.3256399028017482, + "grad_norm": 0.0836896300315857, + "learning_rate": 6.772720609159818e-05, + "loss": 0.17245697975158691, + "step": 75850 + }, + { + "epoch": 0.3256828348917682, + "grad_norm": 0.007298530079424381, + "learning_rate": 6.772289437148056e-05, + "loss": 0.4353139877319336, + "step": 75860 + }, + { + "epoch": 0.32572576698178823, + "grad_norm": 1.379041314125061, + "learning_rate": 6.771858265136294e-05, + "loss": 0.22218732833862304, + "step": 75870 + }, + { + "epoch": 0.3257686990718082, + "grad_norm": 0.578391969203949, + "learning_rate": 6.771427093124532e-05, + "loss": 0.18814339637756347, + "step": 75880 + }, + { + "epoch": 0.3258116311618282, + "grad_norm": 0.09199251979589462, + "learning_rate": 6.77099592111277e-05, + "loss": 0.38448591232299806, + "step": 75890 + }, + { + "epoch": 0.32585456325184825, + "grad_norm": 0.03338582068681717, + "learning_rate": 6.770564749101007e-05, + "loss": 0.21661453247070311, + "step": 75900 + }, + { + "epoch": 0.32589749534186824, + "grad_norm": 1.5796095132827759, + "learning_rate": 6.770133577089243e-05, + "loss": 0.4152249336242676, + "step": 75910 + }, + { + "epoch": 0.3259404274318882, + "grad_norm": 1.3694597482681274, + "learning_rate": 6.769702405077481e-05, + "loss": 0.3295388460159302, + "step": 75920 + }, + { + "epoch": 0.32598335952190827, + "grad_norm": 2.6144042015075684, + "learning_rate": 6.769271233065719e-05, + "loss": 0.43881473541259763, + "step": 75930 + }, + { + "epoch": 0.32602629161192825, + "grad_norm": 0.142476424574852, + "learning_rate": 6.768840061053957e-05, + "loss": 0.17999660968780518, + "step": 75940 + }, + { + "epoch": 0.32606922370194824, + "grad_norm": 1.8619637489318848, + "learning_rate": 6.768408889042194e-05, + "loss": 0.2617930889129639, + "step": 75950 + }, + { + "epoch": 0.3261121557919683, + "grad_norm": 0.06892193853855133, + "learning_rate": 6.767977717030432e-05, + "loss": 0.15514886379241943, + "step": 75960 + }, + { + "epoch": 0.32615508788198827, + "grad_norm": 0.0553896464407444, + "learning_rate": 6.76754654501867e-05, + "loss": 0.2816772937774658, + "step": 75970 + }, + { + "epoch": 0.32619801997200826, + "grad_norm": 4.053797721862793, + "learning_rate": 6.767115373006908e-05, + "loss": 0.0834043264389038, + "step": 75980 + }, + { + "epoch": 0.3262409520620283, + "grad_norm": 1.032049536705017, + "learning_rate": 6.766684200995145e-05, + "loss": 0.3517958641052246, + "step": 75990 + }, + { + "epoch": 0.3262838841520483, + "grad_norm": 0.01598850078880787, + "learning_rate": 6.766253028983383e-05, + "loss": 0.18671306371688842, + "step": 76000 + }, + { + "epoch": 0.3262838841520483, + "eval_loss": 0.4156621992588043, + "eval_runtime": 27.4602, + "eval_samples_per_second": 3.642, + "eval_steps_per_second": 3.642, + "step": 76000 + }, + { + "epoch": 0.3263268162420683, + "grad_norm": 0.04511842504143715, + "learning_rate": 6.765821856971621e-05, + "loss": 0.3244593381881714, + "step": 76010 + }, + { + "epoch": 0.3263697483320883, + "grad_norm": 2.1493868827819824, + "learning_rate": 6.765390684959859e-05, + "loss": 0.09227357506752014, + "step": 76020 + }, + { + "epoch": 0.3264126804221083, + "grad_norm": 0.003547315252944827, + "learning_rate": 6.764959512948096e-05, + "loss": 0.23017759323120118, + "step": 76030 + }, + { + "epoch": 0.3264556125121283, + "grad_norm": 0.0195333119481802, + "learning_rate": 6.764528340936334e-05, + "loss": 0.2469775676727295, + "step": 76040 + }, + { + "epoch": 0.32649854460214833, + "grad_norm": 0.05551581457257271, + "learning_rate": 6.764097168924572e-05, + "loss": 0.2187352418899536, + "step": 76050 + }, + { + "epoch": 0.3265414766921683, + "grad_norm": 0.043517522513866425, + "learning_rate": 6.76366599691281e-05, + "loss": 0.20890164375305176, + "step": 76060 + }, + { + "epoch": 0.3265844087821883, + "grad_norm": 1.1082757711410522, + "learning_rate": 6.763234824901047e-05, + "loss": 0.5428919792175293, + "step": 76070 + }, + { + "epoch": 0.32662734087220835, + "grad_norm": 0.612888514995575, + "learning_rate": 6.762803652889284e-05, + "loss": 0.22457678318023683, + "step": 76080 + }, + { + "epoch": 0.32667027296222834, + "grad_norm": 0.03663385286927223, + "learning_rate": 6.762372480877521e-05, + "loss": 0.020015633106231688, + "step": 76090 + }, + { + "epoch": 0.3267132050522484, + "grad_norm": 0.04500538483262062, + "learning_rate": 6.761941308865759e-05, + "loss": 0.16043226718902587, + "step": 76100 + }, + { + "epoch": 0.32675613714226837, + "grad_norm": 0.23189817368984222, + "learning_rate": 6.761510136853997e-05, + "loss": 0.16690292358398437, + "step": 76110 + }, + { + "epoch": 0.32679906923228835, + "grad_norm": 17.376708984375, + "learning_rate": 6.761078964842235e-05, + "loss": 0.391109824180603, + "step": 76120 + }, + { + "epoch": 0.3268420013223084, + "grad_norm": 10.385045051574707, + "learning_rate": 6.760647792830472e-05, + "loss": 0.4553534507751465, + "step": 76130 + }, + { + "epoch": 0.3268849334123284, + "grad_norm": 0.05477939546108246, + "learning_rate": 6.76021662081871e-05, + "loss": 0.11088311672210693, + "step": 76140 + }, + { + "epoch": 0.32692786550234837, + "grad_norm": 0.12639902532100677, + "learning_rate": 6.759785448806948e-05, + "loss": 0.20579469203948975, + "step": 76150 + }, + { + "epoch": 0.3269707975923684, + "grad_norm": 0.39953383803367615, + "learning_rate": 6.759354276795184e-05, + "loss": 0.44393744468688967, + "step": 76160 + }, + { + "epoch": 0.3270137296823884, + "grad_norm": 0.9820276498794556, + "learning_rate": 6.758923104783422e-05, + "loss": 0.1274343490600586, + "step": 76170 + }, + { + "epoch": 0.3270566617724084, + "grad_norm": 0.04637445881962776, + "learning_rate": 6.75849193277166e-05, + "loss": 0.11465686559677124, + "step": 76180 + }, + { + "epoch": 0.32709959386242843, + "grad_norm": 1.4160728454589844, + "learning_rate": 6.758060760759897e-05, + "loss": 0.24932937622070311, + "step": 76190 + }, + { + "epoch": 0.3271425259524484, + "grad_norm": 0.046061545610427856, + "learning_rate": 6.757629588748135e-05, + "loss": 0.16362361907958983, + "step": 76200 + }, + { + "epoch": 0.3271854580424684, + "grad_norm": 0.05247507616877556, + "learning_rate": 6.757198416736373e-05, + "loss": 0.27802603244781493, + "step": 76210 + }, + { + "epoch": 0.32722839013248844, + "grad_norm": 4.0940022468566895, + "learning_rate": 6.75676724472461e-05, + "loss": 0.43665170669555664, + "step": 76220 + }, + { + "epoch": 0.32727132222250843, + "grad_norm": 1.1918656826019287, + "learning_rate": 6.756336072712848e-05, + "loss": 0.25766515731811523, + "step": 76230 + }, + { + "epoch": 0.3273142543125284, + "grad_norm": 1.1751809120178223, + "learning_rate": 6.755904900701086e-05, + "loss": 0.2382516622543335, + "step": 76240 + }, + { + "epoch": 0.32735718640254846, + "grad_norm": 0.05170591548085213, + "learning_rate": 6.755473728689324e-05, + "loss": 0.12441542148590087, + "step": 76250 + }, + { + "epoch": 0.32740011849256845, + "grad_norm": 0.06157025322318077, + "learning_rate": 6.755042556677561e-05, + "loss": 0.1379122853279114, + "step": 76260 + }, + { + "epoch": 0.32744305058258844, + "grad_norm": 0.013994595035910606, + "learning_rate": 6.754611384665799e-05, + "loss": 0.1110068678855896, + "step": 76270 + }, + { + "epoch": 0.3274859826726085, + "grad_norm": 0.002745084697380662, + "learning_rate": 6.754180212654037e-05, + "loss": 0.1953068971633911, + "step": 76280 + }, + { + "epoch": 0.32752891476262846, + "grad_norm": 0.03550855815410614, + "learning_rate": 6.753749040642275e-05, + "loss": 0.3258711338043213, + "step": 76290 + }, + { + "epoch": 0.3275718468526485, + "grad_norm": 2.1965603828430176, + "learning_rate": 6.753317868630512e-05, + "loss": 0.4237357616424561, + "step": 76300 + }, + { + "epoch": 0.3276147789426685, + "grad_norm": 1.426078200340271, + "learning_rate": 6.75288669661875e-05, + "loss": 0.2333233594894409, + "step": 76310 + }, + { + "epoch": 0.3276577110326885, + "grad_norm": 1.203570008277893, + "learning_rate": 6.752455524606987e-05, + "loss": 0.10217641592025757, + "step": 76320 + }, + { + "epoch": 0.3277006431227085, + "grad_norm": 0.2727987766265869, + "learning_rate": 6.752024352595224e-05, + "loss": 0.2010711431503296, + "step": 76330 + }, + { + "epoch": 0.3277435752127285, + "grad_norm": 0.03920082747936249, + "learning_rate": 6.751593180583462e-05, + "loss": 0.33216466903686526, + "step": 76340 + }, + { + "epoch": 0.3277865073027485, + "grad_norm": 0.2013687789440155, + "learning_rate": 6.7511620085717e-05, + "loss": 0.13307746648788452, + "step": 76350 + }, + { + "epoch": 0.32782943939276854, + "grad_norm": 0.030201489105820656, + "learning_rate": 6.750730836559937e-05, + "loss": 0.12909989356994628, + "step": 76360 + }, + { + "epoch": 0.3278723714827885, + "grad_norm": 0.027242738753557205, + "learning_rate": 6.750299664548175e-05, + "loss": 0.25510106086730955, + "step": 76370 + }, + { + "epoch": 0.3279153035728085, + "grad_norm": 0.35596781969070435, + "learning_rate": 6.749868492536413e-05, + "loss": 0.4509886264801025, + "step": 76380 + }, + { + "epoch": 0.32795823566282856, + "grad_norm": 0.3249008357524872, + "learning_rate": 6.74943732052465e-05, + "loss": 0.16739230155944823, + "step": 76390 + }, + { + "epoch": 0.32800116775284854, + "grad_norm": 0.23222234845161438, + "learning_rate": 6.749006148512888e-05, + "loss": 0.16387200355529785, + "step": 76400 + }, + { + "epoch": 0.32804409984286853, + "grad_norm": 0.080628402531147, + "learning_rate": 6.748574976501125e-05, + "loss": 0.26126348972320557, + "step": 76410 + }, + { + "epoch": 0.3280870319328886, + "grad_norm": 0.04311169683933258, + "learning_rate": 6.748143804489363e-05, + "loss": 0.1209375500679016, + "step": 76420 + }, + { + "epoch": 0.32812996402290856, + "grad_norm": 3.4792978763580322, + "learning_rate": 6.7477126324776e-05, + "loss": 0.1685411214828491, + "step": 76430 + }, + { + "epoch": 0.32817289611292855, + "grad_norm": 0.004967233166098595, + "learning_rate": 6.747281460465838e-05, + "loss": 0.23701400756835939, + "step": 76440 + }, + { + "epoch": 0.3282158282029486, + "grad_norm": 2.156458616256714, + "learning_rate": 6.746850288454076e-05, + "loss": 0.4158666133880615, + "step": 76450 + }, + { + "epoch": 0.3282587602929686, + "grad_norm": 3.7077934741973877, + "learning_rate": 6.746419116442313e-05, + "loss": 0.31851413249969485, + "step": 76460 + }, + { + "epoch": 0.32830169238298856, + "grad_norm": 4.856022834777832, + "learning_rate": 6.745987944430551e-05, + "loss": 0.25244870185852053, + "step": 76470 + }, + { + "epoch": 0.3283446244730086, + "grad_norm": 2.416544198989868, + "learning_rate": 6.745556772418789e-05, + "loss": 0.28528871536254885, + "step": 76480 + }, + { + "epoch": 0.3283875565630286, + "grad_norm": 0.025426995009183884, + "learning_rate": 6.745125600407027e-05, + "loss": 0.1763625144958496, + "step": 76490 + }, + { + "epoch": 0.3284304886530486, + "grad_norm": 0.033551640808582306, + "learning_rate": 6.744694428395264e-05, + "loss": 0.2682926893234253, + "step": 76500 + }, + { + "epoch": 0.3284734207430686, + "grad_norm": 0.4935181140899658, + "learning_rate": 6.744263256383502e-05, + "loss": 0.23141703605651856, + "step": 76510 + }, + { + "epoch": 0.3285163528330886, + "grad_norm": 0.017396489158272743, + "learning_rate": 6.74383208437174e-05, + "loss": 0.34695000648498536, + "step": 76520 + }, + { + "epoch": 0.32855928492310865, + "grad_norm": 0.004304240923374891, + "learning_rate": 6.743400912359978e-05, + "loss": 0.17491270303726197, + "step": 76530 + }, + { + "epoch": 0.32860221701312864, + "grad_norm": 0.0214606374502182, + "learning_rate": 6.742969740348215e-05, + "loss": 0.3861451387405396, + "step": 76540 + }, + { + "epoch": 0.3286451491031486, + "grad_norm": 0.11962751299142838, + "learning_rate": 6.742538568336453e-05, + "loss": 0.1299293279647827, + "step": 76550 + }, + { + "epoch": 0.32868808119316867, + "grad_norm": 0.019032057374715805, + "learning_rate": 6.742107396324691e-05, + "loss": 0.34933903217315676, + "step": 76560 + }, + { + "epoch": 0.32873101328318866, + "grad_norm": 0.00490832282230258, + "learning_rate": 6.741676224312927e-05, + "loss": 0.17231972217559816, + "step": 76570 + }, + { + "epoch": 0.32877394537320864, + "grad_norm": 4.506997585296631, + "learning_rate": 6.741245052301165e-05, + "loss": 0.33571021556854247, + "step": 76580 + }, + { + "epoch": 0.3288168774632287, + "grad_norm": 0.009630247950553894, + "learning_rate": 6.740813880289403e-05, + "loss": 0.0702921986579895, + "step": 76590 + }, + { + "epoch": 0.32885980955324867, + "grad_norm": 1.6018216609954834, + "learning_rate": 6.74038270827764e-05, + "loss": 0.27153944969177246, + "step": 76600 + }, + { + "epoch": 0.32890274164326866, + "grad_norm": 1.0840051174163818, + "learning_rate": 6.739951536265878e-05, + "loss": 0.23780350685119628, + "step": 76610 + }, + { + "epoch": 0.3289456737332887, + "grad_norm": 0.767017126083374, + "learning_rate": 6.739520364254116e-05, + "loss": 0.08334184885025024, + "step": 76620 + }, + { + "epoch": 0.3289886058233087, + "grad_norm": 0.03613976761698723, + "learning_rate": 6.739089192242354e-05, + "loss": 0.29817283153533936, + "step": 76630 + }, + { + "epoch": 0.3290315379133287, + "grad_norm": 4.877974510192871, + "learning_rate": 6.738658020230591e-05, + "loss": 0.3710965633392334, + "step": 76640 + }, + { + "epoch": 0.3290744700033487, + "grad_norm": 0.05267646536231041, + "learning_rate": 6.738226848218828e-05, + "loss": 0.06984040141105652, + "step": 76650 + }, + { + "epoch": 0.3291174020933687, + "grad_norm": 6.402245044708252, + "learning_rate": 6.737795676207065e-05, + "loss": 0.2605679750442505, + "step": 76660 + }, + { + "epoch": 0.3291603341833887, + "grad_norm": 0.4752952754497528, + "learning_rate": 6.737364504195303e-05, + "loss": 0.20503077507019044, + "step": 76670 + }, + { + "epoch": 0.32920326627340873, + "grad_norm": 0.02644643560051918, + "learning_rate": 6.736933332183541e-05, + "loss": 0.19144694805145263, + "step": 76680 + }, + { + "epoch": 0.3292461983634287, + "grad_norm": 0.024135204032063484, + "learning_rate": 6.736502160171779e-05, + "loss": 0.22330479621887206, + "step": 76690 + }, + { + "epoch": 0.3292891304534487, + "grad_norm": 0.06849802285432816, + "learning_rate": 6.736070988160016e-05, + "loss": 0.3209495782852173, + "step": 76700 + }, + { + "epoch": 0.32933206254346875, + "grad_norm": 0.14241854846477509, + "learning_rate": 6.735639816148255e-05, + "loss": 0.22532942295074462, + "step": 76710 + }, + { + "epoch": 0.32937499463348874, + "grad_norm": 0.23180896043777466, + "learning_rate": 6.735208644136493e-05, + "loss": 0.009979206323623657, + "step": 76720 + }, + { + "epoch": 0.3294179267235088, + "grad_norm": 0.006083738524466753, + "learning_rate": 6.734777472124731e-05, + "loss": 0.09892491698265075, + "step": 76730 + }, + { + "epoch": 0.32946085881352877, + "grad_norm": 1.0296895503997803, + "learning_rate": 6.734346300112967e-05, + "loss": 0.2149442434310913, + "step": 76740 + }, + { + "epoch": 0.32950379090354875, + "grad_norm": 4.450377941131592, + "learning_rate": 6.733915128101205e-05, + "loss": 0.4012149333953857, + "step": 76750 + }, + { + "epoch": 0.3295467229935688, + "grad_norm": 0.052618831396102905, + "learning_rate": 6.733483956089443e-05, + "loss": 0.23020663261413574, + "step": 76760 + }, + { + "epoch": 0.3295896550835888, + "grad_norm": 0.047180309891700745, + "learning_rate": 6.73305278407768e-05, + "loss": 0.14085540771484376, + "step": 76770 + }, + { + "epoch": 0.32963258717360877, + "grad_norm": 0.043389927595853806, + "learning_rate": 6.732621612065918e-05, + "loss": 0.32704901695251465, + "step": 76780 + }, + { + "epoch": 0.3296755192636288, + "grad_norm": 2.241126298904419, + "learning_rate": 6.732190440054156e-05, + "loss": 0.09795815348625184, + "step": 76790 + }, + { + "epoch": 0.3297184513536488, + "grad_norm": 1.1609137058258057, + "learning_rate": 6.731759268042394e-05, + "loss": 0.27942144870758057, + "step": 76800 + }, + { + "epoch": 0.3297613834436688, + "grad_norm": 2.5255086421966553, + "learning_rate": 6.731328096030631e-05, + "loss": 0.12147682905197144, + "step": 76810 + }, + { + "epoch": 0.32980431553368883, + "grad_norm": 0.1601771116256714, + "learning_rate": 6.730896924018868e-05, + "loss": 0.09735663533210755, + "step": 76820 + }, + { + "epoch": 0.3298472476237088, + "grad_norm": 0.021886374801397324, + "learning_rate": 6.730465752007106e-05, + "loss": 0.41047191619873047, + "step": 76830 + }, + { + "epoch": 0.3298901797137288, + "grad_norm": 0.005706735420972109, + "learning_rate": 6.730034579995343e-05, + "loss": 0.1476006031036377, + "step": 76840 + }, + { + "epoch": 0.32993311180374885, + "grad_norm": 0.006201342213898897, + "learning_rate": 6.729603407983581e-05, + "loss": 0.31725659370422366, + "step": 76850 + }, + { + "epoch": 0.32997604389376883, + "grad_norm": 4.594889163970947, + "learning_rate": 6.729172235971819e-05, + "loss": 0.4527462005615234, + "step": 76860 + }, + { + "epoch": 0.3300189759837888, + "grad_norm": 1.3257827758789062, + "learning_rate": 6.728741063960056e-05, + "loss": 0.35180106163024905, + "step": 76870 + }, + { + "epoch": 0.33006190807380886, + "grad_norm": 0.00488060899078846, + "learning_rate": 6.728309891948294e-05, + "loss": 0.20447189807891847, + "step": 76880 + }, + { + "epoch": 0.33010484016382885, + "grad_norm": 1.7747087478637695, + "learning_rate": 6.727878719936532e-05, + "loss": 0.33049397468566893, + "step": 76890 + }, + { + "epoch": 0.33014777225384884, + "grad_norm": 1.723191261291504, + "learning_rate": 6.727447547924768e-05, + "loss": 0.2903787612915039, + "step": 76900 + }, + { + "epoch": 0.3301907043438689, + "grad_norm": 0.006007712800055742, + "learning_rate": 6.727016375913006e-05, + "loss": 0.16813333034515382, + "step": 76910 + }, + { + "epoch": 0.33023363643388887, + "grad_norm": 0.06792984157800674, + "learning_rate": 6.726585203901244e-05, + "loss": 0.1623067855834961, + "step": 76920 + }, + { + "epoch": 0.33027656852390885, + "grad_norm": 0.9910484552383423, + "learning_rate": 6.726154031889483e-05, + "loss": 0.2884075403213501, + "step": 76930 + }, + { + "epoch": 0.3303195006139289, + "grad_norm": 1.2997990846633911, + "learning_rate": 6.72572285987772e-05, + "loss": 0.22271907329559326, + "step": 76940 + }, + { + "epoch": 0.3303624327039489, + "grad_norm": 0.0020591760985553265, + "learning_rate": 6.725291687865958e-05, + "loss": 0.2424839735031128, + "step": 76950 + }, + { + "epoch": 0.3304053647939689, + "grad_norm": 0.002365125808864832, + "learning_rate": 6.724860515854196e-05, + "loss": 0.15611679553985597, + "step": 76960 + }, + { + "epoch": 0.3304482968839889, + "grad_norm": 1.1765587329864502, + "learning_rate": 6.724429343842434e-05, + "loss": 0.2949581861495972, + "step": 76970 + }, + { + "epoch": 0.3304912289740089, + "grad_norm": 0.004968604538589716, + "learning_rate": 6.72399817183067e-05, + "loss": 0.19246065616607666, + "step": 76980 + }, + { + "epoch": 0.33053416106402894, + "grad_norm": 1.2751332521438599, + "learning_rate": 6.723566999818908e-05, + "loss": 0.32780234813690184, + "step": 76990 + }, + { + "epoch": 0.33057709315404893, + "grad_norm": 0.08275651931762695, + "learning_rate": 6.723135827807146e-05, + "loss": 0.13589673042297362, + "step": 77000 + }, + { + "epoch": 0.33057709315404893, + "eval_loss": 0.4413558542728424, + "eval_runtime": 27.5166, + "eval_samples_per_second": 3.634, + "eval_steps_per_second": 3.634, + "step": 77000 + }, + { + "epoch": 0.3306200252440689, + "grad_norm": 0.03493470698595047, + "learning_rate": 6.722704655795383e-05, + "loss": 0.2439117193222046, + "step": 77010 + }, + { + "epoch": 0.33066295733408896, + "grad_norm": 0.0626208558678627, + "learning_rate": 6.722273483783621e-05, + "loss": 0.22538723945617675, + "step": 77020 + }, + { + "epoch": 0.33070588942410895, + "grad_norm": 0.7499268054962158, + "learning_rate": 6.721842311771859e-05, + "loss": 0.06841785907745361, + "step": 77030 + }, + { + "epoch": 0.33074882151412893, + "grad_norm": 0.09875093400478363, + "learning_rate": 6.721411139760097e-05, + "loss": 0.04120129942893982, + "step": 77040 + }, + { + "epoch": 0.330791753604149, + "grad_norm": 0.4667881429195404, + "learning_rate": 6.720979967748334e-05, + "loss": 0.32832248210906984, + "step": 77050 + }, + { + "epoch": 0.33083468569416896, + "grad_norm": 1.0091041326522827, + "learning_rate": 6.720548795736571e-05, + "loss": 0.16251636743545533, + "step": 77060 + }, + { + "epoch": 0.33087761778418895, + "grad_norm": 0.01553149800747633, + "learning_rate": 6.720117623724808e-05, + "loss": 0.044237416982650754, + "step": 77070 + }, + { + "epoch": 0.330920549874209, + "grad_norm": 0.00838327594101429, + "learning_rate": 6.719686451713046e-05, + "loss": 0.084531968832016, + "step": 77080 + }, + { + "epoch": 0.330963481964229, + "grad_norm": 4.201925754547119, + "learning_rate": 6.719255279701284e-05, + "loss": 0.31850759983062743, + "step": 77090 + }, + { + "epoch": 0.33100641405424897, + "grad_norm": 0.7134585976600647, + "learning_rate": 6.718824107689522e-05, + "loss": 0.20272321701049806, + "step": 77100 + }, + { + "epoch": 0.331049346144269, + "grad_norm": 2.2711195945739746, + "learning_rate": 6.71839293567776e-05, + "loss": 0.3575067281723022, + "step": 77110 + }, + { + "epoch": 0.331092278234289, + "grad_norm": 3.7381644248962402, + "learning_rate": 6.717961763665997e-05, + "loss": 0.10863602161407471, + "step": 77120 + }, + { + "epoch": 0.331135210324309, + "grad_norm": 0.08643455803394318, + "learning_rate": 6.717530591654235e-05, + "loss": 0.36785068511962893, + "step": 77130 + }, + { + "epoch": 0.331178142414329, + "grad_norm": 0.03152371197938919, + "learning_rate": 6.717099419642473e-05, + "loss": 0.4662750720977783, + "step": 77140 + }, + { + "epoch": 0.331221074504349, + "grad_norm": 2.6884765625, + "learning_rate": 6.71666824763071e-05, + "loss": 0.11766870021820068, + "step": 77150 + }, + { + "epoch": 0.33126400659436905, + "grad_norm": 0.04446806013584137, + "learning_rate": 6.716237075618948e-05, + "loss": 0.19636658430099488, + "step": 77160 + }, + { + "epoch": 0.33130693868438904, + "grad_norm": 0.9991649389266968, + "learning_rate": 6.715805903607186e-05, + "loss": 0.1769741654396057, + "step": 77170 + }, + { + "epoch": 0.331349870774409, + "grad_norm": 0.03879518061876297, + "learning_rate": 6.715374731595424e-05, + "loss": 0.16789230108261108, + "step": 77180 + }, + { + "epoch": 0.33139280286442907, + "grad_norm": 0.009082995355129242, + "learning_rate": 6.714943559583661e-05, + "loss": 0.2841722726821899, + "step": 77190 + }, + { + "epoch": 0.33143573495444906, + "grad_norm": 3.0159389972686768, + "learning_rate": 6.714512387571899e-05, + "loss": 0.08515748977661133, + "step": 77200 + }, + { + "epoch": 0.33147866704446904, + "grad_norm": 2.8059897422790527, + "learning_rate": 6.714081215560137e-05, + "loss": 0.16116414070129395, + "step": 77210 + }, + { + "epoch": 0.3315215991344891, + "grad_norm": 0.8414933085441589, + "learning_rate": 6.713650043548374e-05, + "loss": 0.24936089515686036, + "step": 77220 + }, + { + "epoch": 0.3315645312245091, + "grad_norm": 0.011716615408658981, + "learning_rate": 6.713218871536611e-05, + "loss": 0.19275407791137694, + "step": 77230 + }, + { + "epoch": 0.33160746331452906, + "grad_norm": 0.003756319172680378, + "learning_rate": 6.712787699524849e-05, + "loss": 0.3036526679992676, + "step": 77240 + }, + { + "epoch": 0.3316503954045491, + "grad_norm": 0.998163104057312, + "learning_rate": 6.712356527513086e-05, + "loss": 0.30022311210632324, + "step": 77250 + }, + { + "epoch": 0.3316933274945691, + "grad_norm": 0.19256161153316498, + "learning_rate": 6.711925355501324e-05, + "loss": 0.07962195873260498, + "step": 77260 + }, + { + "epoch": 0.3317362595845891, + "grad_norm": 0.003725707530975342, + "learning_rate": 6.711494183489562e-05, + "loss": 0.2163766622543335, + "step": 77270 + }, + { + "epoch": 0.3317791916746091, + "grad_norm": 2.38783860206604, + "learning_rate": 6.7110630114778e-05, + "loss": 0.3317635774612427, + "step": 77280 + }, + { + "epoch": 0.3318221237646291, + "grad_norm": 0.018589181825518608, + "learning_rate": 6.710631839466037e-05, + "loss": 0.15659880638122559, + "step": 77290 + }, + { + "epoch": 0.3318650558546491, + "grad_norm": 0.03237080201506615, + "learning_rate": 6.710200667454275e-05, + "loss": 0.19567469358444214, + "step": 77300 + }, + { + "epoch": 0.33190798794466914, + "grad_norm": 0.005493505857884884, + "learning_rate": 6.709769495442511e-05, + "loss": 0.2278986692428589, + "step": 77310 + }, + { + "epoch": 0.3319509200346891, + "grad_norm": 1.1253221035003662, + "learning_rate": 6.709338323430749e-05, + "loss": 0.24583756923675537, + "step": 77320 + }, + { + "epoch": 0.3319938521247091, + "grad_norm": 11.32041072845459, + "learning_rate": 6.708907151418987e-05, + "loss": 0.32453408241271975, + "step": 77330 + }, + { + "epoch": 0.33203678421472915, + "grad_norm": 0.0009816490346565843, + "learning_rate": 6.708475979407225e-05, + "loss": 0.3192496061325073, + "step": 77340 + }, + { + "epoch": 0.33207971630474914, + "grad_norm": 1.2368215322494507, + "learning_rate": 6.708044807395462e-05, + "loss": 0.1536526083946228, + "step": 77350 + }, + { + "epoch": 0.3321226483947691, + "grad_norm": 1.075612187385559, + "learning_rate": 6.7076136353837e-05, + "loss": 0.24217960834503174, + "step": 77360 + }, + { + "epoch": 0.33216558048478917, + "grad_norm": 0.016452711075544357, + "learning_rate": 6.707182463371938e-05, + "loss": 0.1228145956993103, + "step": 77370 + }, + { + "epoch": 0.33220851257480916, + "grad_norm": 0.51668781042099, + "learning_rate": 6.706751291360176e-05, + "loss": 0.3324535608291626, + "step": 77380 + }, + { + "epoch": 0.3322514446648292, + "grad_norm": 0.038861099630594254, + "learning_rate": 6.706320119348413e-05, + "loss": 0.18577756881713867, + "step": 77390 + }, + { + "epoch": 0.3322943767548492, + "grad_norm": 0.0012117112055420876, + "learning_rate": 6.705888947336651e-05, + "loss": 0.13234348297119142, + "step": 77400 + }, + { + "epoch": 0.3323373088448692, + "grad_norm": 0.09664575755596161, + "learning_rate": 6.705457775324889e-05, + "loss": 0.1769404411315918, + "step": 77410 + }, + { + "epoch": 0.3323802409348892, + "grad_norm": 0.14990632236003876, + "learning_rate": 6.705026603313126e-05, + "loss": 0.19105058908462524, + "step": 77420 + }, + { + "epoch": 0.3324231730249092, + "grad_norm": 6.025893688201904, + "learning_rate": 6.704595431301364e-05, + "loss": 0.2703073978424072, + "step": 77430 + }, + { + "epoch": 0.3324661051149292, + "grad_norm": 2.898651123046875, + "learning_rate": 6.704164259289602e-05, + "loss": 0.33019936084747314, + "step": 77440 + }, + { + "epoch": 0.33250903720494923, + "grad_norm": 0.02029103972017765, + "learning_rate": 6.70373308727784e-05, + "loss": 0.272914457321167, + "step": 77450 + }, + { + "epoch": 0.3325519692949692, + "grad_norm": 6.893616199493408, + "learning_rate": 6.703301915266077e-05, + "loss": 0.2154677391052246, + "step": 77460 + }, + { + "epoch": 0.3325949013849892, + "grad_norm": 0.013883881270885468, + "learning_rate": 6.702870743254315e-05, + "loss": 0.2001575469970703, + "step": 77470 + }, + { + "epoch": 0.33263783347500925, + "grad_norm": 1.378208041191101, + "learning_rate": 6.702439571242552e-05, + "loss": 0.16259143352508545, + "step": 77480 + }, + { + "epoch": 0.33268076556502924, + "grad_norm": 1.3300849199295044, + "learning_rate": 6.702008399230789e-05, + "loss": 0.13338098526000977, + "step": 77490 + }, + { + "epoch": 0.3327236976550492, + "grad_norm": 0.22622959315776825, + "learning_rate": 6.701577227219027e-05, + "loss": 0.07103768587112427, + "step": 77500 + }, + { + "epoch": 0.33276662974506926, + "grad_norm": 0.0017219501314684749, + "learning_rate": 6.701146055207265e-05, + "loss": 0.12565276622772217, + "step": 77510 + }, + { + "epoch": 0.33280956183508925, + "grad_norm": 1.9283549785614014, + "learning_rate": 6.700714883195502e-05, + "loss": 0.48866925239562986, + "step": 77520 + }, + { + "epoch": 0.33285249392510924, + "grad_norm": 3.044318914413452, + "learning_rate": 6.70028371118374e-05, + "loss": 0.1290292263031006, + "step": 77530 + }, + { + "epoch": 0.3328954260151293, + "grad_norm": 0.029055681079626083, + "learning_rate": 6.699852539171978e-05, + "loss": 0.3163025140762329, + "step": 77540 + }, + { + "epoch": 0.33293835810514927, + "grad_norm": 0.7699205875396729, + "learning_rate": 6.699421367160216e-05, + "loss": 0.007592477649450302, + "step": 77550 + }, + { + "epoch": 0.33298129019516925, + "grad_norm": 0.026558570563793182, + "learning_rate": 6.698990195148452e-05, + "loss": 0.2869630575180054, + "step": 77560 + }, + { + "epoch": 0.3330242222851893, + "grad_norm": 8.344164848327637, + "learning_rate": 6.69855902313669e-05, + "loss": 0.49628534317016604, + "step": 77570 + }, + { + "epoch": 0.3330671543752093, + "grad_norm": 0.06096833571791649, + "learning_rate": 6.698127851124927e-05, + "loss": 0.3495140314102173, + "step": 77580 + }, + { + "epoch": 0.3331100864652293, + "grad_norm": 1.687309741973877, + "learning_rate": 6.697696679113165e-05, + "loss": 0.2711905241012573, + "step": 77590 + }, + { + "epoch": 0.3331530185552493, + "grad_norm": 3.5439131259918213, + "learning_rate": 6.697265507101403e-05, + "loss": 0.47504277229309083, + "step": 77600 + }, + { + "epoch": 0.3331959506452693, + "grad_norm": 2.0073721408843994, + "learning_rate": 6.696834335089641e-05, + "loss": 0.20028815269470215, + "step": 77610 + }, + { + "epoch": 0.33323888273528934, + "grad_norm": 1.768701434135437, + "learning_rate": 6.696403163077878e-05, + "loss": 0.3161693811416626, + "step": 77620 + }, + { + "epoch": 0.33328181482530933, + "grad_norm": 0.281827449798584, + "learning_rate": 6.695971991066116e-05, + "loss": 0.19931249618530272, + "step": 77630 + }, + { + "epoch": 0.3333247469153293, + "grad_norm": 1.3660832643508911, + "learning_rate": 6.695540819054354e-05, + "loss": 0.19346129894256592, + "step": 77640 + }, + { + "epoch": 0.33336767900534936, + "grad_norm": 1.3048043251037598, + "learning_rate": 6.695109647042592e-05, + "loss": 0.14530563354492188, + "step": 77650 + }, + { + "epoch": 0.33341061109536935, + "grad_norm": 0.03229120001196861, + "learning_rate": 6.69467847503083e-05, + "loss": 0.36408612728118894, + "step": 77660 + }, + { + "epoch": 0.33345354318538933, + "grad_norm": 1.1294801235198975, + "learning_rate": 6.694247303019067e-05, + "loss": 0.33203485012054446, + "step": 77670 + }, + { + "epoch": 0.3334964752754094, + "grad_norm": 0.07346589118242264, + "learning_rate": 6.693816131007305e-05, + "loss": 0.17584925889968872, + "step": 77680 + }, + { + "epoch": 0.33353940736542936, + "grad_norm": 3.982257604598999, + "learning_rate": 6.693384958995543e-05, + "loss": 0.06325796842575074, + "step": 77690 + }, + { + "epoch": 0.33358233945544935, + "grad_norm": 0.03161519393324852, + "learning_rate": 6.69295378698378e-05, + "loss": 0.20434770584106446, + "step": 77700 + }, + { + "epoch": 0.3336252715454694, + "grad_norm": 0.42918524146080017, + "learning_rate": 6.692522614972018e-05, + "loss": 0.23393239974975585, + "step": 77710 + }, + { + "epoch": 0.3336682036354894, + "grad_norm": 0.058732885867357254, + "learning_rate": 6.692091442960254e-05, + "loss": 0.2614431619644165, + "step": 77720 + }, + { + "epoch": 0.33371113572550937, + "grad_norm": 3.1066834926605225, + "learning_rate": 6.691660270948492e-05, + "loss": 0.2023782968521118, + "step": 77730 + }, + { + "epoch": 0.3337540678155294, + "grad_norm": 0.013925936073064804, + "learning_rate": 6.69122909893673e-05, + "loss": 0.08589456081390381, + "step": 77740 + }, + { + "epoch": 0.3337969999055494, + "grad_norm": 0.9002230167388916, + "learning_rate": 6.690797926924968e-05, + "loss": 0.21781384944915771, + "step": 77750 + }, + { + "epoch": 0.3338399319955694, + "grad_norm": 0.013174137100577354, + "learning_rate": 6.690366754913205e-05, + "loss": 0.18269798755645753, + "step": 77760 + }, + { + "epoch": 0.3338828640855894, + "grad_norm": 0.03236332908272743, + "learning_rate": 6.689935582901443e-05, + "loss": 0.15968105792999268, + "step": 77770 + }, + { + "epoch": 0.3339257961756094, + "grad_norm": 0.15361715853214264, + "learning_rate": 6.689504410889681e-05, + "loss": 0.36173908710479735, + "step": 77780 + }, + { + "epoch": 0.3339687282656294, + "grad_norm": 0.5949799418449402, + "learning_rate": 6.689073238877919e-05, + "loss": 0.3477851390838623, + "step": 77790 + }, + { + "epoch": 0.33401166035564944, + "grad_norm": 0.023436477407813072, + "learning_rate": 6.688642066866155e-05, + "loss": 0.11365103721618652, + "step": 77800 + }, + { + "epoch": 0.33405459244566943, + "grad_norm": 0.003403679234907031, + "learning_rate": 6.688210894854393e-05, + "loss": 0.20094313621520996, + "step": 77810 + }, + { + "epoch": 0.33409752453568947, + "grad_norm": 0.027733566239476204, + "learning_rate": 6.68777972284263e-05, + "loss": 0.17328678369522094, + "step": 77820 + }, + { + "epoch": 0.33414045662570946, + "grad_norm": 0.16135767102241516, + "learning_rate": 6.687348550830868e-05, + "loss": 0.04763566255569458, + "step": 77830 + }, + { + "epoch": 0.33418338871572945, + "grad_norm": 0.38207441568374634, + "learning_rate": 6.686917378819106e-05, + "loss": 0.13231130838394164, + "step": 77840 + }, + { + "epoch": 0.3342263208057495, + "grad_norm": 0.04116377979516983, + "learning_rate": 6.686486206807344e-05, + "loss": 0.22739045619964598, + "step": 77850 + }, + { + "epoch": 0.3342692528957695, + "grad_norm": 1.2879244089126587, + "learning_rate": 6.686055034795581e-05, + "loss": 0.3036820650100708, + "step": 77860 + }, + { + "epoch": 0.33431218498578946, + "grad_norm": 4.732523441314697, + "learning_rate": 6.685623862783819e-05, + "loss": 0.4515406608581543, + "step": 77870 + }, + { + "epoch": 0.3343551170758095, + "grad_norm": 0.006630048621445894, + "learning_rate": 6.685192690772057e-05, + "loss": 0.08107486963272095, + "step": 77880 + }, + { + "epoch": 0.3343980491658295, + "grad_norm": 1.9734116792678833, + "learning_rate": 6.684761518760295e-05, + "loss": 0.22512927055358886, + "step": 77890 + }, + { + "epoch": 0.3344409812558495, + "grad_norm": 0.08230617642402649, + "learning_rate": 6.684330346748532e-05, + "loss": 0.18863047361373902, + "step": 77900 + }, + { + "epoch": 0.3344839133458695, + "grad_norm": 1.6849898099899292, + "learning_rate": 6.68389917473677e-05, + "loss": 0.17646138668060302, + "step": 77910 + }, + { + "epoch": 0.3345268454358895, + "grad_norm": 1.1425594091415405, + "learning_rate": 6.683468002725008e-05, + "loss": 0.33102991580963137, + "step": 77920 + }, + { + "epoch": 0.3345697775259095, + "grad_norm": 0.003869341453537345, + "learning_rate": 6.683036830713245e-05, + "loss": 0.20403015613555908, + "step": 77930 + }, + { + "epoch": 0.33461270961592954, + "grad_norm": 0.0006105902139097452, + "learning_rate": 6.682605658701483e-05, + "loss": 0.20998027324676513, + "step": 77940 + }, + { + "epoch": 0.3346556417059495, + "grad_norm": 0.05356431007385254, + "learning_rate": 6.682174486689721e-05, + "loss": 0.20201430320739747, + "step": 77950 + }, + { + "epoch": 0.3346985737959695, + "grad_norm": 5.7862091064453125, + "learning_rate": 6.681743314677959e-05, + "loss": 0.36837890148162844, + "step": 77960 + }, + { + "epoch": 0.33474150588598955, + "grad_norm": 0.9469812512397766, + "learning_rate": 6.681312142666195e-05, + "loss": 0.21669418811798097, + "step": 77970 + }, + { + "epoch": 0.33478443797600954, + "grad_norm": 2.1246681213378906, + "learning_rate": 6.680880970654433e-05, + "loss": 0.5200969696044921, + "step": 77980 + }, + { + "epoch": 0.33482737006602953, + "grad_norm": 2.986363649368286, + "learning_rate": 6.68044979864267e-05, + "loss": 0.44148664474487304, + "step": 77990 + }, + { + "epoch": 0.33487030215604957, + "grad_norm": 0.021413365378975868, + "learning_rate": 6.680018626630908e-05, + "loss": 0.2957026243209839, + "step": 78000 + }, + { + "epoch": 0.33487030215604957, + "eval_loss": 0.4273984134197235, + "eval_runtime": 27.4708, + "eval_samples_per_second": 3.64, + "eval_steps_per_second": 3.64, + "step": 78000 + }, + { + "epoch": 0.33491323424606956, + "grad_norm": 0.030063766986131668, + "learning_rate": 6.679587454619146e-05, + "loss": 0.16760578155517578, + "step": 78010 + }, + { + "epoch": 0.3349561663360896, + "grad_norm": 1.2150861024856567, + "learning_rate": 6.679156282607384e-05, + "loss": 0.3231816291809082, + "step": 78020 + }, + { + "epoch": 0.3349990984261096, + "grad_norm": 0.07824573665857315, + "learning_rate": 6.678725110595621e-05, + "loss": 0.6120881080627442, + "step": 78030 + }, + { + "epoch": 0.3350420305161296, + "grad_norm": 0.3086092472076416, + "learning_rate": 6.678293938583859e-05, + "loss": 0.18578628301620484, + "step": 78040 + }, + { + "epoch": 0.3350849626061496, + "grad_norm": 236.54244995117188, + "learning_rate": 6.677862766572096e-05, + "loss": 0.27489328384399414, + "step": 78050 + }, + { + "epoch": 0.3351278946961696, + "grad_norm": 4.726393699645996, + "learning_rate": 6.677431594560333e-05, + "loss": 0.19919774532318116, + "step": 78060 + }, + { + "epoch": 0.3351708267861896, + "grad_norm": 5.875321388244629, + "learning_rate": 6.677000422548571e-05, + "loss": 0.17618502378463746, + "step": 78070 + }, + { + "epoch": 0.33521375887620963, + "grad_norm": 0.017372211441397667, + "learning_rate": 6.676569250536809e-05, + "loss": 0.2838648796081543, + "step": 78080 + }, + { + "epoch": 0.3352566909662296, + "grad_norm": 0.20975318551063538, + "learning_rate": 6.676138078525047e-05, + "loss": 0.1958579897880554, + "step": 78090 + }, + { + "epoch": 0.3352996230562496, + "grad_norm": 0.8249739408493042, + "learning_rate": 6.675706906513284e-05, + "loss": 0.3073481798171997, + "step": 78100 + }, + { + "epoch": 0.33534255514626965, + "grad_norm": 0.01300235278904438, + "learning_rate": 6.675275734501522e-05, + "loss": 0.1496501684188843, + "step": 78110 + }, + { + "epoch": 0.33538548723628964, + "grad_norm": 0.06454453617334366, + "learning_rate": 6.674844562489761e-05, + "loss": 0.19467418193817138, + "step": 78120 + }, + { + "epoch": 0.3354284193263096, + "grad_norm": 1.8716318607330322, + "learning_rate": 6.674413390477997e-05, + "loss": 0.20901689529418946, + "step": 78130 + }, + { + "epoch": 0.33547135141632967, + "grad_norm": 0.023373577743768692, + "learning_rate": 6.673982218466235e-05, + "loss": 0.23439536094665528, + "step": 78140 + }, + { + "epoch": 0.33551428350634965, + "grad_norm": 2.2952160835266113, + "learning_rate": 6.673551046454473e-05, + "loss": 0.36932108402252195, + "step": 78150 + }, + { + "epoch": 0.33555721559636964, + "grad_norm": 0.006046372931450605, + "learning_rate": 6.67311987444271e-05, + "loss": 0.16835312843322753, + "step": 78160 + }, + { + "epoch": 0.3356001476863897, + "grad_norm": 0.00841306522488594, + "learning_rate": 6.672688702430948e-05, + "loss": 0.22861673831939697, + "step": 78170 + }, + { + "epoch": 0.33564307977640967, + "grad_norm": 0.09327050298452377, + "learning_rate": 6.672257530419186e-05, + "loss": 0.17674237489700317, + "step": 78180 + }, + { + "epoch": 0.33568601186642966, + "grad_norm": 0.5533800721168518, + "learning_rate": 6.671826358407424e-05, + "loss": 0.40234909057617185, + "step": 78190 + }, + { + "epoch": 0.3357289439564497, + "grad_norm": 0.018703026697039604, + "learning_rate": 6.671395186395662e-05, + "loss": 0.08413835167884827, + "step": 78200 + }, + { + "epoch": 0.3357718760464697, + "grad_norm": 0.5634124875068665, + "learning_rate": 6.6709640143839e-05, + "loss": 0.12944929599761962, + "step": 78210 + }, + { + "epoch": 0.3358148081364897, + "grad_norm": 2.1403753757476807, + "learning_rate": 6.670532842372136e-05, + "loss": 0.20814151763916017, + "step": 78220 + }, + { + "epoch": 0.3358577402265097, + "grad_norm": 3.3457531929016113, + "learning_rate": 6.670101670360373e-05, + "loss": 0.17461209297180175, + "step": 78230 + }, + { + "epoch": 0.3359006723165297, + "grad_norm": 0.2121453732252121, + "learning_rate": 6.669670498348611e-05, + "loss": 0.1593286395072937, + "step": 78240 + }, + { + "epoch": 0.33594360440654975, + "grad_norm": 5.8306403160095215, + "learning_rate": 6.669239326336849e-05, + "loss": 0.09663127064704895, + "step": 78250 + }, + { + "epoch": 0.33598653649656973, + "grad_norm": 0.11331497132778168, + "learning_rate": 6.668808154325087e-05, + "loss": 0.32361640930175783, + "step": 78260 + }, + { + "epoch": 0.3360294685865897, + "grad_norm": 0.011147410608828068, + "learning_rate": 6.668376982313324e-05, + "loss": 0.14730819463729858, + "step": 78270 + }, + { + "epoch": 0.33607240067660976, + "grad_norm": 0.006566631607711315, + "learning_rate": 6.667945810301562e-05, + "loss": 0.38276846408843995, + "step": 78280 + }, + { + "epoch": 0.33611533276662975, + "grad_norm": 4.870085716247559, + "learning_rate": 6.6675146382898e-05, + "loss": 0.40787491798400877, + "step": 78290 + }, + { + "epoch": 0.33615826485664974, + "grad_norm": 0.06610367447137833, + "learning_rate": 6.667083466278036e-05, + "loss": 0.11140779256820679, + "step": 78300 + }, + { + "epoch": 0.3362011969466698, + "grad_norm": 0.08063236624002457, + "learning_rate": 6.666652294266274e-05, + "loss": 0.10844311714172364, + "step": 78310 + }, + { + "epoch": 0.33624412903668977, + "grad_norm": 0.005386178847402334, + "learning_rate": 6.666221122254512e-05, + "loss": 0.23304014205932616, + "step": 78320 + }, + { + "epoch": 0.33628706112670975, + "grad_norm": 1.1538461446762085, + "learning_rate": 6.66578995024275e-05, + "loss": 0.24594826698303224, + "step": 78330 + }, + { + "epoch": 0.3363299932167298, + "grad_norm": 0.05892343819141388, + "learning_rate": 6.665358778230989e-05, + "loss": 0.2531136035919189, + "step": 78340 + }, + { + "epoch": 0.3363729253067498, + "grad_norm": 0.0801706314086914, + "learning_rate": 6.664927606219226e-05, + "loss": 0.3033801555633545, + "step": 78350 + }, + { + "epoch": 0.33641585739676977, + "grad_norm": 5.389936447143555, + "learning_rate": 6.664496434207464e-05, + "loss": 0.3112450122833252, + "step": 78360 + }, + { + "epoch": 0.3364587894867898, + "grad_norm": 7.637985706329346, + "learning_rate": 6.664065262195702e-05, + "loss": 0.25974130630493164, + "step": 78370 + }, + { + "epoch": 0.3365017215768098, + "grad_norm": 1.631525993347168, + "learning_rate": 6.663634090183938e-05, + "loss": 0.07255272269248962, + "step": 78380 + }, + { + "epoch": 0.3365446536668298, + "grad_norm": 1.459124207496643, + "learning_rate": 6.663202918172176e-05, + "loss": 0.314791202545166, + "step": 78390 + }, + { + "epoch": 0.3365875857568498, + "grad_norm": 0.12745119631290436, + "learning_rate": 6.662771746160414e-05, + "loss": 0.06704494953155518, + "step": 78400 + }, + { + "epoch": 0.3366305178468698, + "grad_norm": 2.5058093070983887, + "learning_rate": 6.662340574148651e-05, + "loss": 0.3335081100463867, + "step": 78410 + }, + { + "epoch": 0.3366734499368898, + "grad_norm": 0.35333049297332764, + "learning_rate": 6.661909402136889e-05, + "loss": 0.2435668706893921, + "step": 78420 + }, + { + "epoch": 0.33671638202690984, + "grad_norm": 0.24012352526187897, + "learning_rate": 6.661478230125127e-05, + "loss": 0.2389286994934082, + "step": 78430 + }, + { + "epoch": 0.33675931411692983, + "grad_norm": 0.20842285454273224, + "learning_rate": 6.661047058113365e-05, + "loss": 0.21304574012756347, + "step": 78440 + }, + { + "epoch": 0.3368022462069499, + "grad_norm": 0.11603209376335144, + "learning_rate": 6.660615886101602e-05, + "loss": 0.33554635047912595, + "step": 78450 + }, + { + "epoch": 0.33684517829696986, + "grad_norm": 1.2757017612457275, + "learning_rate": 6.660184714089839e-05, + "loss": 0.24091644287109376, + "step": 78460 + }, + { + "epoch": 0.33688811038698985, + "grad_norm": 0.002968688029795885, + "learning_rate": 6.659753542078076e-05, + "loss": 0.3429348707199097, + "step": 78470 + }, + { + "epoch": 0.3369310424770099, + "grad_norm": 1.2616757154464722, + "learning_rate": 6.659322370066314e-05, + "loss": 0.40502257347106935, + "step": 78480 + }, + { + "epoch": 0.3369739745670299, + "grad_norm": 0.0027247348334640265, + "learning_rate": 6.658891198054552e-05, + "loss": 0.10175679922103882, + "step": 78490 + }, + { + "epoch": 0.33701690665704986, + "grad_norm": 0.030607003718614578, + "learning_rate": 6.65846002604279e-05, + "loss": 0.45574145317077636, + "step": 78500 + }, + { + "epoch": 0.3370598387470699, + "grad_norm": 0.8096807599067688, + "learning_rate": 6.658028854031027e-05, + "loss": 0.1723085403442383, + "step": 78510 + }, + { + "epoch": 0.3371027708370899, + "grad_norm": 3.5476157665252686, + "learning_rate": 6.657597682019265e-05, + "loss": 0.3150531768798828, + "step": 78520 + }, + { + "epoch": 0.3371457029271099, + "grad_norm": 2.841060161590576, + "learning_rate": 6.657166510007503e-05, + "loss": 0.12495272159576416, + "step": 78530 + }, + { + "epoch": 0.3371886350171299, + "grad_norm": 0.03028915636241436, + "learning_rate": 6.656735337995739e-05, + "loss": 0.16928415298461913, + "step": 78540 + }, + { + "epoch": 0.3372315671071499, + "grad_norm": 0.006461134646087885, + "learning_rate": 6.656304165983977e-05, + "loss": 0.21656641960144044, + "step": 78550 + }, + { + "epoch": 0.3372744991971699, + "grad_norm": 0.02571956440806389, + "learning_rate": 6.655872993972216e-05, + "loss": 0.3709154844284058, + "step": 78560 + }, + { + "epoch": 0.33731743128718994, + "grad_norm": 0.4863227605819702, + "learning_rate": 6.655441821960454e-05, + "loss": 0.10554158687591553, + "step": 78570 + }, + { + "epoch": 0.3373603633772099, + "grad_norm": 0.036538202315568924, + "learning_rate": 6.655010649948691e-05, + "loss": 0.08269681334495545, + "step": 78580 + }, + { + "epoch": 0.3374032954672299, + "grad_norm": 0.061518244445323944, + "learning_rate": 6.654579477936929e-05, + "loss": 0.017988350987434388, + "step": 78590 + }, + { + "epoch": 0.33744622755724996, + "grad_norm": 0.021507324650883675, + "learning_rate": 6.654148305925167e-05, + "loss": 0.12341071367263794, + "step": 78600 + }, + { + "epoch": 0.33748915964726994, + "grad_norm": 0.038689348846673965, + "learning_rate": 6.653717133913405e-05, + "loss": 0.3153055191040039, + "step": 78610 + }, + { + "epoch": 0.33753209173728993, + "grad_norm": 0.13708128035068512, + "learning_rate": 6.653285961901642e-05, + "loss": 0.36469757556915283, + "step": 78620 + }, + { + "epoch": 0.33757502382731, + "grad_norm": 2.7672643661499023, + "learning_rate": 6.652854789889879e-05, + "loss": 0.2653116941452026, + "step": 78630 + }, + { + "epoch": 0.33761795591732996, + "grad_norm": 12.475768089294434, + "learning_rate": 6.652423617878116e-05, + "loss": 0.29444379806518556, + "step": 78640 + }, + { + "epoch": 0.33766088800734995, + "grad_norm": 1.8217633962631226, + "learning_rate": 6.651992445866354e-05, + "loss": 0.33646581172943113, + "step": 78650 + }, + { + "epoch": 0.33770382009737, + "grad_norm": 0.08565472811460495, + "learning_rate": 6.651561273854592e-05, + "loss": 0.35305285453796387, + "step": 78660 + }, + { + "epoch": 0.33774675218739, + "grad_norm": 0.071127749979496, + "learning_rate": 6.65113010184283e-05, + "loss": 0.1562727212905884, + "step": 78670 + }, + { + "epoch": 0.33778968427741, + "grad_norm": 0.9672734141349792, + "learning_rate": 6.650698929831067e-05, + "loss": 0.20825026035308838, + "step": 78680 + }, + { + "epoch": 0.33783261636743, + "grad_norm": 0.3451642096042633, + "learning_rate": 6.650267757819305e-05, + "loss": 0.12548161745071412, + "step": 78690 + }, + { + "epoch": 0.33787554845745, + "grad_norm": 0.023614799603819847, + "learning_rate": 6.649836585807543e-05, + "loss": 0.2497967004776001, + "step": 78700 + }, + { + "epoch": 0.33791848054747003, + "grad_norm": 0.11236807703971863, + "learning_rate": 6.649405413795779e-05, + "loss": 0.3373664140701294, + "step": 78710 + }, + { + "epoch": 0.33796141263749, + "grad_norm": 0.00224771024659276, + "learning_rate": 6.648974241784017e-05, + "loss": 0.18866246938705444, + "step": 78720 + }, + { + "epoch": 0.33800434472751, + "grad_norm": 0.33207470178604126, + "learning_rate": 6.648543069772255e-05, + "loss": 0.2463681936264038, + "step": 78730 + }, + { + "epoch": 0.33804727681753005, + "grad_norm": 3.1149215698242188, + "learning_rate": 6.648111897760492e-05, + "loss": 0.3616062641143799, + "step": 78740 + }, + { + "epoch": 0.33809020890755004, + "grad_norm": 0.40803834795951843, + "learning_rate": 6.64768072574873e-05, + "loss": 0.17350724935531617, + "step": 78750 + }, + { + "epoch": 0.33813314099757, + "grad_norm": 7.756111145019531, + "learning_rate": 6.647249553736968e-05, + "loss": 0.2586477994918823, + "step": 78760 + }, + { + "epoch": 0.33817607308759007, + "grad_norm": 1.2022218704223633, + "learning_rate": 6.646818381725206e-05, + "loss": 0.28184449672698975, + "step": 78770 + }, + { + "epoch": 0.33821900517761005, + "grad_norm": 0.03321431577205658, + "learning_rate": 6.646387209713443e-05, + "loss": 0.27339468002319334, + "step": 78780 + }, + { + "epoch": 0.33826193726763004, + "grad_norm": 10.436616897583008, + "learning_rate": 6.645956037701681e-05, + "loss": 0.39653310775756834, + "step": 78790 + }, + { + "epoch": 0.3383048693576501, + "grad_norm": 0.02555697225034237, + "learning_rate": 6.645524865689919e-05, + "loss": 0.11828770637512206, + "step": 78800 + }, + { + "epoch": 0.33834780144767007, + "grad_norm": 0.8284059166908264, + "learning_rate": 6.645093693678157e-05, + "loss": 0.18440791368484497, + "step": 78810 + }, + { + "epoch": 0.33839073353769006, + "grad_norm": 9.03634262084961, + "learning_rate": 6.644662521666394e-05, + "loss": 0.2156630277633667, + "step": 78820 + }, + { + "epoch": 0.3384336656277101, + "grad_norm": 0.044301051646471024, + "learning_rate": 6.644231349654632e-05, + "loss": 0.13982144594192505, + "step": 78830 + }, + { + "epoch": 0.3384765977177301, + "grad_norm": 0.06587236374616623, + "learning_rate": 6.64380017764287e-05, + "loss": 0.05743167400360107, + "step": 78840 + }, + { + "epoch": 0.3385195298077501, + "grad_norm": 0.006749128457158804, + "learning_rate": 6.643369005631108e-05, + "loss": 0.1766321063041687, + "step": 78850 + }, + { + "epoch": 0.3385624618977701, + "grad_norm": 0.1305362582206726, + "learning_rate": 6.642937833619345e-05, + "loss": 0.16463260650634765, + "step": 78860 + }, + { + "epoch": 0.3386053939877901, + "grad_norm": 0.29861336946487427, + "learning_rate": 6.642506661607582e-05, + "loss": 0.4933755397796631, + "step": 78870 + }, + { + "epoch": 0.33864832607781015, + "grad_norm": 0.1558838188648224, + "learning_rate": 6.64207548959582e-05, + "loss": 0.16021682024002076, + "step": 78880 + }, + { + "epoch": 0.33869125816783013, + "grad_norm": 0.12155012786388397, + "learning_rate": 6.641644317584057e-05, + "loss": 0.18769705295562744, + "step": 78890 + }, + { + "epoch": 0.3387341902578501, + "grad_norm": 0.003283413592725992, + "learning_rate": 6.641213145572295e-05, + "loss": 0.07501508593559265, + "step": 78900 + }, + { + "epoch": 0.33877712234787016, + "grad_norm": 0.007337587419897318, + "learning_rate": 6.640781973560533e-05, + "loss": 0.23510518074035644, + "step": 78910 + }, + { + "epoch": 0.33882005443789015, + "grad_norm": 1.428571343421936, + "learning_rate": 6.64035080154877e-05, + "loss": 0.2963968276977539, + "step": 78920 + }, + { + "epoch": 0.33886298652791014, + "grad_norm": 0.006976236589252949, + "learning_rate": 6.639919629537008e-05, + "loss": 0.38243851661682127, + "step": 78930 + }, + { + "epoch": 0.3389059186179302, + "grad_norm": 0.13427439332008362, + "learning_rate": 6.639488457525246e-05, + "loss": 0.19198756217956542, + "step": 78940 + }, + { + "epoch": 0.33894885070795017, + "grad_norm": 0.16244111955165863, + "learning_rate": 6.639057285513484e-05, + "loss": 0.2261066198348999, + "step": 78950 + }, + { + "epoch": 0.33899178279797015, + "grad_norm": 0.5172285437583923, + "learning_rate": 6.63862611350172e-05, + "loss": 0.23315682411193847, + "step": 78960 + }, + { + "epoch": 0.3390347148879902, + "grad_norm": 2.285973072052002, + "learning_rate": 6.638194941489958e-05, + "loss": 0.16300102472305297, + "step": 78970 + }, + { + "epoch": 0.3390776469780102, + "grad_norm": 0.0691041648387909, + "learning_rate": 6.637763769478195e-05, + "loss": 0.3021952390670776, + "step": 78980 + }, + { + "epoch": 0.33912057906803017, + "grad_norm": 0.045362479984760284, + "learning_rate": 6.637332597466433e-05, + "loss": 0.14102792739868164, + "step": 78990 + }, + { + "epoch": 0.3391635111580502, + "grad_norm": 27.12893295288086, + "learning_rate": 6.636901425454671e-05, + "loss": 0.21746547222137452, + "step": 79000 + }, + { + "epoch": 0.3391635111580502, + "eval_loss": 0.4396827220916748, + "eval_runtime": 27.1677, + "eval_samples_per_second": 3.681, + "eval_steps_per_second": 3.681, + "step": 79000 + }, + { + "epoch": 0.3392064432480702, + "grad_norm": 0.014844651333987713, + "learning_rate": 6.636470253442909e-05, + "loss": 0.273270845413208, + "step": 79010 + }, + { + "epoch": 0.3392493753380902, + "grad_norm": 0.0699886828660965, + "learning_rate": 6.636039081431146e-05, + "loss": 0.36042520999908445, + "step": 79020 + }, + { + "epoch": 0.33929230742811023, + "grad_norm": 0.13504739105701447, + "learning_rate": 6.635607909419384e-05, + "loss": 0.16561659574508666, + "step": 79030 + }, + { + "epoch": 0.3393352395181302, + "grad_norm": 0.2788245379924774, + "learning_rate": 6.635176737407622e-05, + "loss": 0.17961949110031128, + "step": 79040 + }, + { + "epoch": 0.3393781716081502, + "grad_norm": 0.19097217917442322, + "learning_rate": 6.63474556539586e-05, + "loss": 0.14346141815185548, + "step": 79050 + }, + { + "epoch": 0.33942110369817025, + "grad_norm": 0.07555440813302994, + "learning_rate": 6.634314393384097e-05, + "loss": 0.13214304447174072, + "step": 79060 + }, + { + "epoch": 0.33946403578819023, + "grad_norm": 0.00638962909579277, + "learning_rate": 6.633883221372335e-05, + "loss": 0.14622740745544432, + "step": 79070 + }, + { + "epoch": 0.3395069678782102, + "grad_norm": 0.004908399190753698, + "learning_rate": 6.633452049360573e-05, + "loss": 0.12188636064529419, + "step": 79080 + }, + { + "epoch": 0.33954989996823026, + "grad_norm": 0.016102461144328117, + "learning_rate": 6.63302087734881e-05, + "loss": 0.17850993871688842, + "step": 79090 + }, + { + "epoch": 0.33959283205825025, + "grad_norm": 0.3554086685180664, + "learning_rate": 6.632589705337048e-05, + "loss": 0.05948584079742432, + "step": 79100 + }, + { + "epoch": 0.3396357641482703, + "grad_norm": 165.74964904785156, + "learning_rate": 6.632158533325286e-05, + "loss": 0.12853333950042725, + "step": 79110 + }, + { + "epoch": 0.3396786962382903, + "grad_norm": 2.9265964031219482, + "learning_rate": 6.631727361313522e-05, + "loss": 0.3278707504272461, + "step": 79120 + }, + { + "epoch": 0.33972162832831027, + "grad_norm": 0.002426425227895379, + "learning_rate": 6.63129618930176e-05, + "loss": 0.17605364322662354, + "step": 79130 + }, + { + "epoch": 0.3397645604183303, + "grad_norm": 5.70521879196167, + "learning_rate": 6.630865017289998e-05, + "loss": 0.29195294380187986, + "step": 79140 + }, + { + "epoch": 0.3398074925083503, + "grad_norm": 0.0027679158374667168, + "learning_rate": 6.630433845278236e-05, + "loss": 0.2246103286743164, + "step": 79150 + }, + { + "epoch": 0.3398504245983703, + "grad_norm": 2.1907317638397217, + "learning_rate": 6.630002673266473e-05, + "loss": 0.24519672393798828, + "step": 79160 + }, + { + "epoch": 0.3398933566883903, + "grad_norm": 0.015526399947702885, + "learning_rate": 6.629571501254711e-05, + "loss": 0.048506084084510806, + "step": 79170 + }, + { + "epoch": 0.3399362887784103, + "grad_norm": 2.9595799446105957, + "learning_rate": 6.629140329242949e-05, + "loss": 0.3247170925140381, + "step": 79180 + }, + { + "epoch": 0.3399792208684303, + "grad_norm": 1.484932780265808, + "learning_rate": 6.628709157231186e-05, + "loss": 0.15729442834854127, + "step": 79190 + }, + { + "epoch": 0.34002215295845034, + "grad_norm": 0.011102179065346718, + "learning_rate": 6.628277985219423e-05, + "loss": 0.16122729778289796, + "step": 79200 + }, + { + "epoch": 0.34006508504847033, + "grad_norm": 0.05957547575235367, + "learning_rate": 6.62784681320766e-05, + "loss": 0.038811984658241275, + "step": 79210 + }, + { + "epoch": 0.3401080171384903, + "grad_norm": 0.033005524426698685, + "learning_rate": 6.627415641195898e-05, + "loss": 0.09726451635360718, + "step": 79220 + }, + { + "epoch": 0.34015094922851036, + "grad_norm": 0.0013659193646162748, + "learning_rate": 6.626984469184136e-05, + "loss": 0.2876324415206909, + "step": 79230 + }, + { + "epoch": 0.34019388131853034, + "grad_norm": 0.0034706296864897013, + "learning_rate": 6.626553297172374e-05, + "loss": 0.004077165573835373, + "step": 79240 + }, + { + "epoch": 0.34023681340855033, + "grad_norm": 3.6416494846343994, + "learning_rate": 6.626122125160611e-05, + "loss": 0.3369143486022949, + "step": 79250 + }, + { + "epoch": 0.3402797454985704, + "grad_norm": 0.0032283675391227007, + "learning_rate": 6.625690953148849e-05, + "loss": 0.21591832637786865, + "step": 79260 + }, + { + "epoch": 0.34032267758859036, + "grad_norm": 0.9636190533638, + "learning_rate": 6.625259781137087e-05, + "loss": 0.2059413194656372, + "step": 79270 + }, + { + "epoch": 0.34036560967861035, + "grad_norm": 5.577648639678955, + "learning_rate": 6.624828609125325e-05, + "loss": 0.23836958408355713, + "step": 79280 + }, + { + "epoch": 0.3404085417686304, + "grad_norm": 0.010573506355285645, + "learning_rate": 6.624397437113562e-05, + "loss": 0.19436086416244508, + "step": 79290 + }, + { + "epoch": 0.3404514738586504, + "grad_norm": 3.705470561981201, + "learning_rate": 6.6239662651018e-05, + "loss": 0.32446415424346925, + "step": 79300 + }, + { + "epoch": 0.3404944059486704, + "grad_norm": 0.40182867646217346, + "learning_rate": 6.623535093090038e-05, + "loss": 0.27088615894317625, + "step": 79310 + }, + { + "epoch": 0.3405373380386904, + "grad_norm": 1.5534511804580688, + "learning_rate": 6.623103921078276e-05, + "loss": 0.17054343223571777, + "step": 79320 + }, + { + "epoch": 0.3405802701287104, + "grad_norm": 0.03681186959147453, + "learning_rate": 6.622672749066513e-05, + "loss": 0.3183943271636963, + "step": 79330 + }, + { + "epoch": 0.34062320221873044, + "grad_norm": 0.05637132748961449, + "learning_rate": 6.622241577054751e-05, + "loss": 0.0830637276172638, + "step": 79340 + }, + { + "epoch": 0.3406661343087504, + "grad_norm": 0.01597636006772518, + "learning_rate": 6.621810405042989e-05, + "loss": 0.2368089437484741, + "step": 79350 + }, + { + "epoch": 0.3407090663987704, + "grad_norm": 0.07524899393320084, + "learning_rate": 6.621379233031227e-05, + "loss": 0.3124144792556763, + "step": 79360 + }, + { + "epoch": 0.34075199848879045, + "grad_norm": 0.017055392265319824, + "learning_rate": 6.620948061019463e-05, + "loss": 0.28198373317718506, + "step": 79370 + }, + { + "epoch": 0.34079493057881044, + "grad_norm": 0.9158375263214111, + "learning_rate": 6.620516889007701e-05, + "loss": 0.20923035144805907, + "step": 79380 + }, + { + "epoch": 0.3408378626688304, + "grad_norm": 0.0037546558305621147, + "learning_rate": 6.620085716995938e-05, + "loss": 0.0971619188785553, + "step": 79390 + }, + { + "epoch": 0.34088079475885047, + "grad_norm": 0.33954140543937683, + "learning_rate": 6.619654544984176e-05, + "loss": 0.2176882266998291, + "step": 79400 + }, + { + "epoch": 0.34092372684887046, + "grad_norm": 0.06124844774603844, + "learning_rate": 6.619223372972414e-05, + "loss": 0.27284195423126223, + "step": 79410 + }, + { + "epoch": 0.34096665893889044, + "grad_norm": 0.781387448310852, + "learning_rate": 6.618792200960652e-05, + "loss": 0.34500880241394044, + "step": 79420 + }, + { + "epoch": 0.3410095910289105, + "grad_norm": 0.05491228029131889, + "learning_rate": 6.61836102894889e-05, + "loss": 0.08464367985725403, + "step": 79430 + }, + { + "epoch": 0.3410525231189305, + "grad_norm": 0.39764919877052307, + "learning_rate": 6.617929856937127e-05, + "loss": 0.12523021697998046, + "step": 79440 + }, + { + "epoch": 0.34109545520895046, + "grad_norm": 0.017724506556987762, + "learning_rate": 6.617498684925363e-05, + "loss": 0.012008582800626754, + "step": 79450 + }, + { + "epoch": 0.3411383872989705, + "grad_norm": 0.5780832767486572, + "learning_rate": 6.617067512913601e-05, + "loss": 0.21086423397064208, + "step": 79460 + }, + { + "epoch": 0.3411813193889905, + "grad_norm": 0.03017611987888813, + "learning_rate": 6.616636340901839e-05, + "loss": 0.14146060943603517, + "step": 79470 + }, + { + "epoch": 0.3412242514790105, + "grad_norm": 0.0034709612373262644, + "learning_rate": 6.616205168890077e-05, + "loss": 0.22980284690856934, + "step": 79480 + }, + { + "epoch": 0.3412671835690305, + "grad_norm": 1.7348976135253906, + "learning_rate": 6.615773996878314e-05, + "loss": 0.273857569694519, + "step": 79490 + }, + { + "epoch": 0.3413101156590505, + "grad_norm": 0.12557612359523773, + "learning_rate": 6.615342824866552e-05, + "loss": 0.22032973766326905, + "step": 79500 + }, + { + "epoch": 0.3413530477490705, + "grad_norm": 0.018461521714925766, + "learning_rate": 6.61491165285479e-05, + "loss": 0.33548736572265625, + "step": 79510 + }, + { + "epoch": 0.34139597983909054, + "grad_norm": 0.0520981103181839, + "learning_rate": 6.614480480843029e-05, + "loss": 0.30472311973571775, + "step": 79520 + }, + { + "epoch": 0.3414389119291105, + "grad_norm": 0.8297507762908936, + "learning_rate": 6.614049308831265e-05, + "loss": 0.1717914819717407, + "step": 79530 + }, + { + "epoch": 0.34148184401913056, + "grad_norm": 0.06621135771274567, + "learning_rate": 6.613618136819503e-05, + "loss": 0.23071463108062745, + "step": 79540 + }, + { + "epoch": 0.34152477610915055, + "grad_norm": 0.3965785503387451, + "learning_rate": 6.613186964807741e-05, + "loss": 0.18059780597686767, + "step": 79550 + }, + { + "epoch": 0.34156770819917054, + "grad_norm": 0.024892650544643402, + "learning_rate": 6.612755792795979e-05, + "loss": 0.38873488903045655, + "step": 79560 + }, + { + "epoch": 0.3416106402891906, + "grad_norm": 0.01708587259054184, + "learning_rate": 6.612324620784216e-05, + "loss": 0.29357964992523194, + "step": 79570 + }, + { + "epoch": 0.34165357237921057, + "grad_norm": 8.060369491577148, + "learning_rate": 6.611893448772454e-05, + "loss": 0.0989808738231659, + "step": 79580 + }, + { + "epoch": 0.34169650446923056, + "grad_norm": 0.3194878101348877, + "learning_rate": 6.611462276760692e-05, + "loss": 0.2561430692672729, + "step": 79590 + }, + { + "epoch": 0.3417394365592506, + "grad_norm": 0.02276020310819149, + "learning_rate": 6.61103110474893e-05, + "loss": 0.3739812135696411, + "step": 79600 + }, + { + "epoch": 0.3417823686492706, + "grad_norm": 0.10554180294275284, + "learning_rate": 6.610599932737166e-05, + "loss": 0.22100362777709961, + "step": 79610 + }, + { + "epoch": 0.34182530073929057, + "grad_norm": 0.18519657850265503, + "learning_rate": 6.610168760725404e-05, + "loss": 0.12010836601257324, + "step": 79620 + }, + { + "epoch": 0.3418682328293106, + "grad_norm": 1.409715175628662, + "learning_rate": 6.609737588713641e-05, + "loss": 0.41119384765625, + "step": 79630 + }, + { + "epoch": 0.3419111649193306, + "grad_norm": 0.06727342307567596, + "learning_rate": 6.609306416701879e-05, + "loss": 0.12643046379089357, + "step": 79640 + }, + { + "epoch": 0.3419540970093506, + "grad_norm": 0.005273744929581881, + "learning_rate": 6.608875244690117e-05, + "loss": 0.002772852033376694, + "step": 79650 + }, + { + "epoch": 0.34199702909937063, + "grad_norm": 5.838228225708008, + "learning_rate": 6.608444072678355e-05, + "loss": 0.304068922996521, + "step": 79660 + }, + { + "epoch": 0.3420399611893906, + "grad_norm": 7.21917200088501, + "learning_rate": 6.608012900666592e-05, + "loss": 0.19127278327941893, + "step": 79670 + }, + { + "epoch": 0.3420828932794106, + "grad_norm": 0.8204609751701355, + "learning_rate": 6.60758172865483e-05, + "loss": 0.2872287750244141, + "step": 79680 + }, + { + "epoch": 0.34212582536943065, + "grad_norm": 0.00942230224609375, + "learning_rate": 6.607150556643068e-05, + "loss": 0.2193960189819336, + "step": 79690 + }, + { + "epoch": 0.34216875745945063, + "grad_norm": 0.606311559677124, + "learning_rate": 6.606719384631304e-05, + "loss": 0.25763740539550783, + "step": 79700 + }, + { + "epoch": 0.3422116895494706, + "grad_norm": 0.0011694321874529123, + "learning_rate": 6.606288212619542e-05, + "loss": 0.263020396232605, + "step": 79710 + }, + { + "epoch": 0.34225462163949066, + "grad_norm": 1.0757243633270264, + "learning_rate": 6.60585704060778e-05, + "loss": 0.26357810497283934, + "step": 79720 + }, + { + "epoch": 0.34229755372951065, + "grad_norm": 0.06414375454187393, + "learning_rate": 6.605425868596017e-05, + "loss": 0.12486532926559449, + "step": 79730 + }, + { + "epoch": 0.3423404858195307, + "grad_norm": 2.9872426986694336, + "learning_rate": 6.604994696584256e-05, + "loss": 0.30005874633789065, + "step": 79740 + }, + { + "epoch": 0.3423834179095507, + "grad_norm": 1.153652310371399, + "learning_rate": 6.604563524572494e-05, + "loss": 0.15315334796905516, + "step": 79750 + }, + { + "epoch": 0.34242634999957067, + "grad_norm": 4.178336143493652, + "learning_rate": 6.604132352560732e-05, + "loss": 0.17489372491836547, + "step": 79760 + }, + { + "epoch": 0.3424692820895907, + "grad_norm": 5.421525478363037, + "learning_rate": 6.60370118054897e-05, + "loss": 0.26713321208953855, + "step": 79770 + }, + { + "epoch": 0.3425122141796107, + "grad_norm": 14.969881057739258, + "learning_rate": 6.603270008537206e-05, + "loss": 0.09526968002319336, + "step": 79780 + }, + { + "epoch": 0.3425551462696307, + "grad_norm": 4.366916656494141, + "learning_rate": 6.602838836525444e-05, + "loss": 0.22230844497680663, + "step": 79790 + }, + { + "epoch": 0.3425980783596507, + "grad_norm": 0.004848931450396776, + "learning_rate": 6.602407664513681e-05, + "loss": 0.08430198431015015, + "step": 79800 + }, + { + "epoch": 0.3426410104496707, + "grad_norm": 0.4716125726699829, + "learning_rate": 6.601976492501919e-05, + "loss": 0.2586265325546265, + "step": 79810 + }, + { + "epoch": 0.3426839425396907, + "grad_norm": 0.0038550784811377525, + "learning_rate": 6.601545320490157e-05, + "loss": 0.12058546543121337, + "step": 79820 + }, + { + "epoch": 0.34272687462971074, + "grad_norm": 1.374908447265625, + "learning_rate": 6.601114148478395e-05, + "loss": 0.13479118347167968, + "step": 79830 + }, + { + "epoch": 0.34276980671973073, + "grad_norm": 0.004285231698304415, + "learning_rate": 6.600682976466632e-05, + "loss": 0.19934382438659667, + "step": 79840 + }, + { + "epoch": 0.3428127388097507, + "grad_norm": 0.00542460847645998, + "learning_rate": 6.60025180445487e-05, + "loss": 0.24936530590057374, + "step": 79850 + }, + { + "epoch": 0.34285567089977076, + "grad_norm": 0.011068413965404034, + "learning_rate": 6.599820632443107e-05, + "loss": 0.09880251884460449, + "step": 79860 + }, + { + "epoch": 0.34289860298979075, + "grad_norm": 0.10294083505868912, + "learning_rate": 6.599389460431344e-05, + "loss": 0.1930585980415344, + "step": 79870 + }, + { + "epoch": 0.34294153507981073, + "grad_norm": 0.4128507971763611, + "learning_rate": 6.598958288419582e-05, + "loss": 0.1906563401222229, + "step": 79880 + }, + { + "epoch": 0.3429844671698308, + "grad_norm": 2.6419787406921387, + "learning_rate": 6.59852711640782e-05, + "loss": 0.38313093185424807, + "step": 79890 + }, + { + "epoch": 0.34302739925985076, + "grad_norm": 2.718538761138916, + "learning_rate": 6.598095944396057e-05, + "loss": 0.2621057748794556, + "step": 79900 + }, + { + "epoch": 0.34307033134987075, + "grad_norm": 0.04285736754536629, + "learning_rate": 6.597664772384295e-05, + "loss": 0.12957130670547484, + "step": 79910 + }, + { + "epoch": 0.3431132634398908, + "grad_norm": 0.025662913918495178, + "learning_rate": 6.597233600372533e-05, + "loss": 0.3805588960647583, + "step": 79920 + }, + { + "epoch": 0.3431561955299108, + "grad_norm": 1.7533470392227173, + "learning_rate": 6.59680242836077e-05, + "loss": 0.25470623970031736, + "step": 79930 + }, + { + "epoch": 0.34319912761993077, + "grad_norm": 0.02234153263270855, + "learning_rate": 6.596371256349007e-05, + "loss": 0.23534765243530273, + "step": 79940 + }, + { + "epoch": 0.3432420597099508, + "grad_norm": 0.014367325231432915, + "learning_rate": 6.595940084337245e-05, + "loss": 0.17741996049880981, + "step": 79950 + }, + { + "epoch": 0.3432849917999708, + "grad_norm": 0.0687040314078331, + "learning_rate": 6.595508912325484e-05, + "loss": 0.3304955005645752, + "step": 79960 + }, + { + "epoch": 0.34332792388999084, + "grad_norm": 0.014946906827390194, + "learning_rate": 6.595077740313722e-05, + "loss": 0.1195443868637085, + "step": 79970 + }, + { + "epoch": 0.3433708559800108, + "grad_norm": 3.6829023361206055, + "learning_rate": 6.59464656830196e-05, + "loss": 0.18149445056915284, + "step": 79980 + }, + { + "epoch": 0.3434137880700308, + "grad_norm": 3.5131752490997314, + "learning_rate": 6.594215396290197e-05, + "loss": 0.3053800106048584, + "step": 79990 + }, + { + "epoch": 0.34345672016005085, + "grad_norm": 0.002581225708127022, + "learning_rate": 6.593784224278435e-05, + "loss": 0.05220865607261658, + "step": 80000 + }, + { + "epoch": 0.34345672016005085, + "eval_loss": 0.4368164837360382, + "eval_runtime": 27.1598, + "eval_samples_per_second": 3.682, + "eval_steps_per_second": 3.682, + "step": 80000 + }, + { + "epoch": 0.34349965225007084, + "grad_norm": 3.8440799713134766, + "learning_rate": 6.593353052266673e-05, + "loss": 0.2896945714950562, + "step": 80010 + }, + { + "epoch": 0.34354258434009083, + "grad_norm": 2.2781224250793457, + "learning_rate": 6.59292188025491e-05, + "loss": 0.15586977005004882, + "step": 80020 + }, + { + "epoch": 0.34358551643011087, + "grad_norm": 0.09280993789434433, + "learning_rate": 6.592490708243147e-05, + "loss": 0.16732913255691528, + "step": 80030 + }, + { + "epoch": 0.34362844852013086, + "grad_norm": 0.0976981520652771, + "learning_rate": 6.592059536231384e-05, + "loss": 0.5183558940887452, + "step": 80040 + }, + { + "epoch": 0.34367138061015085, + "grad_norm": 0.35272637009620667, + "learning_rate": 6.591628364219622e-05, + "loss": 0.2636963129043579, + "step": 80050 + }, + { + "epoch": 0.3437143127001709, + "grad_norm": 0.32257652282714844, + "learning_rate": 6.59119719220786e-05, + "loss": 0.07576382756233216, + "step": 80060 + }, + { + "epoch": 0.3437572447901909, + "grad_norm": 0.0030997225549072027, + "learning_rate": 6.590766020196098e-05, + "loss": 0.11930543184280396, + "step": 80070 + }, + { + "epoch": 0.34380017688021086, + "grad_norm": 0.3869069814682007, + "learning_rate": 6.590334848184335e-05, + "loss": 0.18810378313064574, + "step": 80080 + }, + { + "epoch": 0.3438431089702309, + "grad_norm": 1.9977630376815796, + "learning_rate": 6.589903676172573e-05, + "loss": 0.08379656076431274, + "step": 80090 + }, + { + "epoch": 0.3438860410602509, + "grad_norm": 0.01002898346632719, + "learning_rate": 6.589472504160811e-05, + "loss": 0.10857353210449219, + "step": 80100 + }, + { + "epoch": 0.3439289731502709, + "grad_norm": 6.863121509552002, + "learning_rate": 6.589041332149047e-05, + "loss": 0.2604074001312256, + "step": 80110 + }, + { + "epoch": 0.3439719052402909, + "grad_norm": 1.263120174407959, + "learning_rate": 6.588610160137285e-05, + "loss": 0.26923954486846924, + "step": 80120 + }, + { + "epoch": 0.3440148373303109, + "grad_norm": 0.08355128020048141, + "learning_rate": 6.588178988125523e-05, + "loss": 0.11677829027175904, + "step": 80130 + }, + { + "epoch": 0.3440577694203309, + "grad_norm": 0.001254008966498077, + "learning_rate": 6.58774781611376e-05, + "loss": 0.20639264583587646, + "step": 80140 + }, + { + "epoch": 0.34410070151035094, + "grad_norm": 0.11388286203145981, + "learning_rate": 6.587316644101998e-05, + "loss": 0.2153681516647339, + "step": 80150 + }, + { + "epoch": 0.3441436336003709, + "grad_norm": 1.5901732444763184, + "learning_rate": 6.586885472090236e-05, + "loss": 0.3067173004150391, + "step": 80160 + }, + { + "epoch": 0.34418656569039097, + "grad_norm": 1.3476285934448242, + "learning_rate": 6.586454300078474e-05, + "loss": 0.21053199768066405, + "step": 80170 + }, + { + "epoch": 0.34422949778041095, + "grad_norm": 0.006064875982701778, + "learning_rate": 6.586023128066711e-05, + "loss": 0.29959542751312257, + "step": 80180 + }, + { + "epoch": 0.34427242987043094, + "grad_norm": 0.016881000250577927, + "learning_rate": 6.585591956054949e-05, + "loss": 0.22309458255767822, + "step": 80190 + }, + { + "epoch": 0.344315361960451, + "grad_norm": 0.3344466984272003, + "learning_rate": 6.585160784043187e-05, + "loss": 0.23730158805847168, + "step": 80200 + }, + { + "epoch": 0.34435829405047097, + "grad_norm": 0.016367772594094276, + "learning_rate": 6.584729612031425e-05, + "loss": 0.06344168782234191, + "step": 80210 + }, + { + "epoch": 0.34440122614049096, + "grad_norm": 1.8823529481887817, + "learning_rate": 6.584298440019662e-05, + "loss": 0.31463663578033446, + "step": 80220 + }, + { + "epoch": 0.344444158230511, + "grad_norm": 0.4750448763370514, + "learning_rate": 6.5838672680079e-05, + "loss": 0.2528752326965332, + "step": 80230 + }, + { + "epoch": 0.344487090320531, + "grad_norm": 0.0015717342030256987, + "learning_rate": 6.583436095996138e-05, + "loss": 0.13928334712982177, + "step": 80240 + }, + { + "epoch": 0.344530022410551, + "grad_norm": 0.0036773881874978542, + "learning_rate": 6.583004923984375e-05, + "loss": 0.1504261612892151, + "step": 80250 + }, + { + "epoch": 0.344572954500571, + "grad_norm": 0.010615227743983269, + "learning_rate": 6.582573751972613e-05, + "loss": 0.21787447929382325, + "step": 80260 + }, + { + "epoch": 0.344615886590591, + "grad_norm": 0.06478608399629593, + "learning_rate": 6.58214257996085e-05, + "loss": 0.44989643096923826, + "step": 80270 + }, + { + "epoch": 0.344658818680611, + "grad_norm": 0.025874905288219452, + "learning_rate": 6.581711407949087e-05, + "loss": 0.101692795753479, + "step": 80280 + }, + { + "epoch": 0.34470175077063103, + "grad_norm": 0.005986157804727554, + "learning_rate": 6.581280235937325e-05, + "loss": 0.2578385829925537, + "step": 80290 + }, + { + "epoch": 0.344744682860651, + "grad_norm": 0.009471284225583076, + "learning_rate": 6.580849063925563e-05, + "loss": 0.2216710329055786, + "step": 80300 + }, + { + "epoch": 0.344787614950671, + "grad_norm": 4.648142337799072, + "learning_rate": 6.5804178919138e-05, + "loss": 0.3399980545043945, + "step": 80310 + }, + { + "epoch": 0.34483054704069105, + "grad_norm": 0.1861869841814041, + "learning_rate": 6.579986719902038e-05, + "loss": 0.2353301763534546, + "step": 80320 + }, + { + "epoch": 0.34487347913071104, + "grad_norm": 1.3739330768585205, + "learning_rate": 6.579555547890276e-05, + "loss": 0.13581985235214233, + "step": 80330 + }, + { + "epoch": 0.344916411220731, + "grad_norm": 0.8927357196807861, + "learning_rate": 6.579124375878514e-05, + "loss": 0.4658341884613037, + "step": 80340 + }, + { + "epoch": 0.34495934331075107, + "grad_norm": 0.46321922540664673, + "learning_rate": 6.57869320386675e-05, + "loss": 0.11834501028060913, + "step": 80350 + }, + { + "epoch": 0.34500227540077105, + "grad_norm": 1.620883822441101, + "learning_rate": 6.578262031854988e-05, + "loss": 0.38476948738098143, + "step": 80360 + }, + { + "epoch": 0.34504520749079104, + "grad_norm": 0.024614451453089714, + "learning_rate": 6.577830859843226e-05, + "loss": 0.1613122820854187, + "step": 80370 + }, + { + "epoch": 0.3450881395808111, + "grad_norm": 0.20275190472602844, + "learning_rate": 6.577399687831463e-05, + "loss": 0.21819941997528075, + "step": 80380 + }, + { + "epoch": 0.34513107167083107, + "grad_norm": 0.005242771469056606, + "learning_rate": 6.576968515819701e-05, + "loss": 0.04298856854438782, + "step": 80390 + }, + { + "epoch": 0.3451740037608511, + "grad_norm": 0.0025338917039334774, + "learning_rate": 6.576537343807939e-05, + "loss": 0.004497027024626732, + "step": 80400 + }, + { + "epoch": 0.3452169358508711, + "grad_norm": 1.8911714553833008, + "learning_rate": 6.576106171796176e-05, + "loss": 0.39356627464294436, + "step": 80410 + }, + { + "epoch": 0.3452598679408911, + "grad_norm": 0.009503847919404507, + "learning_rate": 6.575674999784414e-05, + "loss": 0.15246884822845458, + "step": 80420 + }, + { + "epoch": 0.34530280003091113, + "grad_norm": 0.18555688858032227, + "learning_rate": 6.575243827772652e-05, + "loss": 0.0459955096244812, + "step": 80430 + }, + { + "epoch": 0.3453457321209311, + "grad_norm": 2.2830750942230225, + "learning_rate": 6.57481265576089e-05, + "loss": 0.11132620573043824, + "step": 80440 + }, + { + "epoch": 0.3453886642109511, + "grad_norm": 0.4190322756767273, + "learning_rate": 6.574381483749127e-05, + "loss": 0.21302645206451415, + "step": 80450 + }, + { + "epoch": 0.34543159630097114, + "grad_norm": 0.005154999904334545, + "learning_rate": 6.573950311737365e-05, + "loss": 0.3834122657775879, + "step": 80460 + }, + { + "epoch": 0.34547452839099113, + "grad_norm": 0.5437856316566467, + "learning_rate": 6.573519139725603e-05, + "loss": 0.1995234966278076, + "step": 80470 + }, + { + "epoch": 0.3455174604810111, + "grad_norm": 0.0034222968388348818, + "learning_rate": 6.57308796771384e-05, + "loss": 0.2423619508743286, + "step": 80480 + }, + { + "epoch": 0.34556039257103116, + "grad_norm": 0.626228392124176, + "learning_rate": 6.572656795702078e-05, + "loss": 0.059421378374099734, + "step": 80490 + }, + { + "epoch": 0.34560332466105115, + "grad_norm": 10.068717956542969, + "learning_rate": 6.572225623690316e-05, + "loss": 0.3919938087463379, + "step": 80500 + }, + { + "epoch": 0.34564625675107113, + "grad_norm": 5.153544902801514, + "learning_rate": 6.571794451678554e-05, + "loss": 0.23537764549255372, + "step": 80510 + }, + { + "epoch": 0.3456891888410912, + "grad_norm": 0.004742398392409086, + "learning_rate": 6.57136327966679e-05, + "loss": 0.06173415184020996, + "step": 80520 + }, + { + "epoch": 0.34573212093111116, + "grad_norm": 0.45885029435157776, + "learning_rate": 6.570932107655028e-05, + "loss": 0.24032649993896485, + "step": 80530 + }, + { + "epoch": 0.34577505302113115, + "grad_norm": 0.08064588904380798, + "learning_rate": 6.570500935643266e-05, + "loss": 0.2223383903503418, + "step": 80540 + }, + { + "epoch": 0.3458179851111512, + "grad_norm": 0.685327410697937, + "learning_rate": 6.570069763631503e-05, + "loss": 0.16256893873214723, + "step": 80550 + }, + { + "epoch": 0.3458609172011712, + "grad_norm": 0.257320374250412, + "learning_rate": 6.569638591619741e-05, + "loss": 0.37799150943756105, + "step": 80560 + }, + { + "epoch": 0.34590384929119117, + "grad_norm": 0.09647884964942932, + "learning_rate": 6.569207419607979e-05, + "loss": 0.006505146622657776, + "step": 80570 + }, + { + "epoch": 0.3459467813812112, + "grad_norm": 0.12064115703105927, + "learning_rate": 6.568776247596217e-05, + "loss": 0.11497091054916382, + "step": 80580 + }, + { + "epoch": 0.3459897134712312, + "grad_norm": 0.052829205989837646, + "learning_rate": 6.568345075584454e-05, + "loss": 0.1395114541053772, + "step": 80590 + }, + { + "epoch": 0.34603264556125124, + "grad_norm": 1.4082953929901123, + "learning_rate": 6.567913903572691e-05, + "loss": 0.6587924003601074, + "step": 80600 + }, + { + "epoch": 0.3460755776512712, + "grad_norm": 2.774576187133789, + "learning_rate": 6.567482731560928e-05, + "loss": 0.18932981491088868, + "step": 80610 + }, + { + "epoch": 0.3461185097412912, + "grad_norm": 0.4535244107246399, + "learning_rate": 6.567051559549166e-05, + "loss": 0.1731483221054077, + "step": 80620 + }, + { + "epoch": 0.34616144183131126, + "grad_norm": 1.3304637670516968, + "learning_rate": 6.566620387537404e-05, + "loss": 0.4254453659057617, + "step": 80630 + }, + { + "epoch": 0.34620437392133124, + "grad_norm": 0.028769170865416527, + "learning_rate": 6.566189215525642e-05, + "loss": 0.2866819381713867, + "step": 80640 + }, + { + "epoch": 0.34624730601135123, + "grad_norm": 0.17192727327346802, + "learning_rate": 6.56575804351388e-05, + "loss": 0.22045984268188476, + "step": 80650 + }, + { + "epoch": 0.3462902381013713, + "grad_norm": 39.970855712890625, + "learning_rate": 6.565326871502117e-05, + "loss": 0.17609299421310426, + "step": 80660 + }, + { + "epoch": 0.34633317019139126, + "grad_norm": 0.0026669171638786793, + "learning_rate": 6.564895699490355e-05, + "loss": 0.1487862229347229, + "step": 80670 + }, + { + "epoch": 0.34637610228141125, + "grad_norm": 0.017173703759908676, + "learning_rate": 6.564464527478593e-05, + "loss": 0.1477797031402588, + "step": 80680 + }, + { + "epoch": 0.3464190343714313, + "grad_norm": 0.01858958974480629, + "learning_rate": 6.56403335546683e-05, + "loss": 0.40072221755981446, + "step": 80690 + }, + { + "epoch": 0.3464619664614513, + "grad_norm": 0.7842692136764526, + "learning_rate": 6.563602183455068e-05, + "loss": 0.268584680557251, + "step": 80700 + }, + { + "epoch": 0.34650489855147126, + "grad_norm": 0.08671054989099503, + "learning_rate": 6.563171011443306e-05, + "loss": 0.1292457699775696, + "step": 80710 + }, + { + "epoch": 0.3465478306414913, + "grad_norm": 2.038288116455078, + "learning_rate": 6.562739839431544e-05, + "loss": 0.15267176628112794, + "step": 80720 + }, + { + "epoch": 0.3465907627315113, + "grad_norm": 11.327775955200195, + "learning_rate": 6.562308667419781e-05, + "loss": 0.16896125078201293, + "step": 80730 + }, + { + "epoch": 0.3466336948215313, + "grad_norm": 4.022800445556641, + "learning_rate": 6.561877495408019e-05, + "loss": 0.1759350538253784, + "step": 80740 + }, + { + "epoch": 0.3466766269115513, + "grad_norm": 0.009791653603315353, + "learning_rate": 6.561446323396257e-05, + "loss": 0.20523180961608886, + "step": 80750 + }, + { + "epoch": 0.3467195590015713, + "grad_norm": 2.14821195602417, + "learning_rate": 6.561015151384494e-05, + "loss": 0.2979742527008057, + "step": 80760 + }, + { + "epoch": 0.3467624910915913, + "grad_norm": 0.2958136796951294, + "learning_rate": 6.560583979372731e-05, + "loss": 0.15679970979690552, + "step": 80770 + }, + { + "epoch": 0.34680542318161134, + "grad_norm": 0.14770112931728363, + "learning_rate": 6.560152807360969e-05, + "loss": 0.13254650831222534, + "step": 80780 + }, + { + "epoch": 0.3468483552716313, + "grad_norm": 8.195019721984863, + "learning_rate": 6.559721635349206e-05, + "loss": 0.2113429069519043, + "step": 80790 + }, + { + "epoch": 0.3468912873616513, + "grad_norm": 0.05892874300479889, + "learning_rate": 6.559290463337444e-05, + "loss": 0.27900118827819825, + "step": 80800 + }, + { + "epoch": 0.34693421945167136, + "grad_norm": 1.09965181350708, + "learning_rate": 6.558859291325682e-05, + "loss": 0.22384970188140868, + "step": 80810 + }, + { + "epoch": 0.34697715154169134, + "grad_norm": 0.1914658546447754, + "learning_rate": 6.55842811931392e-05, + "loss": 0.28952152729034425, + "step": 80820 + }, + { + "epoch": 0.3470200836317114, + "grad_norm": 0.04727429524064064, + "learning_rate": 6.557996947302157e-05, + "loss": 0.2773630380630493, + "step": 80830 + }, + { + "epoch": 0.34706301572173137, + "grad_norm": 0.033436909317970276, + "learning_rate": 6.557565775290395e-05, + "loss": 0.06806041002273559, + "step": 80840 + }, + { + "epoch": 0.34710594781175136, + "grad_norm": 0.021389199420809746, + "learning_rate": 6.557134603278631e-05, + "loss": 0.14047728776931762, + "step": 80850 + }, + { + "epoch": 0.3471488799017714, + "grad_norm": 1.8197565078735352, + "learning_rate": 6.556703431266869e-05, + "loss": 0.2777894973754883, + "step": 80860 + }, + { + "epoch": 0.3471918119917914, + "grad_norm": 1.6493991613388062, + "learning_rate": 6.556272259255107e-05, + "loss": 0.19073477983474732, + "step": 80870 + }, + { + "epoch": 0.3472347440818114, + "grad_norm": 1.2035952806472778, + "learning_rate": 6.555841087243345e-05, + "loss": 0.3294836044311523, + "step": 80880 + }, + { + "epoch": 0.3472776761718314, + "grad_norm": 0.013203203678131104, + "learning_rate": 6.555409915231582e-05, + "loss": 0.21418993473052977, + "step": 80890 + }, + { + "epoch": 0.3473206082618514, + "grad_norm": 0.02511315606534481, + "learning_rate": 6.55497874321982e-05, + "loss": 0.0967091977596283, + "step": 80900 + }, + { + "epoch": 0.3473635403518714, + "grad_norm": 0.8609216213226318, + "learning_rate": 6.554547571208058e-05, + "loss": 0.3208725929260254, + "step": 80910 + }, + { + "epoch": 0.34740647244189143, + "grad_norm": 0.01230608019977808, + "learning_rate": 6.554116399196296e-05, + "loss": 0.15385348796844484, + "step": 80920 + }, + { + "epoch": 0.3474494045319114, + "grad_norm": 4.483237266540527, + "learning_rate": 6.553685227184533e-05, + "loss": 0.25249333381652833, + "step": 80930 + }, + { + "epoch": 0.3474923366219314, + "grad_norm": 1.1705982685089111, + "learning_rate": 6.553254055172771e-05, + "loss": 0.4617762088775635, + "step": 80940 + }, + { + "epoch": 0.34753526871195145, + "grad_norm": 2.1686246395111084, + "learning_rate": 6.552822883161009e-05, + "loss": 0.22434430122375487, + "step": 80950 + }, + { + "epoch": 0.34757820080197144, + "grad_norm": 4.323497295379639, + "learning_rate": 6.552391711149246e-05, + "loss": 0.29828619956970215, + "step": 80960 + }, + { + "epoch": 0.3476211328919914, + "grad_norm": 0.18413817882537842, + "learning_rate": 6.551960539137484e-05, + "loss": 0.4002058029174805, + "step": 80970 + }, + { + "epoch": 0.34766406498201147, + "grad_norm": 1.3192716836929321, + "learning_rate": 6.551529367125722e-05, + "loss": 0.077541983127594, + "step": 80980 + }, + { + "epoch": 0.34770699707203145, + "grad_norm": 0.3532804846763611, + "learning_rate": 6.55109819511396e-05, + "loss": 0.0771310031414032, + "step": 80990 + }, + { + "epoch": 0.34774992916205144, + "grad_norm": 0.008650779724121094, + "learning_rate": 6.550667023102197e-05, + "loss": 0.3339291334152222, + "step": 81000 + }, + { + "epoch": 0.34774992916205144, + "eval_loss": 0.42357951402664185, + "eval_runtime": 27.1096, + "eval_samples_per_second": 3.689, + "eval_steps_per_second": 3.689, + "step": 81000 + }, + { + "epoch": 0.3477928612520715, + "grad_norm": 34.085533142089844, + "learning_rate": 6.550235851090434e-05, + "loss": 0.08599871993064881, + "step": 81010 + }, + { + "epoch": 0.34783579334209147, + "grad_norm": 0.00876838993281126, + "learning_rate": 6.549804679078671e-05, + "loss": 0.2552550554275513, + "step": 81020 + }, + { + "epoch": 0.3478787254321115, + "grad_norm": 0.2916661500930786, + "learning_rate": 6.549373507066909e-05, + "loss": 0.20430076122283936, + "step": 81030 + }, + { + "epoch": 0.3479216575221315, + "grad_norm": 0.004903197754174471, + "learning_rate": 6.548942335055147e-05, + "loss": 0.16290855407714844, + "step": 81040 + }, + { + "epoch": 0.3479645896121515, + "grad_norm": 0.08551827818155289, + "learning_rate": 6.548511163043385e-05, + "loss": 0.27665128707885744, + "step": 81050 + }, + { + "epoch": 0.34800752170217153, + "grad_norm": 5.609275817871094, + "learning_rate": 6.548079991031622e-05, + "loss": 0.26043925285339353, + "step": 81060 + }, + { + "epoch": 0.3480504537921915, + "grad_norm": 0.0011216587154194713, + "learning_rate": 6.54764881901986e-05, + "loss": 0.2311476230621338, + "step": 81070 + }, + { + "epoch": 0.3480933858822115, + "grad_norm": 0.4421592056751251, + "learning_rate": 6.547217647008098e-05, + "loss": 0.11889767646789551, + "step": 81080 + }, + { + "epoch": 0.34813631797223155, + "grad_norm": 0.0412554107606411, + "learning_rate": 6.546786474996336e-05, + "loss": 0.17613816261291504, + "step": 81090 + }, + { + "epoch": 0.34817925006225153, + "grad_norm": 0.13568167388439178, + "learning_rate": 6.546355302984572e-05, + "loss": 0.25402672290802003, + "step": 81100 + }, + { + "epoch": 0.3482221821522715, + "grad_norm": 4.991754531860352, + "learning_rate": 6.54592413097281e-05, + "loss": 0.34817531108856203, + "step": 81110 + }, + { + "epoch": 0.34826511424229156, + "grad_norm": 3.6499862670898438, + "learning_rate": 6.545492958961047e-05, + "loss": 0.23540241718292237, + "step": 81120 + }, + { + "epoch": 0.34830804633231155, + "grad_norm": 1.1497873067855835, + "learning_rate": 6.545061786949285e-05, + "loss": 0.30424203872680666, + "step": 81130 + }, + { + "epoch": 0.34835097842233154, + "grad_norm": 0.029126333072781563, + "learning_rate": 6.544630614937523e-05, + "loss": 0.21944479942321776, + "step": 81140 + }, + { + "epoch": 0.3483939105123516, + "grad_norm": 0.658740758895874, + "learning_rate": 6.544199442925762e-05, + "loss": 0.3712824583053589, + "step": 81150 + }, + { + "epoch": 0.34843684260237157, + "grad_norm": 0.039526522159576416, + "learning_rate": 6.543768270914e-05, + "loss": 0.1300313353538513, + "step": 81160 + }, + { + "epoch": 0.34847977469239155, + "grad_norm": 1.5271961688995361, + "learning_rate": 6.543337098902238e-05, + "loss": 0.1440997838973999, + "step": 81170 + }, + { + "epoch": 0.3485227067824116, + "grad_norm": 0.008994216099381447, + "learning_rate": 6.542905926890474e-05, + "loss": 0.24037206172943115, + "step": 81180 + }, + { + "epoch": 0.3485656388724316, + "grad_norm": 0.9555651545524597, + "learning_rate": 6.542474754878712e-05, + "loss": 0.21935515403747557, + "step": 81190 + }, + { + "epoch": 0.34860857096245157, + "grad_norm": 0.02722945250570774, + "learning_rate": 6.54204358286695e-05, + "loss": 0.18047159910202026, + "step": 81200 + }, + { + "epoch": 0.3486515030524716, + "grad_norm": 3.5013840198516846, + "learning_rate": 6.541612410855187e-05, + "loss": 0.3040013790130615, + "step": 81210 + }, + { + "epoch": 0.3486944351424916, + "grad_norm": 0.0028997049666941166, + "learning_rate": 6.541181238843425e-05, + "loss": 0.19227596521377563, + "step": 81220 + }, + { + "epoch": 0.3487373672325116, + "grad_norm": 0.13611142337322235, + "learning_rate": 6.540750066831663e-05, + "loss": 0.14200282096862793, + "step": 81230 + }, + { + "epoch": 0.34878029932253163, + "grad_norm": 2.3464372158050537, + "learning_rate": 6.5403188948199e-05, + "loss": 0.36781165599822996, + "step": 81240 + }, + { + "epoch": 0.3488232314125516, + "grad_norm": 0.003250357462093234, + "learning_rate": 6.539887722808138e-05, + "loss": 0.28147766590118406, + "step": 81250 + }, + { + "epoch": 0.34886616350257166, + "grad_norm": 0.10627375543117523, + "learning_rate": 6.539456550796374e-05, + "loss": 0.15539608001708985, + "step": 81260 + }, + { + "epoch": 0.34890909559259164, + "grad_norm": 0.1069926768541336, + "learning_rate": 6.539025378784612e-05, + "loss": 0.07610588073730469, + "step": 81270 + }, + { + "epoch": 0.34895202768261163, + "grad_norm": 13.996943473815918, + "learning_rate": 6.53859420677285e-05, + "loss": 0.3478125333786011, + "step": 81280 + }, + { + "epoch": 0.3489949597726317, + "grad_norm": 36.15385437011719, + "learning_rate": 6.538163034761088e-05, + "loss": 0.07145402431488038, + "step": 81290 + }, + { + "epoch": 0.34903789186265166, + "grad_norm": 11.668716430664062, + "learning_rate": 6.537731862749325e-05, + "loss": 0.06767347455024719, + "step": 81300 + }, + { + "epoch": 0.34908082395267165, + "grad_norm": 2.9824609756469727, + "learning_rate": 6.537300690737563e-05, + "loss": 0.1598757028579712, + "step": 81310 + }, + { + "epoch": 0.3491237560426917, + "grad_norm": 0.33760035037994385, + "learning_rate": 6.536869518725801e-05, + "loss": 0.4008010387420654, + "step": 81320 + }, + { + "epoch": 0.3491666881327117, + "grad_norm": 0.5066002607345581, + "learning_rate": 6.536438346714039e-05, + "loss": 0.22687935829162598, + "step": 81330 + }, + { + "epoch": 0.34920962022273166, + "grad_norm": 0.28217196464538574, + "learning_rate": 6.536007174702275e-05, + "loss": 0.1519029140472412, + "step": 81340 + }, + { + "epoch": 0.3492525523127517, + "grad_norm": 0.15852105617523193, + "learning_rate": 6.535576002690513e-05, + "loss": 0.19549806118011476, + "step": 81350 + }, + { + "epoch": 0.3492954844027717, + "grad_norm": 0.8499207496643066, + "learning_rate": 6.53514483067875e-05, + "loss": 0.10695688724517823, + "step": 81360 + }, + { + "epoch": 0.3493384164927917, + "grad_norm": 0.019876671954989433, + "learning_rate": 6.53471365866699e-05, + "loss": 0.15644168853759766, + "step": 81370 + }, + { + "epoch": 0.3493813485828117, + "grad_norm": 3.817972183227539, + "learning_rate": 6.534282486655227e-05, + "loss": 0.19872108697891236, + "step": 81380 + }, + { + "epoch": 0.3494242806728317, + "grad_norm": 1.0998066663742065, + "learning_rate": 6.533851314643465e-05, + "loss": 0.31540355682373045, + "step": 81390 + }, + { + "epoch": 0.3494672127628517, + "grad_norm": 0.6520820260047913, + "learning_rate": 6.533420142631703e-05, + "loss": 0.3238629579544067, + "step": 81400 + }, + { + "epoch": 0.34951014485287174, + "grad_norm": 1.3481556177139282, + "learning_rate": 6.53298897061994e-05, + "loss": 0.2559787750244141, + "step": 81410 + }, + { + "epoch": 0.3495530769428917, + "grad_norm": 8.32856273651123, + "learning_rate": 6.532557798608177e-05, + "loss": 0.40641307830810547, + "step": 81420 + }, + { + "epoch": 0.3495960090329117, + "grad_norm": 3.496859550476074, + "learning_rate": 6.532126626596415e-05, + "loss": 0.07484019994735717, + "step": 81430 + }, + { + "epoch": 0.34963894112293176, + "grad_norm": 2.567354679107666, + "learning_rate": 6.531695454584652e-05, + "loss": 0.20389926433563232, + "step": 81440 + }, + { + "epoch": 0.34968187321295174, + "grad_norm": 7.194469451904297, + "learning_rate": 6.53126428257289e-05, + "loss": 0.20474486351013182, + "step": 81450 + }, + { + "epoch": 0.3497248053029718, + "grad_norm": 1.0319026708602905, + "learning_rate": 6.530833110561128e-05, + "loss": 0.2735546112060547, + "step": 81460 + }, + { + "epoch": 0.3497677373929918, + "grad_norm": 0.02630757726728916, + "learning_rate": 6.530401938549365e-05, + "loss": 0.21943216323852538, + "step": 81470 + }, + { + "epoch": 0.34981066948301176, + "grad_norm": 0.6678802371025085, + "learning_rate": 6.529970766537603e-05, + "loss": 0.054949283599853516, + "step": 81480 + }, + { + "epoch": 0.3498536015730318, + "grad_norm": 0.010042681358754635, + "learning_rate": 6.529539594525841e-05, + "loss": 0.15706361532211305, + "step": 81490 + }, + { + "epoch": 0.3498965336630518, + "grad_norm": 3.5283749103546143, + "learning_rate": 6.529108422514079e-05, + "loss": 0.46043691635131834, + "step": 81500 + }, + { + "epoch": 0.3499394657530718, + "grad_norm": 0.2358131855726242, + "learning_rate": 6.528677250502315e-05, + "loss": 0.3449904680252075, + "step": 81510 + }, + { + "epoch": 0.3499823978430918, + "grad_norm": 1.6699026823043823, + "learning_rate": 6.528246078490553e-05, + "loss": 0.3223626375198364, + "step": 81520 + }, + { + "epoch": 0.3500253299331118, + "grad_norm": 0.12843438982963562, + "learning_rate": 6.52781490647879e-05, + "loss": 0.13025667667388915, + "step": 81530 + }, + { + "epoch": 0.3500682620231318, + "grad_norm": 2.23526930809021, + "learning_rate": 6.527383734467028e-05, + "loss": 0.1384149193763733, + "step": 81540 + }, + { + "epoch": 0.35011119411315184, + "grad_norm": 0.0064170872792601585, + "learning_rate": 6.526952562455266e-05, + "loss": 0.15591152906417846, + "step": 81550 + }, + { + "epoch": 0.3501541262031718, + "grad_norm": 0.08279679715633392, + "learning_rate": 6.526521390443504e-05, + "loss": 0.18339295387268068, + "step": 81560 + }, + { + "epoch": 0.3501970582931918, + "grad_norm": 1.4734292030334473, + "learning_rate": 6.526090218431741e-05, + "loss": 0.14943455457687377, + "step": 81570 + }, + { + "epoch": 0.35023999038321185, + "grad_norm": 0.010176564566791058, + "learning_rate": 6.525659046419979e-05, + "loss": 0.21766793727874756, + "step": 81580 + }, + { + "epoch": 0.35028292247323184, + "grad_norm": 0.0007287487387657166, + "learning_rate": 6.525227874408217e-05, + "loss": 0.12911440134048463, + "step": 81590 + }, + { + "epoch": 0.3503258545632518, + "grad_norm": 0.24612395465373993, + "learning_rate": 6.524796702396455e-05, + "loss": 0.123654305934906, + "step": 81600 + }, + { + "epoch": 0.35036878665327187, + "grad_norm": 0.07536415010690689, + "learning_rate": 6.524365530384692e-05, + "loss": 0.015879042446613312, + "step": 81610 + }, + { + "epoch": 0.35041171874329186, + "grad_norm": 0.16817022860050201, + "learning_rate": 6.52393435837293e-05, + "loss": 0.35819756984710693, + "step": 81620 + }, + { + "epoch": 0.35045465083331184, + "grad_norm": 0.053329501301050186, + "learning_rate": 6.523503186361168e-05, + "loss": 0.2505997657775879, + "step": 81630 + }, + { + "epoch": 0.3504975829233319, + "grad_norm": 0.20820234715938568, + "learning_rate": 6.523072014349406e-05, + "loss": 0.16442860364913942, + "step": 81640 + }, + { + "epoch": 0.35054051501335187, + "grad_norm": 2.309272289276123, + "learning_rate": 6.522640842337643e-05, + "loss": 0.25239014625549316, + "step": 81650 + }, + { + "epoch": 0.35058344710337186, + "grad_norm": 3.8517117500305176, + "learning_rate": 6.522209670325881e-05, + "loss": 0.21353816986083984, + "step": 81660 + }, + { + "epoch": 0.3506263791933919, + "grad_norm": 0.11886537075042725, + "learning_rate": 6.521778498314117e-05, + "loss": 0.26951894760131834, + "step": 81670 + }, + { + "epoch": 0.3506693112834119, + "grad_norm": 4.493960857391357, + "learning_rate": 6.521347326302355e-05, + "loss": 0.17917612791061402, + "step": 81680 + }, + { + "epoch": 0.35071224337343193, + "grad_norm": 0.054700762033462524, + "learning_rate": 6.520916154290593e-05, + "loss": 0.2787285566329956, + "step": 81690 + }, + { + "epoch": 0.3507551754634519, + "grad_norm": 6.3242058753967285, + "learning_rate": 6.52048498227883e-05, + "loss": 0.2191849946975708, + "step": 81700 + }, + { + "epoch": 0.3507981075534719, + "grad_norm": 5.571452617645264, + "learning_rate": 6.520053810267068e-05, + "loss": 0.3427495241165161, + "step": 81710 + }, + { + "epoch": 0.35084103964349195, + "grad_norm": 0.028819601982831955, + "learning_rate": 6.519622638255306e-05, + "loss": 0.07852021455764771, + "step": 81720 + }, + { + "epoch": 0.35088397173351193, + "grad_norm": 0.002837139181792736, + "learning_rate": 6.519191466243544e-05, + "loss": 0.17152912616729737, + "step": 81730 + }, + { + "epoch": 0.3509269038235319, + "grad_norm": 0.9089310765266418, + "learning_rate": 6.518760294231782e-05, + "loss": 0.2090291976928711, + "step": 81740 + }, + { + "epoch": 0.35096983591355196, + "grad_norm": 0.0009550207760185003, + "learning_rate": 6.518329122220018e-05, + "loss": 0.2496708393096924, + "step": 81750 + }, + { + "epoch": 0.35101276800357195, + "grad_norm": 0.0026640784926712513, + "learning_rate": 6.517897950208256e-05, + "loss": 0.4584530830383301, + "step": 81760 + }, + { + "epoch": 0.35105570009359194, + "grad_norm": 0.010816243477165699, + "learning_rate": 6.517466778196493e-05, + "loss": 0.2485302448272705, + "step": 81770 + }, + { + "epoch": 0.351098632183612, + "grad_norm": 0.004385428968816996, + "learning_rate": 6.517035606184731e-05, + "loss": 0.13215600252151488, + "step": 81780 + }, + { + "epoch": 0.35114156427363197, + "grad_norm": 0.21605446934700012, + "learning_rate": 6.516604434172969e-05, + "loss": 0.255491304397583, + "step": 81790 + }, + { + "epoch": 0.35118449636365195, + "grad_norm": 0.6936253905296326, + "learning_rate": 6.516173262161207e-05, + "loss": 0.1723204016685486, + "step": 81800 + }, + { + "epoch": 0.351227428453672, + "grad_norm": 0.04460751265287399, + "learning_rate": 6.515742090149444e-05, + "loss": 0.13337093591690063, + "step": 81810 + }, + { + "epoch": 0.351270360543692, + "grad_norm": 0.7156800031661987, + "learning_rate": 6.515310918137682e-05, + "loss": 0.11102476119995117, + "step": 81820 + }, + { + "epoch": 0.35131329263371197, + "grad_norm": 5.608897686004639, + "learning_rate": 6.51487974612592e-05, + "loss": 0.33392949104309083, + "step": 81830 + }, + { + "epoch": 0.351356224723732, + "grad_norm": 0.050913892686367035, + "learning_rate": 6.514448574114158e-05, + "loss": 0.23823533058166504, + "step": 81840 + }, + { + "epoch": 0.351399156813752, + "grad_norm": 0.43708866834640503, + "learning_rate": 6.514017402102395e-05, + "loss": 0.23856070041656494, + "step": 81850 + }, + { + "epoch": 0.351442088903772, + "grad_norm": 0.10660059750080109, + "learning_rate": 6.513586230090633e-05, + "loss": 0.28337559700012205, + "step": 81860 + }, + { + "epoch": 0.35148502099379203, + "grad_norm": 0.298322468996048, + "learning_rate": 6.513155058078871e-05, + "loss": 0.28602511882781984, + "step": 81870 + }, + { + "epoch": 0.351527953083812, + "grad_norm": 0.006606437731534243, + "learning_rate": 6.512723886067109e-05, + "loss": 0.1852823853492737, + "step": 81880 + }, + { + "epoch": 0.35157088517383206, + "grad_norm": 8.826769828796387, + "learning_rate": 6.512292714055346e-05, + "loss": 0.48439674377441405, + "step": 81890 + }, + { + "epoch": 0.35161381726385205, + "grad_norm": 0.0020860633812844753, + "learning_rate": 6.511861542043584e-05, + "loss": 0.015766726434230806, + "step": 81900 + }, + { + "epoch": 0.35165674935387203, + "grad_norm": 3.8179450035095215, + "learning_rate": 6.511430370031822e-05, + "loss": 0.41085362434387207, + "step": 81910 + }, + { + "epoch": 0.3516996814438921, + "grad_norm": 0.3400474786758423, + "learning_rate": 6.510999198020058e-05, + "loss": 0.13819591999053954, + "step": 81920 + }, + { + "epoch": 0.35174261353391206, + "grad_norm": 2.31620192527771, + "learning_rate": 6.510568026008296e-05, + "loss": 0.29909653663635255, + "step": 81930 + }, + { + "epoch": 0.35178554562393205, + "grad_norm": 1.8688679933547974, + "learning_rate": 6.510136853996534e-05, + "loss": 0.1573151469230652, + "step": 81940 + }, + { + "epoch": 0.3518284777139521, + "grad_norm": 0.1233760342001915, + "learning_rate": 6.509705681984771e-05, + "loss": 0.2249547004699707, + "step": 81950 + }, + { + "epoch": 0.3518714098039721, + "grad_norm": 1.5025233030319214, + "learning_rate": 6.509274509973009e-05, + "loss": 0.09836132526397705, + "step": 81960 + }, + { + "epoch": 0.35191434189399207, + "grad_norm": 21.100440979003906, + "learning_rate": 6.508843337961247e-05, + "loss": 0.09114798903465271, + "step": 81970 + }, + { + "epoch": 0.3519572739840121, + "grad_norm": 0.033003196120262146, + "learning_rate": 6.508412165949485e-05, + "loss": 0.14782416820526123, + "step": 81980 + }, + { + "epoch": 0.3520002060740321, + "grad_norm": 0.015041773207485676, + "learning_rate": 6.507980993937722e-05, + "loss": 0.18848912715911864, + "step": 81990 + }, + { + "epoch": 0.3520431381640521, + "grad_norm": 0.018849393352866173, + "learning_rate": 6.507549821925959e-05, + "loss": 0.3703503370285034, + "step": 82000 + }, + { + "epoch": 0.3520431381640521, + "eval_loss": 0.43411049246788025, + "eval_runtime": 27.2695, + "eval_samples_per_second": 3.667, + "eval_steps_per_second": 3.667, + "step": 82000 + }, + { + "epoch": 0.3520860702540721, + "grad_norm": 0.5550269484519958, + "learning_rate": 6.507118649914196e-05, + "loss": 0.246940279006958, + "step": 82010 + }, + { + "epoch": 0.3521290023440921, + "grad_norm": 0.24801412224769592, + "learning_rate": 6.506687477902434e-05, + "loss": 0.4794013977050781, + "step": 82020 + }, + { + "epoch": 0.3521719344341121, + "grad_norm": 0.02586125209927559, + "learning_rate": 6.506256305890672e-05, + "loss": 0.27535793781280515, + "step": 82030 + }, + { + "epoch": 0.35221486652413214, + "grad_norm": 0.39585307240486145, + "learning_rate": 6.50582513387891e-05, + "loss": 0.1664145588874817, + "step": 82040 + }, + { + "epoch": 0.35225779861415213, + "grad_norm": 0.010906996205449104, + "learning_rate": 6.505393961867147e-05, + "loss": 0.2748828649520874, + "step": 82050 + }, + { + "epoch": 0.3523007307041721, + "grad_norm": 0.005688409321010113, + "learning_rate": 6.504962789855385e-05, + "loss": 0.05075052976608276, + "step": 82060 + }, + { + "epoch": 0.35234366279419216, + "grad_norm": 0.010103407315909863, + "learning_rate": 6.504531617843623e-05, + "loss": 0.2605855464935303, + "step": 82070 + }, + { + "epoch": 0.35238659488421215, + "grad_norm": 0.12392642349004745, + "learning_rate": 6.50410044583186e-05, + "loss": 0.14412637948989868, + "step": 82080 + }, + { + "epoch": 0.35242952697423213, + "grad_norm": 0.0025631142780184746, + "learning_rate": 6.503669273820098e-05, + "loss": 0.13340569734573365, + "step": 82090 + }, + { + "epoch": 0.3524724590642522, + "grad_norm": 1.0315361022949219, + "learning_rate": 6.503238101808336e-05, + "loss": 0.30212409496307374, + "step": 82100 + }, + { + "epoch": 0.35251539115427216, + "grad_norm": 0.1816161572933197, + "learning_rate": 6.502806929796574e-05, + "loss": 0.15246015787124634, + "step": 82110 + }, + { + "epoch": 0.3525583232442922, + "grad_norm": 0.39198920130729675, + "learning_rate": 6.502375757784811e-05, + "loss": 0.31164183616638186, + "step": 82120 + }, + { + "epoch": 0.3526012553343122, + "grad_norm": 3.912492275238037, + "learning_rate": 6.501944585773049e-05, + "loss": 0.27409753799438474, + "step": 82130 + }, + { + "epoch": 0.3526441874243322, + "grad_norm": 1.958382487297058, + "learning_rate": 6.501513413761287e-05, + "loss": 0.1995323419570923, + "step": 82140 + }, + { + "epoch": 0.3526871195143522, + "grad_norm": 3.8886985778808594, + "learning_rate": 6.501082241749525e-05, + "loss": 0.2748485803604126, + "step": 82150 + }, + { + "epoch": 0.3527300516043722, + "grad_norm": 0.135552778840065, + "learning_rate": 6.500651069737761e-05, + "loss": 0.23292884826660157, + "step": 82160 + }, + { + "epoch": 0.3527729836943922, + "grad_norm": 2.7165191173553467, + "learning_rate": 6.500219897725999e-05, + "loss": 0.28181753158569334, + "step": 82170 + }, + { + "epoch": 0.35281591578441224, + "grad_norm": 1.8693931102752686, + "learning_rate": 6.499788725714236e-05, + "loss": 0.33779211044311525, + "step": 82180 + }, + { + "epoch": 0.3528588478744322, + "grad_norm": 0.1916666328907013, + "learning_rate": 6.499357553702474e-05, + "loss": 0.1582764983177185, + "step": 82190 + }, + { + "epoch": 0.3529017799644522, + "grad_norm": 0.04685550183057785, + "learning_rate": 6.498926381690712e-05, + "loss": 0.12637512683868407, + "step": 82200 + }, + { + "epoch": 0.35294471205447225, + "grad_norm": 0.13845305144786835, + "learning_rate": 6.49849520967895e-05, + "loss": 0.1528358817100525, + "step": 82210 + }, + { + "epoch": 0.35298764414449224, + "grad_norm": 0.0018608167301863432, + "learning_rate": 6.498064037667187e-05, + "loss": 0.07130320072174072, + "step": 82220 + }, + { + "epoch": 0.35303057623451223, + "grad_norm": 0.09361238777637482, + "learning_rate": 6.497632865655425e-05, + "loss": 0.0021398985758423806, + "step": 82230 + }, + { + "epoch": 0.35307350832453227, + "grad_norm": 0.10198186337947845, + "learning_rate": 6.497201693643663e-05, + "loss": 0.3469515800476074, + "step": 82240 + }, + { + "epoch": 0.35311644041455226, + "grad_norm": 0.0016866042278707027, + "learning_rate": 6.496770521631899e-05, + "loss": 0.09589306712150573, + "step": 82250 + }, + { + "epoch": 0.35315937250457224, + "grad_norm": 0.007193189579993486, + "learning_rate": 6.496339349620137e-05, + "loss": 0.2937009334564209, + "step": 82260 + }, + { + "epoch": 0.3532023045945923, + "grad_norm": 0.0004173514316789806, + "learning_rate": 6.495908177608375e-05, + "loss": 0.3419705629348755, + "step": 82270 + }, + { + "epoch": 0.3532452366846123, + "grad_norm": 0.1558980494737625, + "learning_rate": 6.495477005596612e-05, + "loss": 0.32904758453369143, + "step": 82280 + }, + { + "epoch": 0.35328816877463226, + "grad_norm": 0.22352668642997742, + "learning_rate": 6.49504583358485e-05, + "loss": 0.029522615671157836, + "step": 82290 + }, + { + "epoch": 0.3533311008646523, + "grad_norm": 0.2652186155319214, + "learning_rate": 6.494614661573088e-05, + "loss": 0.3830788850784302, + "step": 82300 + }, + { + "epoch": 0.3533740329546723, + "grad_norm": 1.32157564163208, + "learning_rate": 6.494183489561326e-05, + "loss": 0.24220361709594726, + "step": 82310 + }, + { + "epoch": 0.35341696504469233, + "grad_norm": 1.0047389268875122, + "learning_rate": 6.493752317549563e-05, + "loss": 0.34980719089508056, + "step": 82320 + }, + { + "epoch": 0.3534598971347123, + "grad_norm": 0.2502935528755188, + "learning_rate": 6.493321145537801e-05, + "loss": 0.2659575939178467, + "step": 82330 + }, + { + "epoch": 0.3535028292247323, + "grad_norm": 0.8654706478118896, + "learning_rate": 6.492889973526039e-05, + "loss": 0.22834746837615966, + "step": 82340 + }, + { + "epoch": 0.35354576131475235, + "grad_norm": 0.940198540687561, + "learning_rate": 6.492458801514277e-05, + "loss": 0.1983107566833496, + "step": 82350 + }, + { + "epoch": 0.35358869340477234, + "grad_norm": 2.899003744125366, + "learning_rate": 6.492027629502514e-05, + "loss": 0.2748394012451172, + "step": 82360 + }, + { + "epoch": 0.3536316254947923, + "grad_norm": 0.007231111638247967, + "learning_rate": 6.491596457490752e-05, + "loss": 0.07944477796554565, + "step": 82370 + }, + { + "epoch": 0.35367455758481237, + "grad_norm": 0.0028690106701105833, + "learning_rate": 6.49116528547899e-05, + "loss": 0.19711775779724122, + "step": 82380 + }, + { + "epoch": 0.35371748967483235, + "grad_norm": 1.741270661354065, + "learning_rate": 6.490734113467228e-05, + "loss": 0.17087342739105224, + "step": 82390 + }, + { + "epoch": 0.35376042176485234, + "grad_norm": 0.0005304127698764205, + "learning_rate": 6.490302941455465e-05, + "loss": 0.27425274848937986, + "step": 82400 + }, + { + "epoch": 0.3538033538548724, + "grad_norm": 1.3317078351974487, + "learning_rate": 6.489871769443702e-05, + "loss": 0.3008429765701294, + "step": 82410 + }, + { + "epoch": 0.35384628594489237, + "grad_norm": 0.018071355298161507, + "learning_rate": 6.48944059743194e-05, + "loss": 0.2681131362915039, + "step": 82420 + }, + { + "epoch": 0.35388921803491236, + "grad_norm": 1.2450603246688843, + "learning_rate": 6.489009425420177e-05, + "loss": 0.1308046340942383, + "step": 82430 + }, + { + "epoch": 0.3539321501249324, + "grad_norm": 0.0007980667287483811, + "learning_rate": 6.488578253408415e-05, + "loss": 0.20263445377349854, + "step": 82440 + }, + { + "epoch": 0.3539750822149524, + "grad_norm": 1.565921425819397, + "learning_rate": 6.488147081396653e-05, + "loss": 0.3779308795928955, + "step": 82450 + }, + { + "epoch": 0.3540180143049724, + "grad_norm": 1.4699956178665161, + "learning_rate": 6.48771590938489e-05, + "loss": 0.368353009223938, + "step": 82460 + }, + { + "epoch": 0.3540609463949924, + "grad_norm": 5.17052698135376, + "learning_rate": 6.487284737373128e-05, + "loss": 0.22542569637298585, + "step": 82470 + }, + { + "epoch": 0.3541038784850124, + "grad_norm": 0.41042616963386536, + "learning_rate": 6.486853565361366e-05, + "loss": 0.17170801162719726, + "step": 82480 + }, + { + "epoch": 0.3541468105750324, + "grad_norm": 0.00907969567924738, + "learning_rate": 6.486422393349602e-05, + "loss": 0.16800456047058104, + "step": 82490 + }, + { + "epoch": 0.35418974266505243, + "grad_norm": 1.903733491897583, + "learning_rate": 6.48599122133784e-05, + "loss": 0.27847657203674314, + "step": 82500 + }, + { + "epoch": 0.3542326747550724, + "grad_norm": 0.8449665307998657, + "learning_rate": 6.485560049326078e-05, + "loss": 0.19843802452087403, + "step": 82510 + }, + { + "epoch": 0.3542756068450924, + "grad_norm": 0.00030148582300171256, + "learning_rate": 6.485128877314315e-05, + "loss": 0.17515969276428223, + "step": 82520 + }, + { + "epoch": 0.35431853893511245, + "grad_norm": 0.0056775761768221855, + "learning_rate": 6.484697705302553e-05, + "loss": 0.11901193857192993, + "step": 82530 + }, + { + "epoch": 0.35436147102513244, + "grad_norm": 0.005599792581051588, + "learning_rate": 6.484266533290791e-05, + "loss": 0.3014923810958862, + "step": 82540 + }, + { + "epoch": 0.3544044031151525, + "grad_norm": 0.14583712816238403, + "learning_rate": 6.483835361279029e-05, + "loss": 0.09148666262626648, + "step": 82550 + }, + { + "epoch": 0.35444733520517246, + "grad_norm": 0.03529001399874687, + "learning_rate": 6.483404189267268e-05, + "loss": 0.2952387571334839, + "step": 82560 + }, + { + "epoch": 0.35449026729519245, + "grad_norm": 0.021427098661661148, + "learning_rate": 6.482973017255505e-05, + "loss": 0.16118075847625732, + "step": 82570 + }, + { + "epoch": 0.3545331993852125, + "grad_norm": 0.02916530705988407, + "learning_rate": 6.482541845243742e-05, + "loss": 0.27104642391204836, + "step": 82580 + }, + { + "epoch": 0.3545761314752325, + "grad_norm": 0.00043326994637027383, + "learning_rate": 6.48211067323198e-05, + "loss": 0.2643244504928589, + "step": 82590 + }, + { + "epoch": 0.35461906356525247, + "grad_norm": 4.119890213012695, + "learning_rate": 6.481679501220217e-05, + "loss": 0.39766530990600585, + "step": 82600 + }, + { + "epoch": 0.3546619956552725, + "grad_norm": 1.3540154695510864, + "learning_rate": 6.481248329208455e-05, + "loss": 0.27116658687591555, + "step": 82610 + }, + { + "epoch": 0.3547049277452925, + "grad_norm": 1.0312505960464478, + "learning_rate": 6.480817157196693e-05, + "loss": 0.33033294677734376, + "step": 82620 + }, + { + "epoch": 0.3547478598353125, + "grad_norm": 5.983156204223633, + "learning_rate": 6.48038598518493e-05, + "loss": 0.3525785207748413, + "step": 82630 + }, + { + "epoch": 0.3547907919253325, + "grad_norm": 0.2073276787996292, + "learning_rate": 6.479954813173168e-05, + "loss": 0.0813725769519806, + "step": 82640 + }, + { + "epoch": 0.3548337240153525, + "grad_norm": 2.0966362953186035, + "learning_rate": 6.479523641161406e-05, + "loss": 0.14142994880676268, + "step": 82650 + }, + { + "epoch": 0.3548766561053725, + "grad_norm": 0.07701893895864487, + "learning_rate": 6.479092469149642e-05, + "loss": 0.1387007713317871, + "step": 82660 + }, + { + "epoch": 0.35491958819539254, + "grad_norm": 0.033894576132297516, + "learning_rate": 6.47866129713788e-05, + "loss": 0.22242212295532227, + "step": 82670 + }, + { + "epoch": 0.35496252028541253, + "grad_norm": 1.597336769104004, + "learning_rate": 6.478230125126118e-05, + "loss": 0.29877052307128904, + "step": 82680 + }, + { + "epoch": 0.3550054523754325, + "grad_norm": 0.0051557328552007675, + "learning_rate": 6.477798953114356e-05, + "loss": 0.3312845230102539, + "step": 82690 + }, + { + "epoch": 0.35504838446545256, + "grad_norm": 0.05306149646639824, + "learning_rate": 6.477367781102593e-05, + "loss": 0.2668110132217407, + "step": 82700 + }, + { + "epoch": 0.35509131655547255, + "grad_norm": 0.5295519828796387, + "learning_rate": 6.476936609090831e-05, + "loss": 0.49150676727294923, + "step": 82710 + }, + { + "epoch": 0.35513424864549253, + "grad_norm": 2.946373462677002, + "learning_rate": 6.476505437079069e-05, + "loss": 0.23695406913757325, + "step": 82720 + }, + { + "epoch": 0.3551771807355126, + "grad_norm": 1.3482404947280884, + "learning_rate": 6.476074265067306e-05, + "loss": 0.3448635578155518, + "step": 82730 + }, + { + "epoch": 0.35522011282553256, + "grad_norm": 1.2963811159133911, + "learning_rate": 6.475643093055543e-05, + "loss": 0.3682236194610596, + "step": 82740 + }, + { + "epoch": 0.3552630449155526, + "grad_norm": 0.2663824260234833, + "learning_rate": 6.47521192104378e-05, + "loss": 0.31581456661224366, + "step": 82750 + }, + { + "epoch": 0.3553059770055726, + "grad_norm": 1.069393515586853, + "learning_rate": 6.474780749032018e-05, + "loss": 0.44371590614318845, + "step": 82760 + }, + { + "epoch": 0.3553489090955926, + "grad_norm": 0.7076935172080994, + "learning_rate": 6.474349577020256e-05, + "loss": 0.27069649696350095, + "step": 82770 + }, + { + "epoch": 0.3553918411856126, + "grad_norm": 5.1295695304870605, + "learning_rate": 6.473918405008495e-05, + "loss": 0.2316493034362793, + "step": 82780 + }, + { + "epoch": 0.3554347732756326, + "grad_norm": 2.332552194595337, + "learning_rate": 6.473487232996733e-05, + "loss": 0.159147310256958, + "step": 82790 + }, + { + "epoch": 0.3554777053656526, + "grad_norm": 0.1406073421239853, + "learning_rate": 6.47305606098497e-05, + "loss": 0.28304622173309324, + "step": 82800 + }, + { + "epoch": 0.35552063745567264, + "grad_norm": 0.30248576402664185, + "learning_rate": 6.472624888973208e-05, + "loss": 0.3555166721343994, + "step": 82810 + }, + { + "epoch": 0.3555635695456926, + "grad_norm": 0.0017919761594384909, + "learning_rate": 6.472193716961445e-05, + "loss": 0.12960315942764283, + "step": 82820 + }, + { + "epoch": 0.3556065016357126, + "grad_norm": 0.4125811755657196, + "learning_rate": 6.471762544949682e-05, + "loss": 0.2231734037399292, + "step": 82830 + }, + { + "epoch": 0.35564943372573266, + "grad_norm": 0.026550287380814552, + "learning_rate": 6.47133137293792e-05, + "loss": 0.004985055699944496, + "step": 82840 + }, + { + "epoch": 0.35569236581575264, + "grad_norm": 0.6616964340209961, + "learning_rate": 6.470900200926158e-05, + "loss": 0.05897141098976135, + "step": 82850 + }, + { + "epoch": 0.35573529790577263, + "grad_norm": 4.38887357711792, + "learning_rate": 6.470469028914396e-05, + "loss": 0.24175820350646973, + "step": 82860 + }, + { + "epoch": 0.35577822999579267, + "grad_norm": 0.04604952037334442, + "learning_rate": 6.470037856902633e-05, + "loss": 0.2245575189590454, + "step": 82870 + }, + { + "epoch": 0.35582116208581266, + "grad_norm": 0.8017083406448364, + "learning_rate": 6.469606684890871e-05, + "loss": 0.1778331756591797, + "step": 82880 + }, + { + "epoch": 0.35586409417583265, + "grad_norm": 1.9243903160095215, + "learning_rate": 6.469175512879109e-05, + "loss": 0.2638943910598755, + "step": 82890 + }, + { + "epoch": 0.3559070262658527, + "grad_norm": 1.4982459545135498, + "learning_rate": 6.468744340867345e-05, + "loss": 0.4658337116241455, + "step": 82900 + }, + { + "epoch": 0.3559499583558727, + "grad_norm": 0.10741309821605682, + "learning_rate": 6.468313168855583e-05, + "loss": 0.2154242753982544, + "step": 82910 + }, + { + "epoch": 0.35599289044589266, + "grad_norm": 0.015362433157861233, + "learning_rate": 6.467881996843821e-05, + "loss": 0.10966675281524658, + "step": 82920 + }, + { + "epoch": 0.3560358225359127, + "grad_norm": 1.1122446060180664, + "learning_rate": 6.467450824832058e-05, + "loss": 0.1882996916770935, + "step": 82930 + }, + { + "epoch": 0.3560787546259327, + "grad_norm": 2.366774320602417, + "learning_rate": 6.467019652820296e-05, + "loss": 0.2599278688430786, + "step": 82940 + }, + { + "epoch": 0.3561216867159527, + "grad_norm": 0.016711033880710602, + "learning_rate": 6.466588480808534e-05, + "loss": 0.0729515790939331, + "step": 82950 + }, + { + "epoch": 0.3561646188059727, + "grad_norm": 0.8760409355163574, + "learning_rate": 6.466157308796772e-05, + "loss": 0.324887752532959, + "step": 82960 + }, + { + "epoch": 0.3562075508959927, + "grad_norm": 0.08435682952404022, + "learning_rate": 6.46572613678501e-05, + "loss": 0.17404361963272094, + "step": 82970 + }, + { + "epoch": 0.35625048298601275, + "grad_norm": 0.0024207860697060823, + "learning_rate": 6.465294964773247e-05, + "loss": 0.02005379945039749, + "step": 82980 + }, + { + "epoch": 0.35629341507603274, + "grad_norm": 26.516660690307617, + "learning_rate": 6.464863792761483e-05, + "loss": 0.1444633960723877, + "step": 82990 + }, + { + "epoch": 0.3563363471660527, + "grad_norm": 10.03338623046875, + "learning_rate": 6.464432620749723e-05, + "loss": 0.23125367164611815, + "step": 83000 + }, + { + "epoch": 0.3563363471660527, + "eval_loss": 0.42964938282966614, + "eval_runtime": 27.1156, + "eval_samples_per_second": 3.688, + "eval_steps_per_second": 3.688, + "step": 83000 + }, + { + "epoch": 0.35637927925607277, + "grad_norm": 1.4059851169586182, + "learning_rate": 6.46400144873796e-05, + "loss": 0.30113949775695803, + "step": 83010 + }, + { + "epoch": 0.35642221134609275, + "grad_norm": 0.0022157509811222553, + "learning_rate": 6.463570276726198e-05, + "loss": 0.09114395380020142, + "step": 83020 + }, + { + "epoch": 0.35646514343611274, + "grad_norm": 0.022014634683728218, + "learning_rate": 6.463139104714436e-05, + "loss": 0.2713270902633667, + "step": 83030 + }, + { + "epoch": 0.3565080755261328, + "grad_norm": 1.2134231328964233, + "learning_rate": 6.462707932702673e-05, + "loss": 0.1482961058616638, + "step": 83040 + }, + { + "epoch": 0.35655100761615277, + "grad_norm": 0.13742724061012268, + "learning_rate": 6.462276760690911e-05, + "loss": 0.2234196186065674, + "step": 83050 + }, + { + "epoch": 0.35659393970617276, + "grad_norm": 0.008671365678310394, + "learning_rate": 6.461845588679149e-05, + "loss": 0.20016765594482422, + "step": 83060 + }, + { + "epoch": 0.3566368717961928, + "grad_norm": 6.836441993713379, + "learning_rate": 6.461414416667385e-05, + "loss": 0.25808258056640626, + "step": 83070 + }, + { + "epoch": 0.3566798038862128, + "grad_norm": 1.2114756107330322, + "learning_rate": 6.460983244655623e-05, + "loss": 0.09312286376953124, + "step": 83080 + }, + { + "epoch": 0.3567227359762328, + "grad_norm": 0.00714797992259264, + "learning_rate": 6.460552072643861e-05, + "loss": 0.13834846019744873, + "step": 83090 + }, + { + "epoch": 0.3567656680662528, + "grad_norm": 10.475757598876953, + "learning_rate": 6.460120900632099e-05, + "loss": 0.29724955558776855, + "step": 83100 + }, + { + "epoch": 0.3568086001562728, + "grad_norm": 0.007221123669296503, + "learning_rate": 6.459689728620336e-05, + "loss": 0.13953012228012085, + "step": 83110 + }, + { + "epoch": 0.3568515322462928, + "grad_norm": 1.4416766166687012, + "learning_rate": 6.459258556608574e-05, + "loss": 0.19707913398742677, + "step": 83120 + }, + { + "epoch": 0.35689446433631283, + "grad_norm": 0.09982796013355255, + "learning_rate": 6.458827384596812e-05, + "loss": 0.24213099479675293, + "step": 83130 + }, + { + "epoch": 0.3569373964263328, + "grad_norm": 0.5285547375679016, + "learning_rate": 6.45839621258505e-05, + "loss": 0.2476147174835205, + "step": 83140 + }, + { + "epoch": 0.3569803285163528, + "grad_norm": 1.9965113401412964, + "learning_rate": 6.457965040573286e-05, + "loss": 0.21606626510620117, + "step": 83150 + }, + { + "epoch": 0.35702326060637285, + "grad_norm": 1.2305128574371338, + "learning_rate": 6.457533868561524e-05, + "loss": 0.5005519390106201, + "step": 83160 + }, + { + "epoch": 0.35706619269639284, + "grad_norm": 0.06872189790010452, + "learning_rate": 6.457102696549761e-05, + "loss": 0.37816739082336426, + "step": 83170 + }, + { + "epoch": 0.3571091247864129, + "grad_norm": 0.0010791983222588897, + "learning_rate": 6.456671524537999e-05, + "loss": 0.37646634578704835, + "step": 83180 + }, + { + "epoch": 0.35715205687643287, + "grad_norm": 0.0028619798831641674, + "learning_rate": 6.456240352526237e-05, + "loss": 0.18022361993789673, + "step": 83190 + }, + { + "epoch": 0.35719498896645285, + "grad_norm": 1.1735813617706299, + "learning_rate": 6.455809180514475e-05, + "loss": 0.23538787364959718, + "step": 83200 + }, + { + "epoch": 0.3572379210564729, + "grad_norm": 2.028512716293335, + "learning_rate": 6.455378008502712e-05, + "loss": 0.28205409049987795, + "step": 83210 + }, + { + "epoch": 0.3572808531464929, + "grad_norm": 0.35201296210289, + "learning_rate": 6.45494683649095e-05, + "loss": 0.06089202761650085, + "step": 83220 + }, + { + "epoch": 0.35732378523651287, + "grad_norm": 0.07536812871694565, + "learning_rate": 6.454515664479188e-05, + "loss": 0.2602132797241211, + "step": 83230 + }, + { + "epoch": 0.3573667173265329, + "grad_norm": 0.014509957283735275, + "learning_rate": 6.454084492467425e-05, + "loss": 0.24068257808685303, + "step": 83240 + }, + { + "epoch": 0.3574096494165529, + "grad_norm": 3.485565185546875, + "learning_rate": 6.453653320455663e-05, + "loss": 0.14384920597076417, + "step": 83250 + }, + { + "epoch": 0.3574525815065729, + "grad_norm": 0.010356109589338303, + "learning_rate": 6.453222148443901e-05, + "loss": 0.17156875133514404, + "step": 83260 + }, + { + "epoch": 0.35749551359659293, + "grad_norm": 0.8293179273605347, + "learning_rate": 6.452790976432139e-05, + "loss": 0.14238990545272828, + "step": 83270 + }, + { + "epoch": 0.3575384456866129, + "grad_norm": 1.1598352193832397, + "learning_rate": 6.452359804420376e-05, + "loss": 0.42585220336914065, + "step": 83280 + }, + { + "epoch": 0.3575813777766329, + "grad_norm": 1.778437852859497, + "learning_rate": 6.451928632408614e-05, + "loss": 0.2574995279312134, + "step": 83290 + }, + { + "epoch": 0.35762430986665295, + "grad_norm": 0.6484081745147705, + "learning_rate": 6.451497460396852e-05, + "loss": 0.28310160636901854, + "step": 83300 + }, + { + "epoch": 0.35766724195667293, + "grad_norm": 0.2528004050254822, + "learning_rate": 6.45106628838509e-05, + "loss": 0.11739856004714966, + "step": 83310 + }, + { + "epoch": 0.3577101740466929, + "grad_norm": 0.029177529737353325, + "learning_rate": 6.450635116373326e-05, + "loss": 0.026236182451248168, + "step": 83320 + }, + { + "epoch": 0.35775310613671296, + "grad_norm": 8.064805030822754, + "learning_rate": 6.450203944361564e-05, + "loss": 0.23941564559936523, + "step": 83330 + }, + { + "epoch": 0.35779603822673295, + "grad_norm": 0.12642982602119446, + "learning_rate": 6.449772772349801e-05, + "loss": 0.17229573726654052, + "step": 83340 + }, + { + "epoch": 0.35783897031675294, + "grad_norm": 0.01093371957540512, + "learning_rate": 6.449341600338039e-05, + "loss": 0.25434882640838624, + "step": 83350 + }, + { + "epoch": 0.357881902406773, + "grad_norm": 0.5927442908287048, + "learning_rate": 6.448910428326277e-05, + "loss": 0.39759347438812254, + "step": 83360 + }, + { + "epoch": 0.35792483449679297, + "grad_norm": 0.5692152976989746, + "learning_rate": 6.448479256314515e-05, + "loss": 0.2229149341583252, + "step": 83370 + }, + { + "epoch": 0.35796776658681295, + "grad_norm": 0.9230725169181824, + "learning_rate": 6.448048084302752e-05, + "loss": 0.0902398943901062, + "step": 83380 + }, + { + "epoch": 0.358010698676833, + "grad_norm": 0.0053199403919279575, + "learning_rate": 6.44761691229099e-05, + "loss": 0.20757627487182617, + "step": 83390 + }, + { + "epoch": 0.358053630766853, + "grad_norm": 1.6971391439437866, + "learning_rate": 6.447185740279227e-05, + "loss": 0.5305335044860839, + "step": 83400 + }, + { + "epoch": 0.358096562856873, + "grad_norm": 0.02110726200044155, + "learning_rate": 6.446754568267464e-05, + "loss": 0.07769713997840881, + "step": 83410 + }, + { + "epoch": 0.358139494946893, + "grad_norm": 0.005539445672184229, + "learning_rate": 6.446323396255702e-05, + "loss": 0.18624429702758788, + "step": 83420 + }, + { + "epoch": 0.358182427036913, + "grad_norm": 0.006300756242126226, + "learning_rate": 6.44589222424394e-05, + "loss": 0.18109878301620483, + "step": 83430 + }, + { + "epoch": 0.35822535912693304, + "grad_norm": 3.080735683441162, + "learning_rate": 6.445461052232177e-05, + "loss": 0.42140870094299315, + "step": 83440 + }, + { + "epoch": 0.358268291216953, + "grad_norm": 0.41588640213012695, + "learning_rate": 6.445029880220415e-05, + "loss": 0.28173749446868895, + "step": 83450 + }, + { + "epoch": 0.358311223306973, + "grad_norm": 0.07146584987640381, + "learning_rate": 6.444598708208653e-05, + "loss": 0.23534250259399414, + "step": 83460 + }, + { + "epoch": 0.35835415539699306, + "grad_norm": 0.0056983851827681065, + "learning_rate": 6.44416753619689e-05, + "loss": 0.11927134990692138, + "step": 83470 + }, + { + "epoch": 0.35839708748701304, + "grad_norm": 0.008995592594146729, + "learning_rate": 6.443736364185128e-05, + "loss": 0.24893059730529785, + "step": 83480 + }, + { + "epoch": 0.35844001957703303, + "grad_norm": 0.6611373424530029, + "learning_rate": 6.443305192173366e-05, + "loss": 0.18371953964233398, + "step": 83490 + }, + { + "epoch": 0.3584829516670531, + "grad_norm": 0.01235401164740324, + "learning_rate": 6.442874020161604e-05, + "loss": 0.05725529193878174, + "step": 83500 + }, + { + "epoch": 0.35852588375707306, + "grad_norm": 0.182949036359787, + "learning_rate": 6.442442848149842e-05, + "loss": 0.22440974712371825, + "step": 83510 + }, + { + "epoch": 0.35856881584709305, + "grad_norm": 5.291581153869629, + "learning_rate": 6.442011676138079e-05, + "loss": 0.283705997467041, + "step": 83520 + }, + { + "epoch": 0.3586117479371131, + "grad_norm": 0.016548916697502136, + "learning_rate": 6.441580504126317e-05, + "loss": 0.6019593238830566, + "step": 83530 + }, + { + "epoch": 0.3586546800271331, + "grad_norm": 0.08653654158115387, + "learning_rate": 6.441149332114555e-05, + "loss": 0.08072211146354676, + "step": 83540 + }, + { + "epoch": 0.35869761211715306, + "grad_norm": 0.13255316019058228, + "learning_rate": 6.440718160102793e-05, + "loss": 0.20978965759277343, + "step": 83550 + }, + { + "epoch": 0.3587405442071731, + "grad_norm": 0.0007236854871734977, + "learning_rate": 6.440286988091029e-05, + "loss": 0.07688579559326172, + "step": 83560 + }, + { + "epoch": 0.3587834762971931, + "grad_norm": 0.010213850066065788, + "learning_rate": 6.439855816079267e-05, + "loss": 0.12269260883331298, + "step": 83570 + }, + { + "epoch": 0.3588264083872131, + "grad_norm": 0.001452255412004888, + "learning_rate": 6.439424644067504e-05, + "loss": 0.10330994129180908, + "step": 83580 + }, + { + "epoch": 0.3588693404772331, + "grad_norm": 0.02720882184803486, + "learning_rate": 6.438993472055742e-05, + "loss": 0.08257366418838501, + "step": 83590 + }, + { + "epoch": 0.3589122725672531, + "grad_norm": 1.2951226234436035, + "learning_rate": 6.43856230004398e-05, + "loss": 0.0941824495792389, + "step": 83600 + }, + { + "epoch": 0.35895520465727315, + "grad_norm": 0.021939700469374657, + "learning_rate": 6.438131128032218e-05, + "loss": 0.1827239990234375, + "step": 83610 + }, + { + "epoch": 0.35899813674729314, + "grad_norm": 0.001229679910466075, + "learning_rate": 6.437699956020455e-05, + "loss": 0.23367047309875488, + "step": 83620 + }, + { + "epoch": 0.3590410688373131, + "grad_norm": 0.013304614461958408, + "learning_rate": 6.437268784008693e-05, + "loss": 0.32072179317474364, + "step": 83630 + }, + { + "epoch": 0.35908400092733317, + "grad_norm": 0.08952384442090988, + "learning_rate": 6.436837611996931e-05, + "loss": 0.6800857543945312, + "step": 83640 + }, + { + "epoch": 0.35912693301735316, + "grad_norm": 0.0006670946022495627, + "learning_rate": 6.436406439985167e-05, + "loss": 0.1628146767616272, + "step": 83650 + }, + { + "epoch": 0.35916986510737314, + "grad_norm": 1.8681294918060303, + "learning_rate": 6.435975267973405e-05, + "loss": 0.3500849485397339, + "step": 83660 + }, + { + "epoch": 0.3592127971973932, + "grad_norm": 0.05229023098945618, + "learning_rate": 6.435544095961643e-05, + "loss": 0.1595700979232788, + "step": 83670 + }, + { + "epoch": 0.3592557292874132, + "grad_norm": 3.6567986011505127, + "learning_rate": 6.43511292394988e-05, + "loss": 0.16614724397659303, + "step": 83680 + }, + { + "epoch": 0.35929866137743316, + "grad_norm": 2.2354629039764404, + "learning_rate": 6.434681751938118e-05, + "loss": 0.24699800014495848, + "step": 83690 + }, + { + "epoch": 0.3593415934674532, + "grad_norm": 1.6552985906600952, + "learning_rate": 6.434250579926356e-05, + "loss": 0.2638379096984863, + "step": 83700 + }, + { + "epoch": 0.3593845255574732, + "grad_norm": 0.1515263170003891, + "learning_rate": 6.433819407914594e-05, + "loss": 0.36674022674560547, + "step": 83710 + }, + { + "epoch": 0.3594274576474932, + "grad_norm": 1.1364909410476685, + "learning_rate": 6.433388235902831e-05, + "loss": 0.11204863786697387, + "step": 83720 + }, + { + "epoch": 0.3594703897375132, + "grad_norm": 0.1920362263917923, + "learning_rate": 6.432957063891069e-05, + "loss": 0.09521974325180053, + "step": 83730 + }, + { + "epoch": 0.3595133218275332, + "grad_norm": 1.830244541168213, + "learning_rate": 6.432525891879307e-05, + "loss": 0.26940653324127195, + "step": 83740 + }, + { + "epoch": 0.3595562539175532, + "grad_norm": 8.78477954864502, + "learning_rate": 6.432094719867544e-05, + "loss": 0.21584708690643312, + "step": 83750 + }, + { + "epoch": 0.35959918600757323, + "grad_norm": 0.012690392322838306, + "learning_rate": 6.431663547855782e-05, + "loss": 0.16282269954681397, + "step": 83760 + }, + { + "epoch": 0.3596421180975932, + "grad_norm": 0.03721415251493454, + "learning_rate": 6.43123237584402e-05, + "loss": 0.2313145875930786, + "step": 83770 + }, + { + "epoch": 0.3596850501876132, + "grad_norm": 0.06290198862552643, + "learning_rate": 6.430801203832258e-05, + "loss": 0.0945240318775177, + "step": 83780 + }, + { + "epoch": 0.35972798227763325, + "grad_norm": 0.15895670652389526, + "learning_rate": 6.430370031820495e-05, + "loss": 0.19284458160400392, + "step": 83790 + }, + { + "epoch": 0.35977091436765324, + "grad_norm": 1.286566138267517, + "learning_rate": 6.429938859808733e-05, + "loss": 0.09456119537353516, + "step": 83800 + }, + { + "epoch": 0.3598138464576732, + "grad_norm": 0.011230570264160633, + "learning_rate": 6.42950768779697e-05, + "loss": 0.08266975283622742, + "step": 83810 + }, + { + "epoch": 0.35985677854769327, + "grad_norm": 0.011820230633020401, + "learning_rate": 6.429076515785207e-05, + "loss": 0.16751736402511597, + "step": 83820 + }, + { + "epoch": 0.35989971063771325, + "grad_norm": 2.0594077110290527, + "learning_rate": 6.428645343773445e-05, + "loss": 0.2954521417617798, + "step": 83830 + }, + { + "epoch": 0.3599426427277333, + "grad_norm": 0.07682941108942032, + "learning_rate": 6.428214171761683e-05, + "loss": 0.0959571897983551, + "step": 83840 + }, + { + "epoch": 0.3599855748177533, + "grad_norm": 0.026437979191541672, + "learning_rate": 6.42778299974992e-05, + "loss": 0.25702903270721433, + "step": 83850 + }, + { + "epoch": 0.36002850690777327, + "grad_norm": 0.12116403132677078, + "learning_rate": 6.427351827738158e-05, + "loss": 0.2732200860977173, + "step": 83860 + }, + { + "epoch": 0.3600714389977933, + "grad_norm": 1.381988763809204, + "learning_rate": 6.426920655726396e-05, + "loss": 0.2366987943649292, + "step": 83870 + }, + { + "epoch": 0.3601143710878133, + "grad_norm": 0.008017339743673801, + "learning_rate": 6.426489483714634e-05, + "loss": 0.2086495876312256, + "step": 83880 + }, + { + "epoch": 0.3601573031778333, + "grad_norm": 0.03098870813846588, + "learning_rate": 6.42605831170287e-05, + "loss": 0.3768089771270752, + "step": 83890 + }, + { + "epoch": 0.36020023526785333, + "grad_norm": 0.017910556867718697, + "learning_rate": 6.425627139691108e-05, + "loss": 0.22290031909942626, + "step": 83900 + }, + { + "epoch": 0.3602431673578733, + "grad_norm": 0.010782677680253983, + "learning_rate": 6.425195967679346e-05, + "loss": 0.24496030807495117, + "step": 83910 + }, + { + "epoch": 0.3602860994478933, + "grad_norm": 0.1066930741071701, + "learning_rate": 6.424764795667583e-05, + "loss": 0.05222045183181763, + "step": 83920 + }, + { + "epoch": 0.36032903153791335, + "grad_norm": 10.818086624145508, + "learning_rate": 6.424333623655821e-05, + "loss": 0.06975013017654419, + "step": 83930 + }, + { + "epoch": 0.36037196362793333, + "grad_norm": 1.1675182580947876, + "learning_rate": 6.423902451644059e-05, + "loss": 0.26229162216186525, + "step": 83940 + }, + { + "epoch": 0.3604148957179533, + "grad_norm": 0.01140518020838499, + "learning_rate": 6.423471279632296e-05, + "loss": 0.15673032999038697, + "step": 83950 + }, + { + "epoch": 0.36045782780797336, + "grad_norm": 0.2980732321739197, + "learning_rate": 6.423040107620534e-05, + "loss": 0.31344664096832275, + "step": 83960 + }, + { + "epoch": 0.36050075989799335, + "grad_norm": 2.793539524078369, + "learning_rate": 6.422608935608772e-05, + "loss": 0.23717548847198486, + "step": 83970 + }, + { + "epoch": 0.36054369198801334, + "grad_norm": 0.06608152389526367, + "learning_rate": 6.42217776359701e-05, + "loss": 0.10230793952941894, + "step": 83980 + }, + { + "epoch": 0.3605866240780334, + "grad_norm": 1.9413366317749023, + "learning_rate": 6.421746591585247e-05, + "loss": 0.38025608062744143, + "step": 83990 + }, + { + "epoch": 0.36062955616805337, + "grad_norm": 0.32591816782951355, + "learning_rate": 6.421315419573485e-05, + "loss": 0.10384665727615357, + "step": 84000 + }, + { + "epoch": 0.36062955616805337, + "eval_loss": 0.4329765737056732, + "eval_runtime": 27.2682, + "eval_samples_per_second": 3.667, + "eval_steps_per_second": 3.667, + "step": 84000 + }, + { + "epoch": 0.36067248825807335, + "grad_norm": 2.1451854705810547, + "learning_rate": 6.420884247561723e-05, + "loss": 0.23780550956726074, + "step": 84010 + }, + { + "epoch": 0.3607154203480934, + "grad_norm": 0.008518136106431484, + "learning_rate": 6.42045307554996e-05, + "loss": 0.15147947072982787, + "step": 84020 + }, + { + "epoch": 0.3607583524381134, + "grad_norm": 1.9867619276046753, + "learning_rate": 6.420021903538198e-05, + "loss": 0.3307706594467163, + "step": 84030 + }, + { + "epoch": 0.3608012845281334, + "grad_norm": 0.012360497377812862, + "learning_rate": 6.419590731526436e-05, + "loss": 0.2077253818511963, + "step": 84040 + }, + { + "epoch": 0.3608442166181534, + "grad_norm": 0.1372559815645218, + "learning_rate": 6.419159559514674e-05, + "loss": 0.12807326316833495, + "step": 84050 + }, + { + "epoch": 0.3608871487081734, + "grad_norm": 0.07335194945335388, + "learning_rate": 6.41872838750291e-05, + "loss": 0.2796891212463379, + "step": 84060 + }, + { + "epoch": 0.36093008079819344, + "grad_norm": 4.2282304763793945, + "learning_rate": 6.418297215491148e-05, + "loss": 0.2018202781677246, + "step": 84070 + }, + { + "epoch": 0.36097301288821343, + "grad_norm": 2.3254733085632324, + "learning_rate": 6.417866043479386e-05, + "loss": 0.07892566919326782, + "step": 84080 + }, + { + "epoch": 0.3610159449782334, + "grad_norm": 0.038394901901483536, + "learning_rate": 6.417434871467623e-05, + "loss": 0.23869423866271972, + "step": 84090 + }, + { + "epoch": 0.36105887706825346, + "grad_norm": 6.333953380584717, + "learning_rate": 6.417003699455861e-05, + "loss": 0.22153542041778565, + "step": 84100 + }, + { + "epoch": 0.36110180915827345, + "grad_norm": 0.09468687325716019, + "learning_rate": 6.416572527444099e-05, + "loss": 0.6222201824188233, + "step": 84110 + }, + { + "epoch": 0.36114474124829343, + "grad_norm": 0.07344312965869904, + "learning_rate": 6.416141355432337e-05, + "loss": 0.3869171619415283, + "step": 84120 + }, + { + "epoch": 0.3611876733383135, + "grad_norm": 0.34509482979774475, + "learning_rate": 6.415710183420574e-05, + "loss": 0.10307228565216064, + "step": 84130 + }, + { + "epoch": 0.36123060542833346, + "grad_norm": 0.09708867967128754, + "learning_rate": 6.415279011408811e-05, + "loss": 0.14578685760498047, + "step": 84140 + }, + { + "epoch": 0.36127353751835345, + "grad_norm": 0.014592817053198814, + "learning_rate": 6.414847839397048e-05, + "loss": 0.22615408897399902, + "step": 84150 + }, + { + "epoch": 0.3613164696083735, + "grad_norm": 0.04447157680988312, + "learning_rate": 6.414416667385286e-05, + "loss": 0.170421826839447, + "step": 84160 + }, + { + "epoch": 0.3613594016983935, + "grad_norm": 0.013009901158511639, + "learning_rate": 6.413985495373524e-05, + "loss": 0.2288771390914917, + "step": 84170 + }, + { + "epoch": 0.36140233378841347, + "grad_norm": 0.04273710027337074, + "learning_rate": 6.413554323361762e-05, + "loss": 0.10149039030075073, + "step": 84180 + }, + { + "epoch": 0.3614452658784335, + "grad_norm": 0.683621883392334, + "learning_rate": 6.413123151350001e-05, + "loss": 0.467113733291626, + "step": 84190 + }, + { + "epoch": 0.3614881979684535, + "grad_norm": 1.7198909521102905, + "learning_rate": 6.412691979338238e-05, + "loss": 0.3722904443740845, + "step": 84200 + }, + { + "epoch": 0.3615311300584735, + "grad_norm": 0.16589732468128204, + "learning_rate": 6.412260807326476e-05, + "loss": 0.1510645031929016, + "step": 84210 + }, + { + "epoch": 0.3615740621484935, + "grad_norm": 0.452720046043396, + "learning_rate": 6.411829635314713e-05, + "loss": 0.3065751075744629, + "step": 84220 + }, + { + "epoch": 0.3616169942385135, + "grad_norm": 0.0688873827457428, + "learning_rate": 6.41139846330295e-05, + "loss": 0.11986181735992432, + "step": 84230 + }, + { + "epoch": 0.3616599263285335, + "grad_norm": 0.2770315408706665, + "learning_rate": 6.410967291291188e-05, + "loss": 0.34214606285095217, + "step": 84240 + }, + { + "epoch": 0.36170285841855354, + "grad_norm": 1.144366979598999, + "learning_rate": 6.410536119279426e-05, + "loss": 0.3191501617431641, + "step": 84250 + }, + { + "epoch": 0.36174579050857353, + "grad_norm": 5.4182047843933105, + "learning_rate": 6.410104947267664e-05, + "loss": 0.26186683177948, + "step": 84260 + }, + { + "epoch": 0.36178872259859357, + "grad_norm": 0.4700573682785034, + "learning_rate": 6.409673775255901e-05, + "loss": 0.16876962184906005, + "step": 84270 + }, + { + "epoch": 0.36183165468861356, + "grad_norm": 3.8683853149414062, + "learning_rate": 6.409242603244139e-05, + "loss": 0.11432063579559326, + "step": 84280 + }, + { + "epoch": 0.36187458677863354, + "grad_norm": 5.655078411102295, + "learning_rate": 6.408811431232377e-05, + "loss": 0.21527984142303466, + "step": 84290 + }, + { + "epoch": 0.3619175188686536, + "grad_norm": 3.533766746520996, + "learning_rate": 6.408380259220613e-05, + "loss": 0.43784823417663576, + "step": 84300 + }, + { + "epoch": 0.3619604509586736, + "grad_norm": 2.929824113845825, + "learning_rate": 6.407949087208851e-05, + "loss": 0.14191383123397827, + "step": 84310 + }, + { + "epoch": 0.36200338304869356, + "grad_norm": 0.17527572810649872, + "learning_rate": 6.407517915197089e-05, + "loss": 0.17120797634124757, + "step": 84320 + }, + { + "epoch": 0.3620463151387136, + "grad_norm": 0.04647151753306389, + "learning_rate": 6.407086743185326e-05, + "loss": 0.11106468439102173, + "step": 84330 + }, + { + "epoch": 0.3620892472287336, + "grad_norm": 3.4737348556518555, + "learning_rate": 6.406655571173564e-05, + "loss": 0.26698980331420896, + "step": 84340 + }, + { + "epoch": 0.3621321793187536, + "grad_norm": 0.062151987105607986, + "learning_rate": 6.406224399161802e-05, + "loss": 0.2645150423049927, + "step": 84350 + }, + { + "epoch": 0.3621751114087736, + "grad_norm": 0.3555840253829956, + "learning_rate": 6.40579322715004e-05, + "loss": 0.22405788898468018, + "step": 84360 + }, + { + "epoch": 0.3622180434987936, + "grad_norm": 0.04546572268009186, + "learning_rate": 6.405362055138277e-05, + "loss": 0.22337851524353028, + "step": 84370 + }, + { + "epoch": 0.3622609755888136, + "grad_norm": 0.0068736146204173565, + "learning_rate": 6.404930883126515e-05, + "loss": 0.166854190826416, + "step": 84380 + }, + { + "epoch": 0.36230390767883364, + "grad_norm": 51.327964782714844, + "learning_rate": 6.404499711114751e-05, + "loss": 0.23231921195983887, + "step": 84390 + }, + { + "epoch": 0.3623468397688536, + "grad_norm": 1.8268455266952515, + "learning_rate": 6.404068539102989e-05, + "loss": 0.31146812438964844, + "step": 84400 + }, + { + "epoch": 0.3623897718588736, + "grad_norm": 0.02235632948577404, + "learning_rate": 6.403637367091228e-05, + "loss": 0.26765482425689696, + "step": 84410 + }, + { + "epoch": 0.36243270394889365, + "grad_norm": 0.039541054517030716, + "learning_rate": 6.403206195079466e-05, + "loss": 0.15650541782379152, + "step": 84420 + }, + { + "epoch": 0.36247563603891364, + "grad_norm": 2.285512685775757, + "learning_rate": 6.402775023067704e-05, + "loss": 0.1562897801399231, + "step": 84430 + }, + { + "epoch": 0.3625185681289336, + "grad_norm": 0.002735693706199527, + "learning_rate": 6.402343851055941e-05, + "loss": 0.2821861743927002, + "step": 84440 + }, + { + "epoch": 0.36256150021895367, + "grad_norm": 0.5640804171562195, + "learning_rate": 6.401912679044179e-05, + "loss": 0.2460437536239624, + "step": 84450 + }, + { + "epoch": 0.36260443230897366, + "grad_norm": 0.6940175890922546, + "learning_rate": 6.401481507032417e-05, + "loss": 0.26609799861907957, + "step": 84460 + }, + { + "epoch": 0.3626473643989937, + "grad_norm": 0.008508739061653614, + "learning_rate": 6.401050335020653e-05, + "loss": 0.0524605393409729, + "step": 84470 + }, + { + "epoch": 0.3626902964890137, + "grad_norm": 1.0394551753997803, + "learning_rate": 6.400619163008891e-05, + "loss": 0.18994983434677123, + "step": 84480 + }, + { + "epoch": 0.3627332285790337, + "grad_norm": 2.4201126098632812, + "learning_rate": 6.400187990997129e-05, + "loss": 0.13662683963775635, + "step": 84490 + }, + { + "epoch": 0.3627761606690537, + "grad_norm": 0.029414892196655273, + "learning_rate": 6.399756818985366e-05, + "loss": 0.41878905296325686, + "step": 84500 + }, + { + "epoch": 0.3628190927590737, + "grad_norm": 2.0169215202331543, + "learning_rate": 6.399325646973604e-05, + "loss": 0.2553415775299072, + "step": 84510 + }, + { + "epoch": 0.3628620248490937, + "grad_norm": 4.364803314208984, + "learning_rate": 6.398894474961842e-05, + "loss": 0.2932882308959961, + "step": 84520 + }, + { + "epoch": 0.36290495693911373, + "grad_norm": 22.403047561645508, + "learning_rate": 6.39846330295008e-05, + "loss": 0.21755385398864746, + "step": 84530 + }, + { + "epoch": 0.3629478890291337, + "grad_norm": 0.10820141434669495, + "learning_rate": 6.398032130938317e-05, + "loss": 0.30017387866973877, + "step": 84540 + }, + { + "epoch": 0.3629908211191537, + "grad_norm": 0.06273016333580017, + "learning_rate": 6.397600958926554e-05, + "loss": 0.11642609834671021, + "step": 84550 + }, + { + "epoch": 0.36303375320917375, + "grad_norm": 0.03808317705988884, + "learning_rate": 6.397169786914791e-05, + "loss": 0.2759079456329346, + "step": 84560 + }, + { + "epoch": 0.36307668529919374, + "grad_norm": 4.863452434539795, + "learning_rate": 6.396738614903029e-05, + "loss": 0.20633177757263182, + "step": 84570 + }, + { + "epoch": 0.3631196173892137, + "grad_norm": 0.004839466884732246, + "learning_rate": 6.396307442891267e-05, + "loss": 0.31665678024291993, + "step": 84580 + }, + { + "epoch": 0.36316254947923376, + "grad_norm": 0.16685040295124054, + "learning_rate": 6.395876270879505e-05, + "loss": 0.07087898254394531, + "step": 84590 + }, + { + "epoch": 0.36320548156925375, + "grad_norm": 54.194393157958984, + "learning_rate": 6.395445098867742e-05, + "loss": 0.170097279548645, + "step": 84600 + }, + { + "epoch": 0.36324841365927374, + "grad_norm": 0.17545737326145172, + "learning_rate": 6.39501392685598e-05, + "loss": 0.10149813890457153, + "step": 84610 + }, + { + "epoch": 0.3632913457492938, + "grad_norm": 17.983226776123047, + "learning_rate": 6.394582754844218e-05, + "loss": 0.15684278011322023, + "step": 84620 + }, + { + "epoch": 0.36333427783931377, + "grad_norm": 1.1058499813079834, + "learning_rate": 6.394151582832456e-05, + "loss": 0.25113801956176757, + "step": 84630 + }, + { + "epoch": 0.36337720992933376, + "grad_norm": 0.1441742181777954, + "learning_rate": 6.393720410820693e-05, + "loss": 0.24742982387542725, + "step": 84640 + }, + { + "epoch": 0.3634201420193538, + "grad_norm": 0.03703548014163971, + "learning_rate": 6.393289238808931e-05, + "loss": 0.11437108516693115, + "step": 84650 + }, + { + "epoch": 0.3634630741093738, + "grad_norm": 2.121002197265625, + "learning_rate": 6.392858066797169e-05, + "loss": 0.2250274896621704, + "step": 84660 + }, + { + "epoch": 0.36350600619939377, + "grad_norm": 0.003910502418875694, + "learning_rate": 6.392426894785407e-05, + "loss": 0.2840403079986572, + "step": 84670 + }, + { + "epoch": 0.3635489382894138, + "grad_norm": 0.12619902193546295, + "learning_rate": 6.391995722773644e-05, + "loss": 0.10449312925338745, + "step": 84680 + }, + { + "epoch": 0.3635918703794338, + "grad_norm": 2.015465497970581, + "learning_rate": 6.391564550761882e-05, + "loss": 0.08222501277923584, + "step": 84690 + }, + { + "epoch": 0.36363480246945384, + "grad_norm": 2.5868568420410156, + "learning_rate": 6.39113337875012e-05, + "loss": 0.45690197944641114, + "step": 84700 + }, + { + "epoch": 0.36367773455947383, + "grad_norm": 4.2143096923828125, + "learning_rate": 6.390702206738356e-05, + "loss": 0.28699326515197754, + "step": 84710 + }, + { + "epoch": 0.3637206666494938, + "grad_norm": 0.997793972492218, + "learning_rate": 6.390271034726594e-05, + "loss": 0.1634911060333252, + "step": 84720 + }, + { + "epoch": 0.36376359873951386, + "grad_norm": 0.06723709404468536, + "learning_rate": 6.389839862714832e-05, + "loss": 0.1785125970840454, + "step": 84730 + }, + { + "epoch": 0.36380653082953385, + "grad_norm": 0.07979264855384827, + "learning_rate": 6.38940869070307e-05, + "loss": 0.2313058614730835, + "step": 84740 + }, + { + "epoch": 0.36384946291955383, + "grad_norm": 1.5970796346664429, + "learning_rate": 6.388977518691307e-05, + "loss": 0.11084781885147095, + "step": 84750 + }, + { + "epoch": 0.3638923950095739, + "grad_norm": 0.09508782625198364, + "learning_rate": 6.388546346679545e-05, + "loss": 0.19648040533065797, + "step": 84760 + }, + { + "epoch": 0.36393532709959386, + "grad_norm": 0.004324431996792555, + "learning_rate": 6.388115174667783e-05, + "loss": 0.12327823638916016, + "step": 84770 + }, + { + "epoch": 0.36397825918961385, + "grad_norm": 9.756826400756836, + "learning_rate": 6.38768400265602e-05, + "loss": 0.15264017581939698, + "step": 84780 + }, + { + "epoch": 0.3640211912796339, + "grad_norm": 2.773954391479492, + "learning_rate": 6.387252830644258e-05, + "loss": 0.24624147415161132, + "step": 84790 + }, + { + "epoch": 0.3640641233696539, + "grad_norm": 41.181640625, + "learning_rate": 6.386821658632494e-05, + "loss": 0.3611812353134155, + "step": 84800 + }, + { + "epoch": 0.36410705545967387, + "grad_norm": 3.124552011489868, + "learning_rate": 6.386390486620732e-05, + "loss": 0.17753114700317382, + "step": 84810 + }, + { + "epoch": 0.3641499875496939, + "grad_norm": 0.17336466908454895, + "learning_rate": 6.38595931460897e-05, + "loss": 0.28252851963043213, + "step": 84820 + }, + { + "epoch": 0.3641929196397139, + "grad_norm": 2.1589248180389404, + "learning_rate": 6.385528142597208e-05, + "loss": 0.3980719566345215, + "step": 84830 + }, + { + "epoch": 0.3642358517297339, + "grad_norm": 0.006989433895796537, + "learning_rate": 6.385096970585445e-05, + "loss": 0.32679018974304197, + "step": 84840 + }, + { + "epoch": 0.3642787838197539, + "grad_norm": 0.03605174273252487, + "learning_rate": 6.384665798573683e-05, + "loss": 0.4704775333404541, + "step": 84850 + }, + { + "epoch": 0.3643217159097739, + "grad_norm": 4.0266523361206055, + "learning_rate": 6.384234626561921e-05, + "loss": 0.16243298053741456, + "step": 84860 + }, + { + "epoch": 0.3643646479997939, + "grad_norm": 0.20851069688796997, + "learning_rate": 6.383803454550159e-05, + "loss": 0.15209826231002807, + "step": 84870 + }, + { + "epoch": 0.36440758008981394, + "grad_norm": 2.2499935626983643, + "learning_rate": 6.383372282538396e-05, + "loss": 0.2580280303955078, + "step": 84880 + }, + { + "epoch": 0.36445051217983393, + "grad_norm": 1.435296654701233, + "learning_rate": 6.382941110526634e-05, + "loss": 0.3792762756347656, + "step": 84890 + }, + { + "epoch": 0.364493444269854, + "grad_norm": 0.5152401924133301, + "learning_rate": 6.382509938514872e-05, + "loss": 0.21894633769989014, + "step": 84900 + }, + { + "epoch": 0.36453637635987396, + "grad_norm": 0.04706510156393051, + "learning_rate": 6.38207876650311e-05, + "loss": 0.23475384712219238, + "step": 84910 + }, + { + "epoch": 0.36457930844989395, + "grad_norm": 4.913792133331299, + "learning_rate": 6.381647594491347e-05, + "loss": 0.3245802879333496, + "step": 84920 + }, + { + "epoch": 0.364622240539914, + "grad_norm": 0.2699010372161865, + "learning_rate": 6.381216422479585e-05, + "loss": 0.11922560930252075, + "step": 84930 + }, + { + "epoch": 0.364665172629934, + "grad_norm": 0.00447971373796463, + "learning_rate": 6.380785250467823e-05, + "loss": 0.10177613496780395, + "step": 84940 + }, + { + "epoch": 0.36470810471995396, + "grad_norm": 0.0390581451356411, + "learning_rate": 6.38035407845606e-05, + "loss": 0.09930930137634278, + "step": 84950 + }, + { + "epoch": 0.364751036809974, + "grad_norm": 0.002693427260965109, + "learning_rate": 6.379922906444297e-05, + "loss": 0.36963183879852296, + "step": 84960 + }, + { + "epoch": 0.364793968899994, + "grad_norm": 0.002317856065928936, + "learning_rate": 6.379491734432535e-05, + "loss": 0.36330618858337405, + "step": 84970 + }, + { + "epoch": 0.364836900990014, + "grad_norm": 2.9636216163635254, + "learning_rate": 6.379060562420772e-05, + "loss": 0.26229047775268555, + "step": 84980 + }, + { + "epoch": 0.364879833080034, + "grad_norm": 0.3010944724082947, + "learning_rate": 6.37862939040901e-05, + "loss": 0.07681341767311096, + "step": 84990 + }, + { + "epoch": 0.364922765170054, + "grad_norm": 1.5422542095184326, + "learning_rate": 6.378198218397248e-05, + "loss": 0.18676744699478148, + "step": 85000 + }, + { + "epoch": 0.364922765170054, + "eval_loss": 0.43643614649772644, + "eval_runtime": 27.1696, + "eval_samples_per_second": 3.681, + "eval_steps_per_second": 3.681, + "step": 85000 + }, + { + "epoch": 0.364965697260074, + "grad_norm": 1.8786780834197998, + "learning_rate": 6.377767046385485e-05, + "loss": 0.29012351036071776, + "step": 85010 + }, + { + "epoch": 0.36500862935009404, + "grad_norm": 3.3790924549102783, + "learning_rate": 6.377335874373723e-05, + "loss": 0.27429943084716796, + "step": 85020 + }, + { + "epoch": 0.365051561440114, + "grad_norm": 0.01983797922730446, + "learning_rate": 6.376904702361961e-05, + "loss": 0.2209841251373291, + "step": 85030 + }, + { + "epoch": 0.365094493530134, + "grad_norm": 0.0703393742442131, + "learning_rate": 6.376473530350197e-05, + "loss": 0.38023080825805666, + "step": 85040 + }, + { + "epoch": 0.36513742562015405, + "grad_norm": 0.014833835884928703, + "learning_rate": 6.376042358338435e-05, + "loss": 0.23475439548492433, + "step": 85050 + }, + { + "epoch": 0.36518035771017404, + "grad_norm": 0.014177965931594372, + "learning_rate": 6.375611186326673e-05, + "loss": 0.19169943332672118, + "step": 85060 + }, + { + "epoch": 0.36522328980019403, + "grad_norm": 0.021022124215960503, + "learning_rate": 6.37518001431491e-05, + "loss": 0.23145816326141358, + "step": 85070 + }, + { + "epoch": 0.36526622189021407, + "grad_norm": 0.8904299139976501, + "learning_rate": 6.374748842303148e-05, + "loss": 0.19575103521347045, + "step": 85080 + }, + { + "epoch": 0.36530915398023406, + "grad_norm": 1.63796067237854, + "learning_rate": 6.374317670291386e-05, + "loss": 0.20815696716308593, + "step": 85090 + }, + { + "epoch": 0.36535208607025405, + "grad_norm": 0.1709212213754654, + "learning_rate": 6.373886498279624e-05, + "loss": 0.11579384803771972, + "step": 85100 + }, + { + "epoch": 0.3653950181602741, + "grad_norm": 0.10949070751667023, + "learning_rate": 6.373455326267861e-05, + "loss": 0.14310755729675292, + "step": 85110 + }, + { + "epoch": 0.3654379502502941, + "grad_norm": 0.05624139681458473, + "learning_rate": 6.373024154256099e-05, + "loss": 0.20229153633117675, + "step": 85120 + }, + { + "epoch": 0.3654808823403141, + "grad_norm": 0.008176709525287151, + "learning_rate": 6.372592982244337e-05, + "loss": 0.09302598834037781, + "step": 85130 + }, + { + "epoch": 0.3655238144303341, + "grad_norm": 1.395006537437439, + "learning_rate": 6.372161810232575e-05, + "loss": 0.13691210746765137, + "step": 85140 + }, + { + "epoch": 0.3655667465203541, + "grad_norm": 0.0070328544825315475, + "learning_rate": 6.371730638220812e-05, + "loss": 0.07568751573562622, + "step": 85150 + }, + { + "epoch": 0.36560967861037413, + "grad_norm": 0.9231022596359253, + "learning_rate": 6.37129946620905e-05, + "loss": 0.21788375377655028, + "step": 85160 + }, + { + "epoch": 0.3656526107003941, + "grad_norm": 6.861789226531982, + "learning_rate": 6.370868294197288e-05, + "loss": 0.4098199844360352, + "step": 85170 + }, + { + "epoch": 0.3656955427904141, + "grad_norm": 0.015893081203103065, + "learning_rate": 6.370437122185526e-05, + "loss": 0.1904071807861328, + "step": 85180 + }, + { + "epoch": 0.36573847488043415, + "grad_norm": 0.09461787343025208, + "learning_rate": 6.370005950173763e-05, + "loss": 0.22163398265838624, + "step": 85190 + }, + { + "epoch": 0.36578140697045414, + "grad_norm": 0.006280150264501572, + "learning_rate": 6.369574778162001e-05, + "loss": 0.056524789333343504, + "step": 85200 + }, + { + "epoch": 0.3658243390604741, + "grad_norm": 0.5843717455863953, + "learning_rate": 6.369143606150237e-05, + "loss": 0.43214945793151854, + "step": 85210 + }, + { + "epoch": 0.36586727115049417, + "grad_norm": 0.04193728789687157, + "learning_rate": 6.368712434138475e-05, + "loss": 0.15790834426879882, + "step": 85220 + }, + { + "epoch": 0.36591020324051415, + "grad_norm": 0.024928787723183632, + "learning_rate": 6.368281262126713e-05, + "loss": 0.1830769658088684, + "step": 85230 + }, + { + "epoch": 0.36595313533053414, + "grad_norm": 0.05677323043346405, + "learning_rate": 6.36785009011495e-05, + "loss": 0.24995017051696777, + "step": 85240 + }, + { + "epoch": 0.3659960674205542, + "grad_norm": 1.3689550161361694, + "learning_rate": 6.367418918103188e-05, + "loss": 0.1743025302886963, + "step": 85250 + }, + { + "epoch": 0.36603899951057417, + "grad_norm": 1.934273362159729, + "learning_rate": 6.366987746091426e-05, + "loss": 0.19505960941314698, + "step": 85260 + }, + { + "epoch": 0.36608193160059416, + "grad_norm": 1.3472764492034912, + "learning_rate": 6.366556574079664e-05, + "loss": 0.30374248027801515, + "step": 85270 + }, + { + "epoch": 0.3661248636906142, + "grad_norm": 0.033247269690036774, + "learning_rate": 6.366125402067902e-05, + "loss": 0.12651869058609008, + "step": 85280 + }, + { + "epoch": 0.3661677957806342, + "grad_norm": 0.18167521059513092, + "learning_rate": 6.365694230056138e-05, + "loss": 0.08280782699584961, + "step": 85290 + }, + { + "epoch": 0.3662107278706542, + "grad_norm": 0.9661501049995422, + "learning_rate": 6.365263058044376e-05, + "loss": 0.06233731508255005, + "step": 85300 + }, + { + "epoch": 0.3662536599606742, + "grad_norm": 1.2363988161087036, + "learning_rate": 6.364831886032613e-05, + "loss": 0.36128768920898435, + "step": 85310 + }, + { + "epoch": 0.3662965920506942, + "grad_norm": 0.04574267193675041, + "learning_rate": 6.364400714020851e-05, + "loss": 0.21424858570098876, + "step": 85320 + }, + { + "epoch": 0.36633952414071425, + "grad_norm": 3.4972920417785645, + "learning_rate": 6.363969542009089e-05, + "loss": 0.3097004175186157, + "step": 85330 + }, + { + "epoch": 0.36638245623073423, + "grad_norm": 8.96206283569336, + "learning_rate": 6.363538369997327e-05, + "loss": 0.33934545516967773, + "step": 85340 + }, + { + "epoch": 0.3664253883207542, + "grad_norm": 0.020525289699435234, + "learning_rate": 6.363107197985564e-05, + "loss": 0.30977373123168944, + "step": 85350 + }, + { + "epoch": 0.36646832041077426, + "grad_norm": 0.051443614065647125, + "learning_rate": 6.362676025973802e-05, + "loss": 0.2037959337234497, + "step": 85360 + }, + { + "epoch": 0.36651125250079425, + "grad_norm": 0.0785449668765068, + "learning_rate": 6.36224485396204e-05, + "loss": 0.2925347089767456, + "step": 85370 + }, + { + "epoch": 0.36655418459081424, + "grad_norm": 0.15500248968601227, + "learning_rate": 6.361813681950278e-05, + "loss": 0.18055803775787355, + "step": 85380 + }, + { + "epoch": 0.3665971166808343, + "grad_norm": 0.03973287343978882, + "learning_rate": 6.361382509938515e-05, + "loss": 0.3326643228530884, + "step": 85390 + }, + { + "epoch": 0.36664004877085427, + "grad_norm": 0.893135130405426, + "learning_rate": 6.360951337926753e-05, + "loss": 0.297300910949707, + "step": 85400 + }, + { + "epoch": 0.36668298086087425, + "grad_norm": 4.013089656829834, + "learning_rate": 6.360520165914991e-05, + "loss": 0.3480620622634888, + "step": 85410 + }, + { + "epoch": 0.3667259129508943, + "grad_norm": 0.017691394314169884, + "learning_rate": 6.360088993903229e-05, + "loss": 0.03665188848972321, + "step": 85420 + }, + { + "epoch": 0.3667688450409143, + "grad_norm": 0.9386993646621704, + "learning_rate": 6.359657821891466e-05, + "loss": 0.3473350048065186, + "step": 85430 + }, + { + "epoch": 0.36681177713093427, + "grad_norm": 14.044384002685547, + "learning_rate": 6.359226649879704e-05, + "loss": 0.2297840118408203, + "step": 85440 + }, + { + "epoch": 0.3668547092209543, + "grad_norm": 1.6908752918243408, + "learning_rate": 6.35879547786794e-05, + "loss": 0.15974087715148927, + "step": 85450 + }, + { + "epoch": 0.3668976413109743, + "grad_norm": 3.4702935218811035, + "learning_rate": 6.358364305856178e-05, + "loss": 0.1769618272781372, + "step": 85460 + }, + { + "epoch": 0.3669405734009943, + "grad_norm": 0.022644072771072388, + "learning_rate": 6.357933133844416e-05, + "loss": 0.426633882522583, + "step": 85470 + }, + { + "epoch": 0.36698350549101433, + "grad_norm": 1.2880467176437378, + "learning_rate": 6.357501961832654e-05, + "loss": 0.2576338052749634, + "step": 85480 + }, + { + "epoch": 0.3670264375810343, + "grad_norm": 4.9231743812561035, + "learning_rate": 6.357070789820891e-05, + "loss": 0.3569629192352295, + "step": 85490 + }, + { + "epoch": 0.3670693696710543, + "grad_norm": 0.006700332276523113, + "learning_rate": 6.356639617809129e-05, + "loss": 0.1909146189689636, + "step": 85500 + }, + { + "epoch": 0.36711230176107434, + "grad_norm": 0.5436863303184509, + "learning_rate": 6.356208445797367e-05, + "loss": 0.20022847652435302, + "step": 85510 + }, + { + "epoch": 0.36715523385109433, + "grad_norm": 0.03987458720803261, + "learning_rate": 6.355777273785604e-05, + "loss": 0.2638335466384888, + "step": 85520 + }, + { + "epoch": 0.3671981659411143, + "grad_norm": 3.6243085861206055, + "learning_rate": 6.355346101773842e-05, + "loss": 0.271874213218689, + "step": 85530 + }, + { + "epoch": 0.36724109803113436, + "grad_norm": 1.5448380708694458, + "learning_rate": 6.354914929762079e-05, + "loss": 0.3061804294586182, + "step": 85540 + }, + { + "epoch": 0.36728403012115435, + "grad_norm": 0.32279542088508606, + "learning_rate": 6.354483757750316e-05, + "loss": 0.04409662783145905, + "step": 85550 + }, + { + "epoch": 0.3673269622111744, + "grad_norm": 0.03715287521481514, + "learning_rate": 6.354052585738554e-05, + "loss": 0.21204564571380616, + "step": 85560 + }, + { + "epoch": 0.3673698943011944, + "grad_norm": 0.0034397614654153585, + "learning_rate": 6.353621413726792e-05, + "loss": 0.05352402925491333, + "step": 85570 + }, + { + "epoch": 0.36741282639121436, + "grad_norm": 0.10549236088991165, + "learning_rate": 6.35319024171503e-05, + "loss": 0.2931922674179077, + "step": 85580 + }, + { + "epoch": 0.3674557584812344, + "grad_norm": 0.061459217220544815, + "learning_rate": 6.352759069703267e-05, + "loss": 0.10618021488189697, + "step": 85590 + }, + { + "epoch": 0.3674986905712544, + "grad_norm": 0.68232262134552, + "learning_rate": 6.352327897691506e-05, + "loss": 0.2445995330810547, + "step": 85600 + }, + { + "epoch": 0.3675416226612744, + "grad_norm": 0.13406546413898468, + "learning_rate": 6.351896725679744e-05, + "loss": 0.05700792670249939, + "step": 85610 + }, + { + "epoch": 0.3675845547512944, + "grad_norm": 1.9055348634719849, + "learning_rate": 6.35146555366798e-05, + "loss": 0.4043318748474121, + "step": 85620 + }, + { + "epoch": 0.3676274868413144, + "grad_norm": 1.7802194356918335, + "learning_rate": 6.351034381656218e-05, + "loss": 0.15079153776168824, + "step": 85630 + }, + { + "epoch": 0.3676704189313344, + "grad_norm": 3.792257785797119, + "learning_rate": 6.350603209644456e-05, + "loss": 0.026846662163734436, + "step": 85640 + }, + { + "epoch": 0.36771335102135444, + "grad_norm": 0.4064582586288452, + "learning_rate": 6.350172037632694e-05, + "loss": 0.27849345207214354, + "step": 85650 + }, + { + "epoch": 0.3677562831113744, + "grad_norm": 2.508204460144043, + "learning_rate": 6.349740865620931e-05, + "loss": 0.1909274935722351, + "step": 85660 + }, + { + "epoch": 0.3677992152013944, + "grad_norm": 1.741932988166809, + "learning_rate": 6.349309693609169e-05, + "loss": 0.28033435344696045, + "step": 85670 + }, + { + "epoch": 0.36784214729141446, + "grad_norm": 0.08544985949993134, + "learning_rate": 6.348878521597407e-05, + "loss": 0.2724045991897583, + "step": 85680 + }, + { + "epoch": 0.36788507938143444, + "grad_norm": 1.7382010221481323, + "learning_rate": 6.348447349585645e-05, + "loss": 0.4020527839660645, + "step": 85690 + }, + { + "epoch": 0.36792801147145443, + "grad_norm": 0.20443333685398102, + "learning_rate": 6.348016177573881e-05, + "loss": 0.25672647953033445, + "step": 85700 + }, + { + "epoch": 0.3679709435614745, + "grad_norm": 0.056993529200553894, + "learning_rate": 6.347585005562119e-05, + "loss": 0.10209436416625976, + "step": 85710 + }, + { + "epoch": 0.36801387565149446, + "grad_norm": 2.561861276626587, + "learning_rate": 6.347153833550356e-05, + "loss": 0.12558455467224122, + "step": 85720 + }, + { + "epoch": 0.36805680774151445, + "grad_norm": 0.21617703139781952, + "learning_rate": 6.346722661538594e-05, + "loss": 0.030121004581451415, + "step": 85730 + }, + { + "epoch": 0.3680997398315345, + "grad_norm": 1.5870252847671509, + "learning_rate": 6.346291489526832e-05, + "loss": 0.19039595127105713, + "step": 85740 + }, + { + "epoch": 0.3681426719215545, + "grad_norm": 1.228922963142395, + "learning_rate": 6.34586031751507e-05, + "loss": 0.3952412843704224, + "step": 85750 + }, + { + "epoch": 0.3681856040115745, + "grad_norm": 1.321328043937683, + "learning_rate": 6.345429145503307e-05, + "loss": 0.18042598962783812, + "step": 85760 + }, + { + "epoch": 0.3682285361015945, + "grad_norm": 1.9140307903289795, + "learning_rate": 6.344997973491545e-05, + "loss": 0.1848459005355835, + "step": 85770 + }, + { + "epoch": 0.3682714681916145, + "grad_norm": 0.00764481769874692, + "learning_rate": 6.344566801479782e-05, + "loss": 0.10012017488479615, + "step": 85780 + }, + { + "epoch": 0.36831440028163454, + "grad_norm": 0.9462195634841919, + "learning_rate": 6.344135629468019e-05, + "loss": 0.301893949508667, + "step": 85790 + }, + { + "epoch": 0.3683573323716545, + "grad_norm": 0.013289229944348335, + "learning_rate": 6.343704457456257e-05, + "loss": 0.3308419704437256, + "step": 85800 + }, + { + "epoch": 0.3684002644616745, + "grad_norm": 0.002317358274012804, + "learning_rate": 6.343273285444495e-05, + "loss": 0.30897011756896975, + "step": 85810 + }, + { + "epoch": 0.36844319655169455, + "grad_norm": 0.2647899091243744, + "learning_rate": 6.342842113432734e-05, + "loss": 0.1876887321472168, + "step": 85820 + }, + { + "epoch": 0.36848612864171454, + "grad_norm": 0.22192956507205963, + "learning_rate": 6.342410941420972e-05, + "loss": 0.0698373019695282, + "step": 85830 + }, + { + "epoch": 0.3685290607317345, + "grad_norm": 0.0032855840399861336, + "learning_rate": 6.341979769409209e-05, + "loss": 0.205841588973999, + "step": 85840 + }, + { + "epoch": 0.36857199282175457, + "grad_norm": 0.017166977748274803, + "learning_rate": 6.341548597397447e-05, + "loss": 0.39359591007232664, + "step": 85850 + }, + { + "epoch": 0.36861492491177456, + "grad_norm": 2.2726593017578125, + "learning_rate": 6.341117425385685e-05, + "loss": 0.2558701992034912, + "step": 85860 + }, + { + "epoch": 0.36865785700179454, + "grad_norm": 4.76535177230835, + "learning_rate": 6.340686253373921e-05, + "loss": 0.4349191188812256, + "step": 85870 + }, + { + "epoch": 0.3687007890918146, + "grad_norm": 0.05056734010577202, + "learning_rate": 6.340255081362159e-05, + "loss": 0.15551151037216188, + "step": 85880 + }, + { + "epoch": 0.36874372118183457, + "grad_norm": 0.014606560580432415, + "learning_rate": 6.339823909350397e-05, + "loss": 0.10910115242004395, + "step": 85890 + }, + { + "epoch": 0.36878665327185456, + "grad_norm": 4.6601128578186035, + "learning_rate": 6.339392737338634e-05, + "loss": 0.41695199012756345, + "step": 85900 + }, + { + "epoch": 0.3688295853618746, + "grad_norm": 0.02767966315150261, + "learning_rate": 6.338961565326872e-05, + "loss": 0.26373655796051027, + "step": 85910 + }, + { + "epoch": 0.3688725174518946, + "grad_norm": 0.00913521833717823, + "learning_rate": 6.33853039331511e-05, + "loss": 0.13180015087127686, + "step": 85920 + }, + { + "epoch": 0.3689154495419146, + "grad_norm": 2.545062780380249, + "learning_rate": 6.338099221303348e-05, + "loss": 0.1353710412979126, + "step": 85930 + }, + { + "epoch": 0.3689583816319346, + "grad_norm": 1.2263450622558594, + "learning_rate": 6.337668049291585e-05, + "loss": 0.5144428253173828, + "step": 85940 + }, + { + "epoch": 0.3690013137219546, + "grad_norm": 0.11725396662950516, + "learning_rate": 6.337236877279822e-05, + "loss": 0.09795107245445252, + "step": 85950 + }, + { + "epoch": 0.3690442458119746, + "grad_norm": 0.9828706979751587, + "learning_rate": 6.33680570526806e-05, + "loss": 0.08187245130538941, + "step": 85960 + }, + { + "epoch": 0.36908717790199463, + "grad_norm": 1.8195759057998657, + "learning_rate": 6.336374533256297e-05, + "loss": 0.3792530298233032, + "step": 85970 + }, + { + "epoch": 0.3691301099920146, + "grad_norm": 6.6570000648498535, + "learning_rate": 6.335943361244535e-05, + "loss": 0.1150534987449646, + "step": 85980 + }, + { + "epoch": 0.36917304208203466, + "grad_norm": 0.7525167465209961, + "learning_rate": 6.335512189232773e-05, + "loss": 0.15633391141891478, + "step": 85990 + }, + { + "epoch": 0.36921597417205465, + "grad_norm": 0.055644210427999496, + "learning_rate": 6.33508101722101e-05, + "loss": 0.04366555511951446, + "step": 86000 + }, + { + "epoch": 0.36921597417205465, + "eval_loss": 0.4317818582057953, + "eval_runtime": 27.1777, + "eval_samples_per_second": 3.679, + "eval_steps_per_second": 3.679, + "step": 86000 + }, + { + "epoch": 0.36925890626207464, + "grad_norm": 0.035437412559986115, + "learning_rate": 6.334649845209248e-05, + "loss": 0.26076564788818357, + "step": 86010 + }, + { + "epoch": 0.3693018383520947, + "grad_norm": 0.12425762414932251, + "learning_rate": 6.334218673197486e-05, + "loss": 0.3686336040496826, + "step": 86020 + }, + { + "epoch": 0.36934477044211467, + "grad_norm": 0.00577976368367672, + "learning_rate": 6.333787501185722e-05, + "loss": 0.19180816411972046, + "step": 86030 + }, + { + "epoch": 0.36938770253213465, + "grad_norm": 0.10685226321220398, + "learning_rate": 6.333356329173961e-05, + "loss": 0.14715638160705566, + "step": 86040 + }, + { + "epoch": 0.3694306346221547, + "grad_norm": 0.037477098405361176, + "learning_rate": 6.332925157162199e-05, + "loss": 0.1778426170349121, + "step": 86050 + }, + { + "epoch": 0.3694735667121747, + "grad_norm": 0.2366664856672287, + "learning_rate": 6.332493985150437e-05, + "loss": 0.23205168247222902, + "step": 86060 + }, + { + "epoch": 0.36951649880219467, + "grad_norm": 16.036067962646484, + "learning_rate": 6.332062813138674e-05, + "loss": 0.14291188716888428, + "step": 86070 + }, + { + "epoch": 0.3695594308922147, + "grad_norm": 0.0030410068575292826, + "learning_rate": 6.331631641126912e-05, + "loss": 0.0985245168209076, + "step": 86080 + }, + { + "epoch": 0.3696023629822347, + "grad_norm": 2.2322115898132324, + "learning_rate": 6.33120046911515e-05, + "loss": 0.3554996013641357, + "step": 86090 + }, + { + "epoch": 0.3696452950722547, + "grad_norm": 2.5456347465515137, + "learning_rate": 6.330769297103388e-05, + "loss": 0.11362524032592773, + "step": 86100 + }, + { + "epoch": 0.36968822716227473, + "grad_norm": 0.03111317940056324, + "learning_rate": 6.330338125091624e-05, + "loss": 0.39795794486999514, + "step": 86110 + }, + { + "epoch": 0.3697311592522947, + "grad_norm": 1.2640771865844727, + "learning_rate": 6.329906953079862e-05, + "loss": 0.3355656623840332, + "step": 86120 + }, + { + "epoch": 0.3697740913423147, + "grad_norm": 0.7932472229003906, + "learning_rate": 6.3294757810681e-05, + "loss": 0.19024043083190917, + "step": 86130 + }, + { + "epoch": 0.36981702343233475, + "grad_norm": 1.5137596130371094, + "learning_rate": 6.329044609056337e-05, + "loss": 0.21221389770507812, + "step": 86140 + }, + { + "epoch": 0.36985995552235473, + "grad_norm": 0.9180253148078918, + "learning_rate": 6.328613437044575e-05, + "loss": 0.3574000358581543, + "step": 86150 + }, + { + "epoch": 0.3699028876123747, + "grad_norm": 3.4195072650909424, + "learning_rate": 6.328182265032813e-05, + "loss": 0.33246517181396484, + "step": 86160 + }, + { + "epoch": 0.36994581970239476, + "grad_norm": 0.40100616216659546, + "learning_rate": 6.32775109302105e-05, + "loss": 0.14388233423233032, + "step": 86170 + }, + { + "epoch": 0.36998875179241475, + "grad_norm": 0.16329815983772278, + "learning_rate": 6.327319921009288e-05, + "loss": 0.23172502517700194, + "step": 86180 + }, + { + "epoch": 0.3700316838824348, + "grad_norm": 0.4270736873149872, + "learning_rate": 6.326888748997526e-05, + "loss": 0.17500052452087403, + "step": 86190 + }, + { + "epoch": 0.3700746159724548, + "grad_norm": 1.2367000579833984, + "learning_rate": 6.326457576985762e-05, + "loss": 0.1254699110984802, + "step": 86200 + }, + { + "epoch": 0.37011754806247477, + "grad_norm": 0.02919580042362213, + "learning_rate": 6.326026404974e-05, + "loss": 0.2366110563278198, + "step": 86210 + }, + { + "epoch": 0.3701604801524948, + "grad_norm": 0.2189096361398697, + "learning_rate": 6.325595232962238e-05, + "loss": 0.16678482294082642, + "step": 86220 + }, + { + "epoch": 0.3702034122425148, + "grad_norm": 0.012859572656452656, + "learning_rate": 6.325164060950475e-05, + "loss": 0.3061917543411255, + "step": 86230 + }, + { + "epoch": 0.3702463443325348, + "grad_norm": 0.0979384034872055, + "learning_rate": 6.324732888938713e-05, + "loss": 0.16311391592025756, + "step": 86240 + }, + { + "epoch": 0.3702892764225548, + "grad_norm": 0.9386596083641052, + "learning_rate": 6.324301716926951e-05, + "loss": 0.14398750066757202, + "step": 86250 + }, + { + "epoch": 0.3703322085125748, + "grad_norm": 0.08035529404878616, + "learning_rate": 6.323870544915189e-05, + "loss": 0.2772280931472778, + "step": 86260 + }, + { + "epoch": 0.3703751406025948, + "grad_norm": 0.00924977008253336, + "learning_rate": 6.323439372903426e-05, + "loss": 0.3513696908950806, + "step": 86270 + }, + { + "epoch": 0.37041807269261484, + "grad_norm": 0.46690458059310913, + "learning_rate": 6.323008200891664e-05, + "loss": 0.19348812103271484, + "step": 86280 + }, + { + "epoch": 0.37046100478263483, + "grad_norm": 0.008506237529218197, + "learning_rate": 6.322577028879902e-05, + "loss": 0.18626704216003417, + "step": 86290 + }, + { + "epoch": 0.3705039368726548, + "grad_norm": 1.465051531791687, + "learning_rate": 6.32214585686814e-05, + "loss": 0.085543692111969, + "step": 86300 + }, + { + "epoch": 0.37054686896267486, + "grad_norm": 0.24718154966831207, + "learning_rate": 6.321714684856377e-05, + "loss": 0.5089605808258056, + "step": 86310 + }, + { + "epoch": 0.37058980105269484, + "grad_norm": 1.2836334705352783, + "learning_rate": 6.321283512844615e-05, + "loss": 0.313499903678894, + "step": 86320 + }, + { + "epoch": 0.37063273314271483, + "grad_norm": 2.7097294330596924, + "learning_rate": 6.320852340832853e-05, + "loss": 0.2826047420501709, + "step": 86330 + }, + { + "epoch": 0.3706756652327349, + "grad_norm": 10.594082832336426, + "learning_rate": 6.32042116882109e-05, + "loss": 0.2791079759597778, + "step": 86340 + }, + { + "epoch": 0.37071859732275486, + "grad_norm": 0.009699574671685696, + "learning_rate": 6.319989996809328e-05, + "loss": 0.16034698486328125, + "step": 86350 + }, + { + "epoch": 0.37076152941277485, + "grad_norm": 0.37866318225860596, + "learning_rate": 6.319558824797565e-05, + "loss": 0.1663529634475708, + "step": 86360 + }, + { + "epoch": 0.3708044615027949, + "grad_norm": 6.374588489532471, + "learning_rate": 6.319127652785802e-05, + "loss": 0.2670762538909912, + "step": 86370 + }, + { + "epoch": 0.3708473935928149, + "grad_norm": 0.12831802666187286, + "learning_rate": 6.31869648077404e-05, + "loss": 0.3458155393600464, + "step": 86380 + }, + { + "epoch": 0.37089032568283486, + "grad_norm": 0.013740170747041702, + "learning_rate": 6.318265308762278e-05, + "loss": 0.19004298448562623, + "step": 86390 + }, + { + "epoch": 0.3709332577728549, + "grad_norm": 0.009914065711200237, + "learning_rate": 6.317834136750516e-05, + "loss": 0.18159607648849488, + "step": 86400 + }, + { + "epoch": 0.3709761898628749, + "grad_norm": 0.11303571611642838, + "learning_rate": 6.317402964738753e-05, + "loss": 0.19277287721633912, + "step": 86410 + }, + { + "epoch": 0.37101912195289494, + "grad_norm": 0.2996561825275421, + "learning_rate": 6.316971792726991e-05, + "loss": 0.09895474910736084, + "step": 86420 + }, + { + "epoch": 0.3710620540429149, + "grad_norm": 0.4050360321998596, + "learning_rate": 6.316540620715229e-05, + "loss": 0.29994912147521974, + "step": 86430 + }, + { + "epoch": 0.3711049861329349, + "grad_norm": 2.208390474319458, + "learning_rate": 6.316109448703465e-05, + "loss": 0.2208533763885498, + "step": 86440 + }, + { + "epoch": 0.37114791822295495, + "grad_norm": 0.021781016141176224, + "learning_rate": 6.315678276691703e-05, + "loss": 0.09090492129325867, + "step": 86450 + }, + { + "epoch": 0.37119085031297494, + "grad_norm": 0.028992371633648872, + "learning_rate": 6.31524710467994e-05, + "loss": 0.2799768209457397, + "step": 86460 + }, + { + "epoch": 0.3712337824029949, + "grad_norm": 0.1658248007297516, + "learning_rate": 6.314815932668178e-05, + "loss": 0.1564157485961914, + "step": 86470 + }, + { + "epoch": 0.37127671449301497, + "grad_norm": 12.077330589294434, + "learning_rate": 6.314384760656416e-05, + "loss": 0.25147428512573244, + "step": 86480 + }, + { + "epoch": 0.37131964658303496, + "grad_norm": 0.13617444038391113, + "learning_rate": 6.313953588644654e-05, + "loss": 0.4071957588195801, + "step": 86490 + }, + { + "epoch": 0.37136257867305494, + "grad_norm": 1.4712886810302734, + "learning_rate": 6.313522416632892e-05, + "loss": 0.1500526785850525, + "step": 86500 + }, + { + "epoch": 0.371405510763075, + "grad_norm": 0.09021810442209244, + "learning_rate": 6.31309124462113e-05, + "loss": 0.07514996528625488, + "step": 86510 + }, + { + "epoch": 0.371448442853095, + "grad_norm": 6.454043388366699, + "learning_rate": 6.312660072609367e-05, + "loss": 0.5268967628479004, + "step": 86520 + }, + { + "epoch": 0.37149137494311496, + "grad_norm": 0.22059176862239838, + "learning_rate": 6.312228900597605e-05, + "loss": 0.21707556247711182, + "step": 86530 + }, + { + "epoch": 0.371534307033135, + "grad_norm": 1.2967723608016968, + "learning_rate": 6.311797728585843e-05, + "loss": 0.2733563184738159, + "step": 86540 + }, + { + "epoch": 0.371577239123155, + "grad_norm": 1.554063081741333, + "learning_rate": 6.31136655657408e-05, + "loss": 0.16565685272216796, + "step": 86550 + }, + { + "epoch": 0.371620171213175, + "grad_norm": 0.09298480302095413, + "learning_rate": 6.310935384562318e-05, + "loss": 0.18474723100662233, + "step": 86560 + }, + { + "epoch": 0.371663103303195, + "grad_norm": 4.236249923706055, + "learning_rate": 6.310504212550556e-05, + "loss": 0.3226930618286133, + "step": 86570 + }, + { + "epoch": 0.371706035393215, + "grad_norm": 0.6600306630134583, + "learning_rate": 6.310073040538793e-05, + "loss": 0.2629078388214111, + "step": 86580 + }, + { + "epoch": 0.371748967483235, + "grad_norm": 2.777444839477539, + "learning_rate": 6.309641868527031e-05, + "loss": 0.17999004125595092, + "step": 86590 + }, + { + "epoch": 0.37179189957325504, + "grad_norm": 0.15516115725040436, + "learning_rate": 6.309210696515269e-05, + "loss": 0.19380651712417601, + "step": 86600 + }, + { + "epoch": 0.371834831663275, + "grad_norm": 0.7865200638771057, + "learning_rate": 6.308779524503505e-05, + "loss": 0.2544250965118408, + "step": 86610 + }, + { + "epoch": 0.37187776375329507, + "grad_norm": 0.14352178573608398, + "learning_rate": 6.308348352491743e-05, + "loss": 0.3288907527923584, + "step": 86620 + }, + { + "epoch": 0.37192069584331505, + "grad_norm": 64.95818328857422, + "learning_rate": 6.307917180479981e-05, + "loss": 0.3481205940246582, + "step": 86630 + }, + { + "epoch": 0.37196362793333504, + "grad_norm": 5.94766092300415, + "learning_rate": 6.307486008468219e-05, + "loss": 0.26511225700378416, + "step": 86640 + }, + { + "epoch": 0.3720065600233551, + "grad_norm": 0.028272368013858795, + "learning_rate": 6.307054836456456e-05, + "loss": 0.24481871128082275, + "step": 86650 + }, + { + "epoch": 0.37204949211337507, + "grad_norm": 0.1931532770395279, + "learning_rate": 6.306623664444694e-05, + "loss": 0.0944499909877777, + "step": 86660 + }, + { + "epoch": 0.37209242420339506, + "grad_norm": 0.02626582235097885, + "learning_rate": 6.306192492432932e-05, + "loss": 0.17654716968536377, + "step": 86670 + }, + { + "epoch": 0.3721353562934151, + "grad_norm": 5.341935634613037, + "learning_rate": 6.30576132042117e-05, + "loss": 0.2550107479095459, + "step": 86680 + }, + { + "epoch": 0.3721782883834351, + "grad_norm": 0.019004661589860916, + "learning_rate": 6.305330148409406e-05, + "loss": 0.2920238018035889, + "step": 86690 + }, + { + "epoch": 0.37222122047345507, + "grad_norm": 0.12061820179224014, + "learning_rate": 6.304898976397644e-05, + "loss": 0.13331669569015503, + "step": 86700 + }, + { + "epoch": 0.3722641525634751, + "grad_norm": 0.004373606294393539, + "learning_rate": 6.304467804385881e-05, + "loss": 0.14938576221466066, + "step": 86710 + }, + { + "epoch": 0.3723070846534951, + "grad_norm": 0.06134732440114021, + "learning_rate": 6.304036632374119e-05, + "loss": 0.3664681434631348, + "step": 86720 + }, + { + "epoch": 0.3723500167435151, + "grad_norm": 0.03127064183354378, + "learning_rate": 6.303605460362357e-05, + "loss": 0.19766569137573242, + "step": 86730 + }, + { + "epoch": 0.37239294883353513, + "grad_norm": 0.6509777903556824, + "learning_rate": 6.303174288350595e-05, + "loss": 0.3099424123764038, + "step": 86740 + }, + { + "epoch": 0.3724358809235551, + "grad_norm": 0.10416044294834137, + "learning_rate": 6.302743116338832e-05, + "loss": 0.20144286155700683, + "step": 86750 + }, + { + "epoch": 0.3724788130135751, + "grad_norm": 0.034175265580415726, + "learning_rate": 6.30231194432707e-05, + "loss": 0.3239238739013672, + "step": 86760 + }, + { + "epoch": 0.37252174510359515, + "grad_norm": 1.1124687194824219, + "learning_rate": 6.301880772315308e-05, + "loss": 0.3482966899871826, + "step": 86770 + }, + { + "epoch": 0.37256467719361513, + "grad_norm": 2.5001158714294434, + "learning_rate": 6.301449600303545e-05, + "loss": 0.1421452760696411, + "step": 86780 + }, + { + "epoch": 0.3726076092836351, + "grad_norm": 2.893115520477295, + "learning_rate": 6.301018428291783e-05, + "loss": 0.2680665493011475, + "step": 86790 + }, + { + "epoch": 0.37265054137365516, + "grad_norm": 1.1407378911972046, + "learning_rate": 6.300587256280021e-05, + "loss": 0.24747891426086427, + "step": 86800 + }, + { + "epoch": 0.37269347346367515, + "grad_norm": 3.4893882274627686, + "learning_rate": 6.300156084268259e-05, + "loss": 0.17726287841796876, + "step": 86810 + }, + { + "epoch": 0.37273640555369514, + "grad_norm": 0.4985063672065735, + "learning_rate": 6.299724912256496e-05, + "loss": 0.195892333984375, + "step": 86820 + }, + { + "epoch": 0.3727793376437152, + "grad_norm": 3.9934985637664795, + "learning_rate": 6.299293740244734e-05, + "loss": 0.15681538581848145, + "step": 86830 + }, + { + "epoch": 0.37282226973373517, + "grad_norm": 2.1810057163238525, + "learning_rate": 6.298862568232972e-05, + "loss": 0.2681360006332397, + "step": 86840 + }, + { + "epoch": 0.3728652018237552, + "grad_norm": 2.539444923400879, + "learning_rate": 6.298431396221208e-05, + "loss": 0.030637264251708984, + "step": 86850 + }, + { + "epoch": 0.3729081339137752, + "grad_norm": 0.027268722653388977, + "learning_rate": 6.298000224209446e-05, + "loss": 0.35657052993774413, + "step": 86860 + }, + { + "epoch": 0.3729510660037952, + "grad_norm": 0.06975622475147247, + "learning_rate": 6.297569052197684e-05, + "loss": 0.014813748002052308, + "step": 86870 + }, + { + "epoch": 0.3729939980938152, + "grad_norm": 0.36983099579811096, + "learning_rate": 6.297137880185921e-05, + "loss": 0.2915071487426758, + "step": 86880 + }, + { + "epoch": 0.3730369301838352, + "grad_norm": 0.021252155303955078, + "learning_rate": 6.296706708174159e-05, + "loss": 0.1459917426109314, + "step": 86890 + }, + { + "epoch": 0.3730798622738552, + "grad_norm": 3.1471076011657715, + "learning_rate": 6.296275536162397e-05, + "loss": 0.36190290451049806, + "step": 86900 + }, + { + "epoch": 0.37312279436387524, + "grad_norm": 0.017519652843475342, + "learning_rate": 6.295844364150635e-05, + "loss": 0.09693054556846618, + "step": 86910 + }, + { + "epoch": 0.37316572645389523, + "grad_norm": 2.969122886657715, + "learning_rate": 6.295413192138872e-05, + "loss": 0.31624269485473633, + "step": 86920 + }, + { + "epoch": 0.3732086585439152, + "grad_norm": 5.2511420249938965, + "learning_rate": 6.29498202012711e-05, + "loss": 0.22679619789123534, + "step": 86930 + }, + { + "epoch": 0.37325159063393526, + "grad_norm": 0.019706908613443375, + "learning_rate": 6.294550848115347e-05, + "loss": 0.2215877056121826, + "step": 86940 + }, + { + "epoch": 0.37329452272395525, + "grad_norm": 5.416234493255615, + "learning_rate": 6.294119676103584e-05, + "loss": 0.38700578212738035, + "step": 86950 + }, + { + "epoch": 0.37333745481397523, + "grad_norm": 2.552222490310669, + "learning_rate": 6.293688504091822e-05, + "loss": 0.3708165168762207, + "step": 86960 + }, + { + "epoch": 0.3733803869039953, + "grad_norm": 0.34137478470802307, + "learning_rate": 6.29325733208006e-05, + "loss": 0.19809558391571044, + "step": 86970 + }, + { + "epoch": 0.37342331899401526, + "grad_norm": 0.11157994717359543, + "learning_rate": 6.292826160068297e-05, + "loss": 0.25876359939575194, + "step": 86980 + }, + { + "epoch": 0.37346625108403525, + "grad_norm": 2.443289041519165, + "learning_rate": 6.292394988056535e-05, + "loss": 0.14729411602020265, + "step": 86990 + }, + { + "epoch": 0.3735091831740553, + "grad_norm": 0.17522040009498596, + "learning_rate": 6.291963816044774e-05, + "loss": 0.3029524564743042, + "step": 87000 + }, + { + "epoch": 0.3735091831740553, + "eval_loss": 0.4400550425052643, + "eval_runtime": 27.2101, + "eval_samples_per_second": 3.675, + "eval_steps_per_second": 3.675, + "step": 87000 + }, + { + "epoch": 0.3735521152640753, + "grad_norm": 2.0458552837371826, + "learning_rate": 6.291532644033012e-05, + "loss": 0.18736555576324462, + "step": 87010 + }, + { + "epoch": 0.37359504735409527, + "grad_norm": 2.114137649536133, + "learning_rate": 6.291101472021248e-05, + "loss": 0.05374835729598999, + "step": 87020 + }, + { + "epoch": 0.3736379794441153, + "grad_norm": 0.018394406884908676, + "learning_rate": 6.290670300009486e-05, + "loss": 0.18129925727844237, + "step": 87030 + }, + { + "epoch": 0.3736809115341353, + "grad_norm": 0.13711842894554138, + "learning_rate": 6.290239127997724e-05, + "loss": 0.2827707052230835, + "step": 87040 + }, + { + "epoch": 0.37372384362415534, + "grad_norm": 0.027511335909366608, + "learning_rate": 6.289807955985962e-05, + "loss": 0.2441173553466797, + "step": 87050 + }, + { + "epoch": 0.3737667757141753, + "grad_norm": 2.395310640335083, + "learning_rate": 6.289376783974199e-05, + "loss": 0.11282216310501099, + "step": 87060 + }, + { + "epoch": 0.3738097078041953, + "grad_norm": 5.538151741027832, + "learning_rate": 6.288945611962437e-05, + "loss": 0.0628732681274414, + "step": 87070 + }, + { + "epoch": 0.37385263989421536, + "grad_norm": 0.09157660603523254, + "learning_rate": 6.288514439950675e-05, + "loss": 0.09940847158432006, + "step": 87080 + }, + { + "epoch": 0.37389557198423534, + "grad_norm": 0.004044859204441309, + "learning_rate": 6.288083267938913e-05, + "loss": 0.012469526380300522, + "step": 87090 + }, + { + "epoch": 0.37393850407425533, + "grad_norm": 0.02948996238410473, + "learning_rate": 6.287652095927149e-05, + "loss": 0.22384850978851317, + "step": 87100 + }, + { + "epoch": 0.37398143616427537, + "grad_norm": 0.02249191142618656, + "learning_rate": 6.287220923915387e-05, + "loss": 0.2673391580581665, + "step": 87110 + }, + { + "epoch": 0.37402436825429536, + "grad_norm": 0.003082884708419442, + "learning_rate": 6.286789751903624e-05, + "loss": 0.12813795804977418, + "step": 87120 + }, + { + "epoch": 0.37406730034431535, + "grad_norm": 0.01360277272760868, + "learning_rate": 6.286358579891862e-05, + "loss": 0.22645974159240723, + "step": 87130 + }, + { + "epoch": 0.3741102324343354, + "grad_norm": 0.021722465753555298, + "learning_rate": 6.2859274078801e-05, + "loss": 0.1636356830596924, + "step": 87140 + }, + { + "epoch": 0.3741531645243554, + "grad_norm": 4.585870742797852, + "learning_rate": 6.285496235868338e-05, + "loss": 0.2738921642303467, + "step": 87150 + }, + { + "epoch": 0.37419609661437536, + "grad_norm": 2.0261483192443848, + "learning_rate": 6.285065063856575e-05, + "loss": 0.2621325969696045, + "step": 87160 + }, + { + "epoch": 0.3742390287043954, + "grad_norm": 1.8282241821289062, + "learning_rate": 6.284633891844813e-05, + "loss": 0.23655502796173095, + "step": 87170 + }, + { + "epoch": 0.3742819607944154, + "grad_norm": 0.038744006305933, + "learning_rate": 6.28420271983305e-05, + "loss": 0.28763017654418943, + "step": 87180 + }, + { + "epoch": 0.3743248928844354, + "grad_norm": 0.030506085604429245, + "learning_rate": 6.283771547821287e-05, + "loss": 0.1275754451751709, + "step": 87190 + }, + { + "epoch": 0.3743678249744554, + "grad_norm": 0.0035087086725980043, + "learning_rate": 6.283340375809525e-05, + "loss": 0.1500408411026001, + "step": 87200 + }, + { + "epoch": 0.3744107570644754, + "grad_norm": 5.859970569610596, + "learning_rate": 6.282909203797763e-05, + "loss": 0.2530032157897949, + "step": 87210 + }, + { + "epoch": 0.3744536891544954, + "grad_norm": 0.0439981184899807, + "learning_rate": 6.282478031786002e-05, + "loss": 0.2651748895645142, + "step": 87220 + }, + { + "epoch": 0.37449662124451544, + "grad_norm": 3.4375038146972656, + "learning_rate": 6.28204685977424e-05, + "loss": 0.2220317840576172, + "step": 87230 + }, + { + "epoch": 0.3745395533345354, + "grad_norm": 0.005168871488422155, + "learning_rate": 6.281615687762477e-05, + "loss": 0.43964247703552245, + "step": 87240 + }, + { + "epoch": 0.3745824854245554, + "grad_norm": 0.041050877422094345, + "learning_rate": 6.281184515750715e-05, + "loss": 0.4249211311340332, + "step": 87250 + }, + { + "epoch": 0.37462541751457545, + "grad_norm": 0.08297502249479294, + "learning_rate": 6.280753343738951e-05, + "loss": 0.4031353950500488, + "step": 87260 + }, + { + "epoch": 0.37466834960459544, + "grad_norm": 0.10702119022607803, + "learning_rate": 6.280322171727189e-05, + "loss": 0.5200258731842041, + "step": 87270 + }, + { + "epoch": 0.3747112816946155, + "grad_norm": 0.04025140777230263, + "learning_rate": 6.279890999715427e-05, + "loss": 0.10984960794448853, + "step": 87280 + }, + { + "epoch": 0.37475421378463547, + "grad_norm": 0.033183638006448746, + "learning_rate": 6.279459827703664e-05, + "loss": 0.32496328353881837, + "step": 87290 + }, + { + "epoch": 0.37479714587465546, + "grad_norm": 4.962788105010986, + "learning_rate": 6.279028655691902e-05, + "loss": 0.18989670276641846, + "step": 87300 + }, + { + "epoch": 0.3748400779646755, + "grad_norm": 0.16489867866039276, + "learning_rate": 6.27859748368014e-05, + "loss": 0.23146965503692626, + "step": 87310 + }, + { + "epoch": 0.3748830100546955, + "grad_norm": 0.2948302626609802, + "learning_rate": 6.278166311668378e-05, + "loss": 0.21655528545379638, + "step": 87320 + }, + { + "epoch": 0.3749259421447155, + "grad_norm": 1.196093201637268, + "learning_rate": 6.277735139656615e-05, + "loss": 0.39217898845672605, + "step": 87330 + }, + { + "epoch": 0.3749688742347355, + "grad_norm": 1.7001733779907227, + "learning_rate": 6.277303967644853e-05, + "loss": 0.25654077529907227, + "step": 87340 + }, + { + "epoch": 0.3750118063247555, + "grad_norm": 0.7644587755203247, + "learning_rate": 6.27687279563309e-05, + "loss": 0.2915396451950073, + "step": 87350 + }, + { + "epoch": 0.3750547384147755, + "grad_norm": 0.12401857227087021, + "learning_rate": 6.276441623621327e-05, + "loss": 0.3159175872802734, + "step": 87360 + }, + { + "epoch": 0.37509767050479553, + "grad_norm": 0.022166695445775986, + "learning_rate": 6.276010451609565e-05, + "loss": 0.1497678279876709, + "step": 87370 + }, + { + "epoch": 0.3751406025948155, + "grad_norm": 0.4812676012516022, + "learning_rate": 6.275579279597803e-05, + "loss": 0.12899582386016845, + "step": 87380 + }, + { + "epoch": 0.3751835346848355, + "grad_norm": 0.10921736806631088, + "learning_rate": 6.27514810758604e-05, + "loss": 0.1952446460723877, + "step": 87390 + }, + { + "epoch": 0.37522646677485555, + "grad_norm": 0.9592009782791138, + "learning_rate": 6.274716935574278e-05, + "loss": 0.2865952730178833, + "step": 87400 + }, + { + "epoch": 0.37526939886487554, + "grad_norm": 0.010206897743046284, + "learning_rate": 6.274285763562516e-05, + "loss": 0.2494706392288208, + "step": 87410 + }, + { + "epoch": 0.3753123309548955, + "grad_norm": 0.1299165040254593, + "learning_rate": 6.273854591550754e-05, + "loss": 0.1522657871246338, + "step": 87420 + }, + { + "epoch": 0.37535526304491557, + "grad_norm": 0.009295583702623844, + "learning_rate": 6.27342341953899e-05, + "loss": 0.01438492089509964, + "step": 87430 + }, + { + "epoch": 0.37539819513493555, + "grad_norm": 0.699145495891571, + "learning_rate": 6.272992247527229e-05, + "loss": 0.15755927562713623, + "step": 87440 + }, + { + "epoch": 0.37544112722495554, + "grad_norm": 13.779772758483887, + "learning_rate": 6.272561075515467e-05, + "loss": 0.10173404216766357, + "step": 87450 + }, + { + "epoch": 0.3754840593149756, + "grad_norm": 0.06877686083316803, + "learning_rate": 6.272129903503705e-05, + "loss": 0.11068353652954102, + "step": 87460 + }, + { + "epoch": 0.37552699140499557, + "grad_norm": 1.3501126766204834, + "learning_rate": 6.271698731491942e-05, + "loss": 0.38095693588256835, + "step": 87470 + }, + { + "epoch": 0.3755699234950156, + "grad_norm": 9.027297973632812, + "learning_rate": 6.27126755948018e-05, + "loss": 0.2303825855255127, + "step": 87480 + }, + { + "epoch": 0.3756128555850356, + "grad_norm": 0.003722363617271185, + "learning_rate": 6.270836387468418e-05, + "loss": 0.33140523433685304, + "step": 87490 + }, + { + "epoch": 0.3756557876750556, + "grad_norm": 0.3718756139278412, + "learning_rate": 6.270405215456656e-05, + "loss": 0.24214658737182618, + "step": 87500 + }, + { + "epoch": 0.37569871976507563, + "grad_norm": 3.6789193153381348, + "learning_rate": 6.269974043444892e-05, + "loss": 0.20105509757995604, + "step": 87510 + }, + { + "epoch": 0.3757416518550956, + "grad_norm": 0.7093227505683899, + "learning_rate": 6.26954287143313e-05, + "loss": 0.16223851442337037, + "step": 87520 + }, + { + "epoch": 0.3757845839451156, + "grad_norm": 0.2839501202106476, + "learning_rate": 6.269111699421367e-05, + "loss": 0.2748666763305664, + "step": 87530 + }, + { + "epoch": 0.37582751603513564, + "grad_norm": 0.004592269193381071, + "learning_rate": 6.268680527409605e-05, + "loss": 0.20933430194854735, + "step": 87540 + }, + { + "epoch": 0.37587044812515563, + "grad_norm": 1.9469878673553467, + "learning_rate": 6.268249355397843e-05, + "loss": 0.21942739486694335, + "step": 87550 + }, + { + "epoch": 0.3759133802151756, + "grad_norm": 0.05047018826007843, + "learning_rate": 6.26781818338608e-05, + "loss": 0.17022851705551148, + "step": 87560 + }, + { + "epoch": 0.37595631230519566, + "grad_norm": 0.0031462402548640966, + "learning_rate": 6.267387011374318e-05, + "loss": 0.15885262489318847, + "step": 87570 + }, + { + "epoch": 0.37599924439521565, + "grad_norm": 0.9981968402862549, + "learning_rate": 6.266955839362556e-05, + "loss": 0.40951828956604003, + "step": 87580 + }, + { + "epoch": 0.37604217648523564, + "grad_norm": 0.5256025791168213, + "learning_rate": 6.266524667350792e-05, + "loss": 0.03144158124923706, + "step": 87590 + }, + { + "epoch": 0.3760851085752557, + "grad_norm": 1.802929162979126, + "learning_rate": 6.26609349533903e-05, + "loss": 0.23492536544799805, + "step": 87600 + }, + { + "epoch": 0.37612804066527566, + "grad_norm": 2.9922046661376953, + "learning_rate": 6.265662323327268e-05, + "loss": 0.016963517665863036, + "step": 87610 + }, + { + "epoch": 0.37617097275529565, + "grad_norm": 12.366278648376465, + "learning_rate": 6.265231151315506e-05, + "loss": 0.33857011795043945, + "step": 87620 + }, + { + "epoch": 0.3762139048453157, + "grad_norm": 1.1289916038513184, + "learning_rate": 6.264799979303743e-05, + "loss": 0.4807577610015869, + "step": 87630 + }, + { + "epoch": 0.3762568369353357, + "grad_norm": 1.944948673248291, + "learning_rate": 6.264368807291981e-05, + "loss": 0.244700026512146, + "step": 87640 + }, + { + "epoch": 0.37629976902535567, + "grad_norm": 1.5617321729660034, + "learning_rate": 6.263937635280219e-05, + "loss": 0.3912205219268799, + "step": 87650 + }, + { + "epoch": 0.3763427011153757, + "grad_norm": 0.002875519683584571, + "learning_rate": 6.263506463268457e-05, + "loss": 0.07082734107971192, + "step": 87660 + }, + { + "epoch": 0.3763856332053957, + "grad_norm": 0.6047908663749695, + "learning_rate": 6.263075291256694e-05, + "loss": 0.03974010944366455, + "step": 87670 + }, + { + "epoch": 0.3764285652954157, + "grad_norm": 0.021802136674523354, + "learning_rate": 6.262644119244932e-05, + "loss": 0.12680743932723998, + "step": 87680 + }, + { + "epoch": 0.3764714973854357, + "grad_norm": 1.6270766258239746, + "learning_rate": 6.26221294723317e-05, + "loss": 0.2659365177154541, + "step": 87690 + }, + { + "epoch": 0.3765144294754557, + "grad_norm": 1.6786472797393799, + "learning_rate": 6.261781775221408e-05, + "loss": 0.26304676532745364, + "step": 87700 + }, + { + "epoch": 0.37655736156547576, + "grad_norm": 0.0038254903629422188, + "learning_rate": 6.261350603209645e-05, + "loss": 0.2967263698577881, + "step": 87710 + }, + { + "epoch": 0.37660029365549574, + "grad_norm": 0.021582119166851044, + "learning_rate": 6.260919431197883e-05, + "loss": 0.2736553907394409, + "step": 87720 + }, + { + "epoch": 0.37664322574551573, + "grad_norm": 2.2392396926879883, + "learning_rate": 6.260488259186121e-05, + "loss": 0.19963181018829346, + "step": 87730 + }, + { + "epoch": 0.3766861578355358, + "grad_norm": 3.8308897018432617, + "learning_rate": 6.260057087174358e-05, + "loss": 0.33628668785095217, + "step": 87740 + }, + { + "epoch": 0.37672908992555576, + "grad_norm": 4.784469127655029, + "learning_rate": 6.259625915162596e-05, + "loss": 0.16687393188476562, + "step": 87750 + }, + { + "epoch": 0.37677202201557575, + "grad_norm": 9.717080116271973, + "learning_rate": 6.259194743150833e-05, + "loss": 0.14312649965286256, + "step": 87760 + }, + { + "epoch": 0.3768149541055958, + "grad_norm": 0.1213330551981926, + "learning_rate": 6.25876357113907e-05, + "loss": 0.04287871420383453, + "step": 87770 + }, + { + "epoch": 0.3768578861956158, + "grad_norm": 3.9862990379333496, + "learning_rate": 6.258332399127308e-05, + "loss": 0.12331254482269287, + "step": 87780 + }, + { + "epoch": 0.37690081828563576, + "grad_norm": 0.02957821451127529, + "learning_rate": 6.257901227115546e-05, + "loss": 0.20847928524017334, + "step": 87790 + }, + { + "epoch": 0.3769437503756558, + "grad_norm": 0.5684049129486084, + "learning_rate": 6.257470055103784e-05, + "loss": 0.22962512969970703, + "step": 87800 + }, + { + "epoch": 0.3769866824656758, + "grad_norm": 1.9917635917663574, + "learning_rate": 6.257038883092021e-05, + "loss": 0.465483283996582, + "step": 87810 + }, + { + "epoch": 0.3770296145556958, + "grad_norm": 0.004031947813928127, + "learning_rate": 6.256607711080259e-05, + "loss": 0.10231343507766724, + "step": 87820 + }, + { + "epoch": 0.3770725466457158, + "grad_norm": 2.37412691116333, + "learning_rate": 6.256176539068497e-05, + "loss": 0.20844850540161133, + "step": 87830 + }, + { + "epoch": 0.3771154787357358, + "grad_norm": 0.03548434004187584, + "learning_rate": 6.255745367056733e-05, + "loss": 0.31219236850738524, + "step": 87840 + }, + { + "epoch": 0.3771584108257558, + "grad_norm": 0.015120014548301697, + "learning_rate": 6.255314195044971e-05, + "loss": 0.2350841760635376, + "step": 87850 + }, + { + "epoch": 0.37720134291577584, + "grad_norm": 0.021302489563822746, + "learning_rate": 6.254883023033209e-05, + "loss": 0.11671949625015259, + "step": 87860 + }, + { + "epoch": 0.3772442750057958, + "grad_norm": 0.023564008995890617, + "learning_rate": 6.254451851021446e-05, + "loss": 0.23691887855529786, + "step": 87870 + }, + { + "epoch": 0.3772872070958158, + "grad_norm": 2.984902858734131, + "learning_rate": 6.254020679009684e-05, + "loss": 0.24498653411865234, + "step": 87880 + }, + { + "epoch": 0.37733013918583586, + "grad_norm": 0.09107789397239685, + "learning_rate": 6.253589506997922e-05, + "loss": 0.21076819896697999, + "step": 87890 + }, + { + "epoch": 0.37737307127585584, + "grad_norm": 2.0704238414764404, + "learning_rate": 6.25315833498616e-05, + "loss": 0.13833118677139283, + "step": 87900 + }, + { + "epoch": 0.3774160033658759, + "grad_norm": 0.008586696349084377, + "learning_rate": 6.252727162974397e-05, + "loss": 0.34412851333618166, + "step": 87910 + }, + { + "epoch": 0.37745893545589587, + "grad_norm": 0.24232687056064606, + "learning_rate": 6.252295990962635e-05, + "loss": 0.1365652084350586, + "step": 87920 + }, + { + "epoch": 0.37750186754591586, + "grad_norm": 0.003949606791138649, + "learning_rate": 6.251864818950873e-05, + "loss": 0.26673967838287355, + "step": 87930 + }, + { + "epoch": 0.3775447996359359, + "grad_norm": 2.1608128547668457, + "learning_rate": 6.25143364693911e-05, + "loss": 0.29307947158813474, + "step": 87940 + }, + { + "epoch": 0.3775877317259559, + "grad_norm": 1.1866542100906372, + "learning_rate": 6.251002474927348e-05, + "loss": 0.33245224952697755, + "step": 87950 + }, + { + "epoch": 0.3776306638159759, + "grad_norm": 0.047932349145412445, + "learning_rate": 6.250571302915586e-05, + "loss": 0.08770357966423034, + "step": 87960 + }, + { + "epoch": 0.3776735959059959, + "grad_norm": 3.1019997596740723, + "learning_rate": 6.250140130903824e-05, + "loss": 0.18892154693603516, + "step": 87970 + }, + { + "epoch": 0.3777165279960159, + "grad_norm": 6.068072319030762, + "learning_rate": 6.249708958892061e-05, + "loss": 0.23206815719604493, + "step": 87980 + }, + { + "epoch": 0.3777594600860359, + "grad_norm": 0.910423219203949, + "learning_rate": 6.249277786880299e-05, + "loss": 0.35682499408721924, + "step": 87990 + }, + { + "epoch": 0.37780239217605593, + "grad_norm": 4.847745895385742, + "learning_rate": 6.248846614868535e-05, + "loss": 0.39348864555358887, + "step": 88000 + }, + { + "epoch": 0.37780239217605593, + "eval_loss": 0.4211534261703491, + "eval_runtime": 27.3365, + "eval_samples_per_second": 3.658, + "eval_steps_per_second": 3.658, + "step": 88000 + }, + { + "epoch": 0.3778453242660759, + "grad_norm": 1.5804665088653564, + "learning_rate": 6.248415442856773e-05, + "loss": 0.10657334327697754, + "step": 88010 + }, + { + "epoch": 0.3778882563560959, + "grad_norm": 2.5569381713867188, + "learning_rate": 6.247984270845011e-05, + "loss": 0.19821202754974365, + "step": 88020 + }, + { + "epoch": 0.37793118844611595, + "grad_norm": 0.07708010822534561, + "learning_rate": 6.247553098833249e-05, + "loss": 0.13068944215774536, + "step": 88030 + }, + { + "epoch": 0.37797412053613594, + "grad_norm": 0.36910587549209595, + "learning_rate": 6.247121926821486e-05, + "loss": 0.1964368462562561, + "step": 88040 + }, + { + "epoch": 0.3780170526261559, + "grad_norm": 0.6348251700401306, + "learning_rate": 6.246690754809724e-05, + "loss": 0.29259209632873534, + "step": 88050 + }, + { + "epoch": 0.37805998471617597, + "grad_norm": 1.2968003749847412, + "learning_rate": 6.246259582797962e-05, + "loss": 0.31879286766052245, + "step": 88060 + }, + { + "epoch": 0.37810291680619595, + "grad_norm": 0.01956726796925068, + "learning_rate": 6.2458284107862e-05, + "loss": 0.10881918668746948, + "step": 88070 + }, + { + "epoch": 0.37814584889621594, + "grad_norm": 0.05537987872958183, + "learning_rate": 6.245397238774437e-05, + "loss": 0.3085724592208862, + "step": 88080 + }, + { + "epoch": 0.378188780986236, + "grad_norm": 0.010545702651143074, + "learning_rate": 6.244966066762674e-05, + "loss": 0.153480064868927, + "step": 88090 + }, + { + "epoch": 0.37823171307625597, + "grad_norm": 6.235033988952637, + "learning_rate": 6.244534894750911e-05, + "loss": 0.30473511219024657, + "step": 88100 + }, + { + "epoch": 0.37827464516627596, + "grad_norm": 1.8143759965896606, + "learning_rate": 6.244103722739149e-05, + "loss": 0.35205180644989015, + "step": 88110 + }, + { + "epoch": 0.378317577256296, + "grad_norm": 1.0785503387451172, + "learning_rate": 6.243672550727387e-05, + "loss": 0.2504326343536377, + "step": 88120 + }, + { + "epoch": 0.378360509346316, + "grad_norm": 5.685330867767334, + "learning_rate": 6.243241378715625e-05, + "loss": 0.2820307970046997, + "step": 88130 + }, + { + "epoch": 0.37840344143633603, + "grad_norm": 0.021171852946281433, + "learning_rate": 6.242810206703862e-05, + "loss": 0.10080406665802003, + "step": 88140 + }, + { + "epoch": 0.378446373526356, + "grad_norm": 0.008075188845396042, + "learning_rate": 6.2423790346921e-05, + "loss": 0.0766272485256195, + "step": 88150 + }, + { + "epoch": 0.378489305616376, + "grad_norm": 0.523626446723938, + "learning_rate": 6.241947862680338e-05, + "loss": 0.3302738189697266, + "step": 88160 + }, + { + "epoch": 0.37853223770639605, + "grad_norm": 1.6001014709472656, + "learning_rate": 6.241516690668576e-05, + "loss": 0.12499821186065674, + "step": 88170 + }, + { + "epoch": 0.37857516979641603, + "grad_norm": 0.010254238732159138, + "learning_rate": 6.241085518656813e-05, + "loss": 0.1693050742149353, + "step": 88180 + }, + { + "epoch": 0.378618101886436, + "grad_norm": 0.06305219233036041, + "learning_rate": 6.240654346645051e-05, + "loss": 0.28610665798187257, + "step": 88190 + }, + { + "epoch": 0.37866103397645606, + "grad_norm": 0.9428789615631104, + "learning_rate": 6.240223174633289e-05, + "loss": 0.2268359899520874, + "step": 88200 + }, + { + "epoch": 0.37870396606647605, + "grad_norm": 0.20084306597709656, + "learning_rate": 6.239792002621527e-05, + "loss": 0.36552431583404543, + "step": 88210 + }, + { + "epoch": 0.37874689815649604, + "grad_norm": 1.3825594186782837, + "learning_rate": 6.239360830609764e-05, + "loss": 0.42622647285461424, + "step": 88220 + }, + { + "epoch": 0.3787898302465161, + "grad_norm": 0.0558270625770092, + "learning_rate": 6.238929658598002e-05, + "loss": 0.265299391746521, + "step": 88230 + }, + { + "epoch": 0.37883276233653607, + "grad_norm": 0.01165605615824461, + "learning_rate": 6.23849848658624e-05, + "loss": 0.12851730585098267, + "step": 88240 + }, + { + "epoch": 0.37887569442655605, + "grad_norm": 0.011059621348977089, + "learning_rate": 6.238067314574476e-05, + "loss": 0.295037055015564, + "step": 88250 + }, + { + "epoch": 0.3789186265165761, + "grad_norm": 30.9415340423584, + "learning_rate": 6.237636142562714e-05, + "loss": 0.330984902381897, + "step": 88260 + }, + { + "epoch": 0.3789615586065961, + "grad_norm": 0.08154928684234619, + "learning_rate": 6.237204970550952e-05, + "loss": 0.29265894889831545, + "step": 88270 + }, + { + "epoch": 0.37900449069661607, + "grad_norm": 5.459999084472656, + "learning_rate": 6.23677379853919e-05, + "loss": 0.4351907253265381, + "step": 88280 + }, + { + "epoch": 0.3790474227866361, + "grad_norm": 0.3953285217285156, + "learning_rate": 6.236342626527427e-05, + "loss": 0.13419573307037352, + "step": 88290 + }, + { + "epoch": 0.3790903548766561, + "grad_norm": 0.019297489896416664, + "learning_rate": 6.235911454515665e-05, + "loss": 0.2427154302597046, + "step": 88300 + }, + { + "epoch": 0.3791332869666761, + "grad_norm": 1.6823676824569702, + "learning_rate": 6.235480282503903e-05, + "loss": 0.17967859506607056, + "step": 88310 + }, + { + "epoch": 0.37917621905669613, + "grad_norm": 0.00811012089252472, + "learning_rate": 6.23504911049214e-05, + "loss": 0.1739598512649536, + "step": 88320 + }, + { + "epoch": 0.3792191511467161, + "grad_norm": 0.5991887450218201, + "learning_rate": 6.234617938480377e-05, + "loss": 0.09256443977355958, + "step": 88330 + }, + { + "epoch": 0.37926208323673616, + "grad_norm": 1.724471092224121, + "learning_rate": 6.234186766468614e-05, + "loss": 0.2317936897277832, + "step": 88340 + }, + { + "epoch": 0.37930501532675615, + "grad_norm": 0.15021900832653046, + "learning_rate": 6.233755594456852e-05, + "loss": 0.22777304649353028, + "step": 88350 + }, + { + "epoch": 0.37934794741677613, + "grad_norm": 0.007549840956926346, + "learning_rate": 6.23332442244509e-05, + "loss": 0.08671526312828064, + "step": 88360 + }, + { + "epoch": 0.3793908795067962, + "grad_norm": 0.0036326975096017122, + "learning_rate": 6.232893250433328e-05, + "loss": 0.2025750160217285, + "step": 88370 + }, + { + "epoch": 0.37943381159681616, + "grad_norm": 0.011411375366151333, + "learning_rate": 6.232462078421565e-05, + "loss": 0.32199418544769287, + "step": 88380 + }, + { + "epoch": 0.37947674368683615, + "grad_norm": 0.10104304552078247, + "learning_rate": 6.232030906409803e-05, + "loss": 0.2756567716598511, + "step": 88390 + }, + { + "epoch": 0.3795196757768562, + "grad_norm": 0.26092761754989624, + "learning_rate": 6.231599734398041e-05, + "loss": 0.038532555103302, + "step": 88400 + }, + { + "epoch": 0.3795626078668762, + "grad_norm": 0.013410309329628944, + "learning_rate": 6.23116856238628e-05, + "loss": 0.3681929111480713, + "step": 88410 + }, + { + "epoch": 0.37960553995689617, + "grad_norm": 0.0316680446267128, + "learning_rate": 6.230737390374516e-05, + "loss": 0.059136635065078734, + "step": 88420 + }, + { + "epoch": 0.3796484720469162, + "grad_norm": 0.10209963470697403, + "learning_rate": 6.230306218362754e-05, + "loss": 0.3428246021270752, + "step": 88430 + }, + { + "epoch": 0.3796914041369362, + "grad_norm": 0.009456177242100239, + "learning_rate": 6.229875046350992e-05, + "loss": 0.04225753843784332, + "step": 88440 + }, + { + "epoch": 0.3797343362269562, + "grad_norm": 1.0888117551803589, + "learning_rate": 6.22944387433923e-05, + "loss": 0.07131630182266235, + "step": 88450 + }, + { + "epoch": 0.3797772683169762, + "grad_norm": 8.46331787109375, + "learning_rate": 6.229012702327467e-05, + "loss": 0.21127099990844728, + "step": 88460 + }, + { + "epoch": 0.3798202004069962, + "grad_norm": 0.002510474296286702, + "learning_rate": 6.228581530315705e-05, + "loss": 0.3312537431716919, + "step": 88470 + }, + { + "epoch": 0.3798631324970162, + "grad_norm": 0.028360135853290558, + "learning_rate": 6.228150358303943e-05, + "loss": 0.10665277242660523, + "step": 88480 + }, + { + "epoch": 0.37990606458703624, + "grad_norm": 0.011928168125450611, + "learning_rate": 6.22771918629218e-05, + "loss": 0.42289509773254397, + "step": 88490 + }, + { + "epoch": 0.3799489966770562, + "grad_norm": 1.114374041557312, + "learning_rate": 6.227288014280417e-05, + "loss": 0.26947269439697263, + "step": 88500 + }, + { + "epoch": 0.3799919287670762, + "grad_norm": 0.034509461373090744, + "learning_rate": 6.226856842268655e-05, + "loss": 0.16201649904251098, + "step": 88510 + }, + { + "epoch": 0.38003486085709626, + "grad_norm": 0.006820637732744217, + "learning_rate": 6.226425670256892e-05, + "loss": 0.012356171011924743, + "step": 88520 + }, + { + "epoch": 0.38007779294711624, + "grad_norm": 0.09843739867210388, + "learning_rate": 6.22599449824513e-05, + "loss": 0.13386306762695313, + "step": 88530 + }, + { + "epoch": 0.38012072503713623, + "grad_norm": 1.5230803489685059, + "learning_rate": 6.225563326233368e-05, + "loss": 0.22778193950653075, + "step": 88540 + }, + { + "epoch": 0.3801636571271563, + "grad_norm": 0.2800234854221344, + "learning_rate": 6.225132154221605e-05, + "loss": 0.20246121883392335, + "step": 88550 + }, + { + "epoch": 0.38020658921717626, + "grad_norm": 1.4027798175811768, + "learning_rate": 6.224700982209843e-05, + "loss": 0.2890122175216675, + "step": 88560 + }, + { + "epoch": 0.3802495213071963, + "grad_norm": 0.01395477820187807, + "learning_rate": 6.224269810198081e-05, + "loss": 0.18396477699279784, + "step": 88570 + }, + { + "epoch": 0.3802924533972163, + "grad_norm": 0.1274530589580536, + "learning_rate": 6.223838638186317e-05, + "loss": 0.0587466835975647, + "step": 88580 + }, + { + "epoch": 0.3803353854872363, + "grad_norm": 0.07611143589019775, + "learning_rate": 6.223407466174555e-05, + "loss": 0.26608948707580565, + "step": 88590 + }, + { + "epoch": 0.3803783175772563, + "grad_norm": 0.9535974860191345, + "learning_rate": 6.222976294162793e-05, + "loss": 0.30292160511016847, + "step": 88600 + }, + { + "epoch": 0.3804212496672763, + "grad_norm": 0.015950385481119156, + "learning_rate": 6.22254512215103e-05, + "loss": 0.2924375057220459, + "step": 88610 + }, + { + "epoch": 0.3804641817572963, + "grad_norm": 0.011609098874032497, + "learning_rate": 6.222113950139268e-05, + "loss": 0.11583701372146607, + "step": 88620 + }, + { + "epoch": 0.38050711384731634, + "grad_norm": 2.3866891860961914, + "learning_rate": 6.221682778127507e-05, + "loss": 0.33168849945068357, + "step": 88630 + }, + { + "epoch": 0.3805500459373363, + "grad_norm": 3.5178093910217285, + "learning_rate": 6.221251606115745e-05, + "loss": 0.2832959175109863, + "step": 88640 + }, + { + "epoch": 0.3805929780273563, + "grad_norm": 0.0034149482380598783, + "learning_rate": 6.220820434103983e-05, + "loss": 0.16352550983428954, + "step": 88650 + }, + { + "epoch": 0.38063591011737635, + "grad_norm": 1.0718631744384766, + "learning_rate": 6.220389262092219e-05, + "loss": 0.22328333854675292, + "step": 88660 + }, + { + "epoch": 0.38067884220739634, + "grad_norm": 0.0010514046298339963, + "learning_rate": 6.219958090080457e-05, + "loss": 0.21548836231231688, + "step": 88670 + }, + { + "epoch": 0.3807217742974163, + "grad_norm": 0.01005020085722208, + "learning_rate": 6.219526918068695e-05, + "loss": 0.18633402585983277, + "step": 88680 + }, + { + "epoch": 0.38076470638743637, + "grad_norm": 0.013129375874996185, + "learning_rate": 6.219095746056932e-05, + "loss": 0.2295675277709961, + "step": 88690 + }, + { + "epoch": 0.38080763847745636, + "grad_norm": 1.2943649291992188, + "learning_rate": 6.21866457404517e-05, + "loss": 0.32459824085235595, + "step": 88700 + }, + { + "epoch": 0.38085057056747634, + "grad_norm": 0.011330182664096355, + "learning_rate": 6.218233402033408e-05, + "loss": 0.10692859888076782, + "step": 88710 + }, + { + "epoch": 0.3808935026574964, + "grad_norm": 1.1285091638565063, + "learning_rate": 6.217802230021646e-05, + "loss": 0.2512716770172119, + "step": 88720 + }, + { + "epoch": 0.3809364347475164, + "grad_norm": 2.265028476715088, + "learning_rate": 6.217371058009883e-05, + "loss": 0.11686290502548217, + "step": 88730 + }, + { + "epoch": 0.38097936683753636, + "grad_norm": 1.254490852355957, + "learning_rate": 6.216939885998121e-05, + "loss": 0.4674684524536133, + "step": 88740 + }, + { + "epoch": 0.3810222989275564, + "grad_norm": 0.0046071987599134445, + "learning_rate": 6.216508713986357e-05, + "loss": 0.32127716541290285, + "step": 88750 + }, + { + "epoch": 0.3810652310175764, + "grad_norm": 1.2878999710083008, + "learning_rate": 6.216077541974595e-05, + "loss": 0.25494959354400637, + "step": 88760 + }, + { + "epoch": 0.38110816310759643, + "grad_norm": 0.03140444681048393, + "learning_rate": 6.215646369962833e-05, + "loss": 0.26149399280548097, + "step": 88770 + }, + { + "epoch": 0.3811510951976164, + "grad_norm": 1.8332608938217163, + "learning_rate": 6.21521519795107e-05, + "loss": 0.4215863704681396, + "step": 88780 + }, + { + "epoch": 0.3811940272876364, + "grad_norm": 0.016005532816052437, + "learning_rate": 6.214784025939308e-05, + "loss": 0.18732340335845948, + "step": 88790 + }, + { + "epoch": 0.38123695937765645, + "grad_norm": 0.9914336204528809, + "learning_rate": 6.214352853927546e-05, + "loss": 0.40707788467407224, + "step": 88800 + }, + { + "epoch": 0.38127989146767644, + "grad_norm": 0.029879281297326088, + "learning_rate": 6.213921681915784e-05, + "loss": 0.2158869504928589, + "step": 88810 + }, + { + "epoch": 0.3813228235576964, + "grad_norm": 34.581050872802734, + "learning_rate": 6.213490509904022e-05, + "loss": 0.3515269994735718, + "step": 88820 + }, + { + "epoch": 0.38136575564771646, + "grad_norm": 1.1815255880355835, + "learning_rate": 6.213059337892258e-05, + "loss": 0.3360306262969971, + "step": 88830 + }, + { + "epoch": 0.38140868773773645, + "grad_norm": 0.08978642523288727, + "learning_rate": 6.212628165880496e-05, + "loss": 0.029180306196212768, + "step": 88840 + }, + { + "epoch": 0.38145161982775644, + "grad_norm": 0.3570984899997711, + "learning_rate": 6.212196993868735e-05, + "loss": 0.1382278323173523, + "step": 88850 + }, + { + "epoch": 0.3814945519177765, + "grad_norm": 0.2925564646720886, + "learning_rate": 6.211765821856973e-05, + "loss": 0.12889453172683715, + "step": 88860 + }, + { + "epoch": 0.38153748400779647, + "grad_norm": 0.6374716758728027, + "learning_rate": 6.21133464984521e-05, + "loss": 0.2685490608215332, + "step": 88870 + }, + { + "epoch": 0.38158041609781645, + "grad_norm": 0.07237851619720459, + "learning_rate": 6.210903477833448e-05, + "loss": 0.034711554646492004, + "step": 88880 + }, + { + "epoch": 0.3816233481878365, + "grad_norm": 0.08661754429340363, + "learning_rate": 6.210472305821686e-05, + "loss": 0.26672289371490476, + "step": 88890 + }, + { + "epoch": 0.3816662802778565, + "grad_norm": 1.017971158027649, + "learning_rate": 6.210041133809923e-05, + "loss": 0.26748826503753664, + "step": 88900 + }, + { + "epoch": 0.38170921236787647, + "grad_norm": 4.834644794464111, + "learning_rate": 6.20960996179816e-05, + "loss": 0.12524523735046386, + "step": 88910 + }, + { + "epoch": 0.3817521444578965, + "grad_norm": 0.36303168535232544, + "learning_rate": 6.209178789786398e-05, + "loss": 0.2487999677658081, + "step": 88920 + }, + { + "epoch": 0.3817950765479165, + "grad_norm": 1.2829387187957764, + "learning_rate": 6.208747617774635e-05, + "loss": 0.1831027865409851, + "step": 88930 + }, + { + "epoch": 0.3818380086379365, + "grad_norm": 2.0571751594543457, + "learning_rate": 6.208316445762873e-05, + "loss": 0.28766908645629885, + "step": 88940 + }, + { + "epoch": 0.38188094072795653, + "grad_norm": 0.15016916394233704, + "learning_rate": 6.207885273751111e-05, + "loss": 0.19521453380584716, + "step": 88950 + }, + { + "epoch": 0.3819238728179765, + "grad_norm": 0.10648097097873688, + "learning_rate": 6.207454101739348e-05, + "loss": 0.29527380466461184, + "step": 88960 + }, + { + "epoch": 0.3819668049079965, + "grad_norm": 0.014429030939936638, + "learning_rate": 6.207022929727586e-05, + "loss": 0.1196025252342224, + "step": 88970 + }, + { + "epoch": 0.38200973699801655, + "grad_norm": 0.022127997130155563, + "learning_rate": 6.206591757715824e-05, + "loss": 0.27551116943359377, + "step": 88980 + }, + { + "epoch": 0.38205266908803653, + "grad_norm": 0.04227323830127716, + "learning_rate": 6.20616058570406e-05, + "loss": 0.13282071352005004, + "step": 88990 + }, + { + "epoch": 0.3820956011780566, + "grad_norm": 0.009546788409352303, + "learning_rate": 6.205729413692298e-05, + "loss": 0.2698558807373047, + "step": 89000 + }, + { + "epoch": 0.3820956011780566, + "eval_loss": 0.4414127767086029, + "eval_runtime": 27.1823, + "eval_samples_per_second": 3.679, + "eval_steps_per_second": 3.679, + "step": 89000 + }, + { + "epoch": 0.38213853326807656, + "grad_norm": 0.037746500223875046, + "learning_rate": 6.205298241680536e-05, + "loss": 0.3705944776535034, + "step": 89010 + }, + { + "epoch": 0.38218146535809655, + "grad_norm": 0.18876878917217255, + "learning_rate": 6.204867069668774e-05, + "loss": 0.1594752550125122, + "step": 89020 + }, + { + "epoch": 0.3822243974481166, + "grad_norm": 0.006270625162869692, + "learning_rate": 6.204435897657011e-05, + "loss": 0.0447078138589859, + "step": 89030 + }, + { + "epoch": 0.3822673295381366, + "grad_norm": 13.3214111328125, + "learning_rate": 6.204004725645249e-05, + "loss": 0.293184232711792, + "step": 89040 + }, + { + "epoch": 0.38231026162815657, + "grad_norm": 2.262474298477173, + "learning_rate": 6.203573553633487e-05, + "loss": 0.12684309482574463, + "step": 89050 + }, + { + "epoch": 0.3823531937181766, + "grad_norm": 0.18495914340019226, + "learning_rate": 6.203142381621724e-05, + "loss": 0.12461878061294555, + "step": 89060 + }, + { + "epoch": 0.3823961258081966, + "grad_norm": 1.989532470703125, + "learning_rate": 6.202711209609962e-05, + "loss": 0.3887135982513428, + "step": 89070 + }, + { + "epoch": 0.3824390578982166, + "grad_norm": 0.06415251642465591, + "learning_rate": 6.2022800375982e-05, + "loss": 0.12190742492675781, + "step": 89080 + }, + { + "epoch": 0.3824819899882366, + "grad_norm": 1.0233348608016968, + "learning_rate": 6.201848865586438e-05, + "loss": 0.16654955148696898, + "step": 89090 + }, + { + "epoch": 0.3825249220782566, + "grad_norm": 0.013394716195762157, + "learning_rate": 6.201417693574675e-05, + "loss": 0.3020642757415771, + "step": 89100 + }, + { + "epoch": 0.3825678541682766, + "grad_norm": 0.10267344862222672, + "learning_rate": 6.200986521562913e-05, + "loss": 0.18132349252700805, + "step": 89110 + }, + { + "epoch": 0.38261078625829664, + "grad_norm": 1.8178410530090332, + "learning_rate": 6.200555349551151e-05, + "loss": 0.3368029832839966, + "step": 89120 + }, + { + "epoch": 0.38265371834831663, + "grad_norm": 0.005919590126723051, + "learning_rate": 6.200124177539389e-05, + "loss": 0.17633817195892335, + "step": 89130 + }, + { + "epoch": 0.3826966504383366, + "grad_norm": 1.393633484840393, + "learning_rate": 6.199693005527626e-05, + "loss": 0.4469784736633301, + "step": 89140 + }, + { + "epoch": 0.38273958252835666, + "grad_norm": 0.07115230709314346, + "learning_rate": 6.199261833515864e-05, + "loss": 0.3376830339431763, + "step": 89150 + }, + { + "epoch": 0.38278251461837665, + "grad_norm": 0.03959648311138153, + "learning_rate": 6.1988306615041e-05, + "loss": 0.05683208107948303, + "step": 89160 + }, + { + "epoch": 0.38282544670839663, + "grad_norm": 0.031142039224505424, + "learning_rate": 6.198399489492338e-05, + "loss": 0.09311988949775696, + "step": 89170 + }, + { + "epoch": 0.3828683787984167, + "grad_norm": 0.0646190270781517, + "learning_rate": 6.197968317480576e-05, + "loss": 0.4846409797668457, + "step": 89180 + }, + { + "epoch": 0.38291131088843666, + "grad_norm": 0.0026598910335451365, + "learning_rate": 6.197537145468814e-05, + "loss": 0.3485872268676758, + "step": 89190 + }, + { + "epoch": 0.3829542429784567, + "grad_norm": 0.010424236766994, + "learning_rate": 6.197105973457051e-05, + "loss": 0.353135085105896, + "step": 89200 + }, + { + "epoch": 0.3829971750684767, + "grad_norm": 3.2976186275482178, + "learning_rate": 6.196674801445289e-05, + "loss": 0.1672595739364624, + "step": 89210 + }, + { + "epoch": 0.3830401071584967, + "grad_norm": 0.7941388487815857, + "learning_rate": 6.196243629433527e-05, + "loss": 0.371187424659729, + "step": 89220 + }, + { + "epoch": 0.3830830392485167, + "grad_norm": 1.3344122171401978, + "learning_rate": 6.195812457421765e-05, + "loss": 0.2827876806259155, + "step": 89230 + }, + { + "epoch": 0.3831259713385367, + "grad_norm": 3.533451795578003, + "learning_rate": 6.195381285410001e-05, + "loss": 0.15025629997253417, + "step": 89240 + }, + { + "epoch": 0.3831689034285567, + "grad_norm": 0.4801417589187622, + "learning_rate": 6.194950113398239e-05, + "loss": 0.17558751106262208, + "step": 89250 + }, + { + "epoch": 0.38321183551857674, + "grad_norm": 1.5117595195770264, + "learning_rate": 6.194518941386476e-05, + "loss": 0.40224194526672363, + "step": 89260 + }, + { + "epoch": 0.3832547676085967, + "grad_norm": 13.442787170410156, + "learning_rate": 6.194087769374714e-05, + "loss": 0.132839834690094, + "step": 89270 + }, + { + "epoch": 0.3832976996986167, + "grad_norm": 9.266427040100098, + "learning_rate": 6.193656597362952e-05, + "loss": 0.38136115074157717, + "step": 89280 + }, + { + "epoch": 0.38334063178863675, + "grad_norm": 0.11393710970878601, + "learning_rate": 6.19322542535119e-05, + "loss": 0.09291578531265259, + "step": 89290 + }, + { + "epoch": 0.38338356387865674, + "grad_norm": 3.6095845699310303, + "learning_rate": 6.192794253339427e-05, + "loss": 0.2640878200531006, + "step": 89300 + }, + { + "epoch": 0.38342649596867673, + "grad_norm": 1.8564183712005615, + "learning_rate": 6.192363081327665e-05, + "loss": 0.29603559970855714, + "step": 89310 + }, + { + "epoch": 0.38346942805869677, + "grad_norm": 0.913407027721405, + "learning_rate": 6.191931909315903e-05, + "loss": 0.28334932327270507, + "step": 89320 + }, + { + "epoch": 0.38351236014871676, + "grad_norm": 0.0012131475377827883, + "learning_rate": 6.19150073730414e-05, + "loss": 0.0755197286605835, + "step": 89330 + }, + { + "epoch": 0.38355529223873674, + "grad_norm": 0.2928409278392792, + "learning_rate": 6.191069565292378e-05, + "loss": 0.18231843709945678, + "step": 89340 + }, + { + "epoch": 0.3835982243287568, + "grad_norm": 0.03733060136437416, + "learning_rate": 6.190638393280616e-05, + "loss": 0.25749950408935546, + "step": 89350 + }, + { + "epoch": 0.3836411564187768, + "grad_norm": 0.039106305688619614, + "learning_rate": 6.190207221268854e-05, + "loss": 0.26590471267700194, + "step": 89360 + }, + { + "epoch": 0.38368408850879676, + "grad_norm": 1.9658153057098389, + "learning_rate": 6.189776049257092e-05, + "loss": 0.4718448638916016, + "step": 89370 + }, + { + "epoch": 0.3837270205988168, + "grad_norm": 4.088800430297852, + "learning_rate": 6.189344877245329e-05, + "loss": 0.11173911094665527, + "step": 89380 + }, + { + "epoch": 0.3837699526888368, + "grad_norm": 9.061023712158203, + "learning_rate": 6.188913705233567e-05, + "loss": 0.23896961212158202, + "step": 89390 + }, + { + "epoch": 0.3838128847788568, + "grad_norm": 0.018133578822016716, + "learning_rate": 6.188482533221803e-05, + "loss": 0.36962740421295165, + "step": 89400 + }, + { + "epoch": 0.3838558168688768, + "grad_norm": 0.03915409743785858, + "learning_rate": 6.188051361210041e-05, + "loss": 0.12740023136138917, + "step": 89410 + }, + { + "epoch": 0.3838987489588968, + "grad_norm": 0.032663632184267044, + "learning_rate": 6.187620189198279e-05, + "loss": 0.15168129205703734, + "step": 89420 + }, + { + "epoch": 0.38394168104891685, + "grad_norm": 0.005301196593791246, + "learning_rate": 6.187189017186517e-05, + "loss": 0.04259181320667267, + "step": 89430 + }, + { + "epoch": 0.38398461313893684, + "grad_norm": 0.01621132902801037, + "learning_rate": 6.186757845174754e-05, + "loss": 0.3094786167144775, + "step": 89440 + }, + { + "epoch": 0.3840275452289568, + "grad_norm": 0.007388011552393436, + "learning_rate": 6.186326673162992e-05, + "loss": 0.17484718561172485, + "step": 89450 + }, + { + "epoch": 0.38407047731897687, + "grad_norm": 0.06491857022047043, + "learning_rate": 6.18589550115123e-05, + "loss": 0.22071504592895508, + "step": 89460 + }, + { + "epoch": 0.38411340940899685, + "grad_norm": 0.08055774867534637, + "learning_rate": 6.185464329139468e-05, + "loss": 0.18757699728012084, + "step": 89470 + }, + { + "epoch": 0.38415634149901684, + "grad_norm": 0.2903849482536316, + "learning_rate": 6.185033157127705e-05, + "loss": 0.21719033718109132, + "step": 89480 + }, + { + "epoch": 0.3841992735890369, + "grad_norm": 0.0042831008322536945, + "learning_rate": 6.184601985115942e-05, + "loss": 0.35606064796447756, + "step": 89490 + }, + { + "epoch": 0.38424220567905687, + "grad_norm": 0.009732181206345558, + "learning_rate": 6.18417081310418e-05, + "loss": 0.13790091276168823, + "step": 89500 + }, + { + "epoch": 0.38428513776907686, + "grad_norm": 0.02407350391149521, + "learning_rate": 6.183739641092417e-05, + "loss": 0.19972857236862182, + "step": 89510 + }, + { + "epoch": 0.3843280698590969, + "grad_norm": 0.003901082556694746, + "learning_rate": 6.183308469080655e-05, + "loss": 0.23962645530700682, + "step": 89520 + }, + { + "epoch": 0.3843710019491169, + "grad_norm": 4.334282398223877, + "learning_rate": 6.182877297068893e-05, + "loss": 0.1551468014717102, + "step": 89530 + }, + { + "epoch": 0.3844139340391369, + "grad_norm": 0.1462079882621765, + "learning_rate": 6.18244612505713e-05, + "loss": 0.2502715826034546, + "step": 89540 + }, + { + "epoch": 0.3844568661291569, + "grad_norm": 0.002122892765328288, + "learning_rate": 6.182014953045368e-05, + "loss": 0.24722750186920167, + "step": 89550 + }, + { + "epoch": 0.3844997982191769, + "grad_norm": 0.08925088495016098, + "learning_rate": 6.181583781033606e-05, + "loss": 0.17703983783721924, + "step": 89560 + }, + { + "epoch": 0.3845427303091969, + "grad_norm": 0.19102783501148224, + "learning_rate": 6.181152609021844e-05, + "loss": 0.269983696937561, + "step": 89570 + }, + { + "epoch": 0.38458566239921693, + "grad_norm": 0.05688232555985451, + "learning_rate": 6.180721437010081e-05, + "loss": 0.2018204689025879, + "step": 89580 + }, + { + "epoch": 0.3846285944892369, + "grad_norm": 0.09904558211565018, + "learning_rate": 6.180290264998319e-05, + "loss": 0.12981395721435546, + "step": 89590 + }, + { + "epoch": 0.3846715265792569, + "grad_norm": 0.1914941668510437, + "learning_rate": 6.179859092986557e-05, + "loss": 0.12313872575759888, + "step": 89600 + }, + { + "epoch": 0.38471445866927695, + "grad_norm": 0.019617699086666107, + "learning_rate": 6.179427920974794e-05, + "loss": 0.1696901798248291, + "step": 89610 + }, + { + "epoch": 0.38475739075929694, + "grad_norm": 0.008060222491621971, + "learning_rate": 6.178996748963032e-05, + "loss": 0.2816110610961914, + "step": 89620 + }, + { + "epoch": 0.384800322849317, + "grad_norm": 0.03914284333586693, + "learning_rate": 6.17856557695127e-05, + "loss": 0.017223869264125825, + "step": 89630 + }, + { + "epoch": 0.38484325493933697, + "grad_norm": 0.02014545537531376, + "learning_rate": 6.178134404939508e-05, + "loss": 0.07916730642318726, + "step": 89640 + }, + { + "epoch": 0.38488618702935695, + "grad_norm": 0.474691241979599, + "learning_rate": 6.177703232927744e-05, + "loss": 0.2577148675918579, + "step": 89650 + }, + { + "epoch": 0.384929119119377, + "grad_norm": 0.13608905673027039, + "learning_rate": 6.177272060915982e-05, + "loss": 0.1876443147659302, + "step": 89660 + }, + { + "epoch": 0.384972051209397, + "grad_norm": 0.0023101787082850933, + "learning_rate": 6.17684088890422e-05, + "loss": 0.33298826217651367, + "step": 89670 + }, + { + "epoch": 0.38501498329941697, + "grad_norm": 1.3062105178833008, + "learning_rate": 6.176409716892457e-05, + "loss": 0.1542769193649292, + "step": 89680 + }, + { + "epoch": 0.385057915389437, + "grad_norm": 0.005330412648618221, + "learning_rate": 6.175978544880695e-05, + "loss": 0.23672285079956054, + "step": 89690 + }, + { + "epoch": 0.385100847479457, + "grad_norm": 1.3016077280044556, + "learning_rate": 6.175547372868933e-05, + "loss": 0.2891198396682739, + "step": 89700 + }, + { + "epoch": 0.385143779569477, + "grad_norm": 0.10209011286497116, + "learning_rate": 6.17511620085717e-05, + "loss": 0.14344640970230102, + "step": 89710 + }, + { + "epoch": 0.385186711659497, + "grad_norm": 0.4255737364292145, + "learning_rate": 6.174685028845408e-05, + "loss": 0.29021642208099363, + "step": 89720 + }, + { + "epoch": 0.385229643749517, + "grad_norm": 0.11061238497495651, + "learning_rate": 6.174253856833645e-05, + "loss": 0.17971653938293458, + "step": 89730 + }, + { + "epoch": 0.385272575839537, + "grad_norm": 0.034988872706890106, + "learning_rate": 6.173822684821882e-05, + "loss": 0.1534808397293091, + "step": 89740 + }, + { + "epoch": 0.38531550792955704, + "grad_norm": 1.2242257595062256, + "learning_rate": 6.17339151281012e-05, + "loss": 0.41945724487304686, + "step": 89750 + }, + { + "epoch": 0.38535844001957703, + "grad_norm": 0.007697496097534895, + "learning_rate": 6.172960340798358e-05, + "loss": 0.3344566822052002, + "step": 89760 + }, + { + "epoch": 0.385401372109597, + "grad_norm": 6.822228908538818, + "learning_rate": 6.172529168786595e-05, + "loss": 0.3244169235229492, + "step": 89770 + }, + { + "epoch": 0.38544430419961706, + "grad_norm": 2.186828374862671, + "learning_rate": 6.172097996774833e-05, + "loss": 0.4501382827758789, + "step": 89780 + }, + { + "epoch": 0.38548723628963705, + "grad_norm": 0.2173774540424347, + "learning_rate": 6.171666824763071e-05, + "loss": 0.23860557079315187, + "step": 89790 + }, + { + "epoch": 0.38553016837965703, + "grad_norm": 0.04658151790499687, + "learning_rate": 6.171235652751309e-05, + "loss": 0.1996417760848999, + "step": 89800 + }, + { + "epoch": 0.3855731004696771, + "grad_norm": 0.02228648215532303, + "learning_rate": 6.170804480739546e-05, + "loss": 0.22978696823120118, + "step": 89810 + }, + { + "epoch": 0.38561603255969706, + "grad_norm": 2.033644437789917, + "learning_rate": 6.170373308727784e-05, + "loss": 0.2479996681213379, + "step": 89820 + }, + { + "epoch": 0.38565896464971705, + "grad_norm": 0.8258361220359802, + "learning_rate": 6.169942136716022e-05, + "loss": 0.3214423656463623, + "step": 89830 + }, + { + "epoch": 0.3857018967397371, + "grad_norm": 0.01886231079697609, + "learning_rate": 6.16951096470426e-05, + "loss": 0.31837873458862304, + "step": 89840 + }, + { + "epoch": 0.3857448288297571, + "grad_norm": 2.3415749073028564, + "learning_rate": 6.169079792692497e-05, + "loss": 0.08363971710205079, + "step": 89850 + }, + { + "epoch": 0.3857877609197771, + "grad_norm": 0.0015655852621421218, + "learning_rate": 6.168648620680735e-05, + "loss": 0.004582761600613594, + "step": 89860 + }, + { + "epoch": 0.3858306930097971, + "grad_norm": 0.011474518105387688, + "learning_rate": 6.168217448668973e-05, + "loss": 0.32756221294403076, + "step": 89870 + }, + { + "epoch": 0.3858736250998171, + "grad_norm": 17.503582000732422, + "learning_rate": 6.16778627665721e-05, + "loss": 0.11809229850769043, + "step": 89880 + }, + { + "epoch": 0.38591655718983714, + "grad_norm": 0.06740511953830719, + "learning_rate": 6.167355104645448e-05, + "loss": 0.19865074157714843, + "step": 89890 + }, + { + "epoch": 0.3859594892798571, + "grad_norm": 1.3640791177749634, + "learning_rate": 6.166923932633685e-05, + "loss": 0.5600822448730469, + "step": 89900 + }, + { + "epoch": 0.3860024213698771, + "grad_norm": 0.012692203745245934, + "learning_rate": 6.166492760621922e-05, + "loss": 0.1536575198173523, + "step": 89910 + }, + { + "epoch": 0.38604535345989716, + "grad_norm": 2.2844793796539307, + "learning_rate": 6.16606158861016e-05, + "loss": 0.34578001499176025, + "step": 89920 + }, + { + "epoch": 0.38608828554991714, + "grad_norm": 7.917400360107422, + "learning_rate": 6.165630416598398e-05, + "loss": 0.2631634473800659, + "step": 89930 + }, + { + "epoch": 0.38613121763993713, + "grad_norm": 2.6729063987731934, + "learning_rate": 6.165199244586636e-05, + "loss": 0.2432703971862793, + "step": 89940 + }, + { + "epoch": 0.3861741497299572, + "grad_norm": 0.0074627818539738655, + "learning_rate": 6.164768072574873e-05, + "loss": 0.11718699932098389, + "step": 89950 + }, + { + "epoch": 0.38621708181997716, + "grad_norm": 0.6019059419631958, + "learning_rate": 6.164336900563111e-05, + "loss": 0.3247262716293335, + "step": 89960 + }, + { + "epoch": 0.38626001390999715, + "grad_norm": 0.9982324838638306, + "learning_rate": 6.163905728551349e-05, + "loss": 0.22884700298309327, + "step": 89970 + }, + { + "epoch": 0.3863029460000172, + "grad_norm": 0.01680828258395195, + "learning_rate": 6.163474556539585e-05, + "loss": 0.16624144315719605, + "step": 89980 + }, + { + "epoch": 0.3863458780900372, + "grad_norm": 0.020545918494462967, + "learning_rate": 6.163043384527823e-05, + "loss": 0.13353365659713745, + "step": 89990 + }, + { + "epoch": 0.38638881018005716, + "grad_norm": 0.009240344166755676, + "learning_rate": 6.16261221251606e-05, + "loss": 0.28017189502716067, + "step": 90000 + }, + { + "epoch": 0.38638881018005716, + "eval_loss": 0.42577657103538513, + "eval_runtime": 27.1328, + "eval_samples_per_second": 3.686, + "eval_steps_per_second": 3.686, + "step": 90000 + }, + { + "epoch": 0.3864317422700772, + "grad_norm": 2.90472149848938, + "learning_rate": 6.162181040504298e-05, + "loss": 0.10356920957565308, + "step": 90010 + }, + { + "epoch": 0.3864746743600972, + "grad_norm": 0.43290838599205017, + "learning_rate": 6.161749868492536e-05, + "loss": 0.1836892247200012, + "step": 90020 + }, + { + "epoch": 0.3865176064501172, + "grad_norm": 0.1611838936805725, + "learning_rate": 6.161318696480774e-05, + "loss": 0.18457709550857543, + "step": 90030 + }, + { + "epoch": 0.3865605385401372, + "grad_norm": 0.20687133073806763, + "learning_rate": 6.160887524469013e-05, + "loss": 0.2525459051132202, + "step": 90040 + }, + { + "epoch": 0.3866034706301572, + "grad_norm": 0.10173666477203369, + "learning_rate": 6.160456352457251e-05, + "loss": 0.18307244777679443, + "step": 90050 + }, + { + "epoch": 0.38664640272017725, + "grad_norm": 0.30298879742622375, + "learning_rate": 6.160025180445487e-05, + "loss": 0.2733901500701904, + "step": 90060 + }, + { + "epoch": 0.38668933481019724, + "grad_norm": 3.4610722064971924, + "learning_rate": 6.159594008433725e-05, + "loss": 0.238478422164917, + "step": 90070 + }, + { + "epoch": 0.3867322669002172, + "grad_norm": 18.95323944091797, + "learning_rate": 6.159162836421963e-05, + "loss": 0.1477491021156311, + "step": 90080 + }, + { + "epoch": 0.38677519899023727, + "grad_norm": 0.0035032792948186398, + "learning_rate": 6.1587316644102e-05, + "loss": 0.22136282920837402, + "step": 90090 + }, + { + "epoch": 0.38681813108025725, + "grad_norm": 0.27647852897644043, + "learning_rate": 6.158300492398438e-05, + "loss": 0.1641569495201111, + "step": 90100 + }, + { + "epoch": 0.38686106317027724, + "grad_norm": 0.33862271904945374, + "learning_rate": 6.157869320386676e-05, + "loss": 0.31995439529418945, + "step": 90110 + }, + { + "epoch": 0.3869039952602973, + "grad_norm": 0.6005828976631165, + "learning_rate": 6.157438148374913e-05, + "loss": 0.26249744892120364, + "step": 90120 + }, + { + "epoch": 0.38694692735031727, + "grad_norm": 0.013982965610921383, + "learning_rate": 6.157006976363151e-05, + "loss": 0.16258199214935304, + "step": 90130 + }, + { + "epoch": 0.38698985944033726, + "grad_norm": 0.46348854899406433, + "learning_rate": 6.156575804351388e-05, + "loss": 0.19305338859558105, + "step": 90140 + }, + { + "epoch": 0.3870327915303573, + "grad_norm": 3.7916457653045654, + "learning_rate": 6.156144632339625e-05, + "loss": 0.40165019035339355, + "step": 90150 + }, + { + "epoch": 0.3870757236203773, + "grad_norm": 0.041253313422203064, + "learning_rate": 6.155713460327863e-05, + "loss": 0.027036473155021667, + "step": 90160 + }, + { + "epoch": 0.3871186557103973, + "grad_norm": 0.05210256576538086, + "learning_rate": 6.155282288316101e-05, + "loss": 0.26577320098876955, + "step": 90170 + }, + { + "epoch": 0.3871615878004173, + "grad_norm": 0.48661676049232483, + "learning_rate": 6.154851116304339e-05, + "loss": 0.2865142822265625, + "step": 90180 + }, + { + "epoch": 0.3872045198904373, + "grad_norm": 1.0749396085739136, + "learning_rate": 6.154419944292576e-05, + "loss": 0.2049044370651245, + "step": 90190 + }, + { + "epoch": 0.3872474519804573, + "grad_norm": 0.06901497393846512, + "learning_rate": 6.153988772280814e-05, + "loss": 0.07353252768516541, + "step": 90200 + }, + { + "epoch": 0.38729038407047733, + "grad_norm": 0.007591388188302517, + "learning_rate": 6.153557600269052e-05, + "loss": 0.22325377464294432, + "step": 90210 + }, + { + "epoch": 0.3873333161604973, + "grad_norm": 0.03437507525086403, + "learning_rate": 6.15312642825729e-05, + "loss": 0.22393579483032228, + "step": 90220 + }, + { + "epoch": 0.3873762482505173, + "grad_norm": 0.022142138332128525, + "learning_rate": 6.152695256245526e-05, + "loss": 0.26709728240966796, + "step": 90230 + }, + { + "epoch": 0.38741918034053735, + "grad_norm": 0.015263757668435574, + "learning_rate": 6.152264084233764e-05, + "loss": 0.39098849296569826, + "step": 90240 + }, + { + "epoch": 0.38746211243055734, + "grad_norm": 0.131820410490036, + "learning_rate": 6.151832912222001e-05, + "loss": 0.4499326229095459, + "step": 90250 + }, + { + "epoch": 0.3875050445205773, + "grad_norm": 0.005828079301863909, + "learning_rate": 6.15140174021024e-05, + "loss": 0.20458014011383058, + "step": 90260 + }, + { + "epoch": 0.38754797661059737, + "grad_norm": 0.16223856806755066, + "learning_rate": 6.150970568198478e-05, + "loss": 0.24097530841827391, + "step": 90270 + }, + { + "epoch": 0.38759090870061735, + "grad_norm": 1.206310749053955, + "learning_rate": 6.150539396186716e-05, + "loss": 0.21269071102142334, + "step": 90280 + }, + { + "epoch": 0.3876338407906374, + "grad_norm": 1.7968086004257202, + "learning_rate": 6.150108224174954e-05, + "loss": 0.18731291294097902, + "step": 90290 + }, + { + "epoch": 0.3876767728806574, + "grad_norm": 1.2903820276260376, + "learning_rate": 6.149677052163191e-05, + "loss": 0.22080676555633544, + "step": 90300 + }, + { + "epoch": 0.38771970497067737, + "grad_norm": 0.005241678096354008, + "learning_rate": 6.149245880151428e-05, + "loss": 0.06120513081550598, + "step": 90310 + }, + { + "epoch": 0.3877626370606974, + "grad_norm": 0.29554280638694763, + "learning_rate": 6.148814708139665e-05, + "loss": 0.08351311683654786, + "step": 90320 + }, + { + "epoch": 0.3878055691507174, + "grad_norm": 0.0033163491170853376, + "learning_rate": 6.148383536127903e-05, + "loss": 0.13242911100387572, + "step": 90330 + }, + { + "epoch": 0.3878485012407374, + "grad_norm": 0.016962099820375443, + "learning_rate": 6.147952364116141e-05, + "loss": 0.22619824409484862, + "step": 90340 + }, + { + "epoch": 0.38789143333075743, + "grad_norm": 0.5038765668869019, + "learning_rate": 6.147521192104379e-05, + "loss": 0.11690418720245362, + "step": 90350 + }, + { + "epoch": 0.3879343654207774, + "grad_norm": 0.136198490858078, + "learning_rate": 6.147090020092616e-05, + "loss": 0.060647428035736084, + "step": 90360 + }, + { + "epoch": 0.3879772975107974, + "grad_norm": 0.01311681792140007, + "learning_rate": 6.146658848080854e-05, + "loss": 0.26727359294891356, + "step": 90370 + }, + { + "epoch": 0.38802022960081745, + "grad_norm": 5.027169227600098, + "learning_rate": 6.146227676069092e-05, + "loss": 0.2338623046875, + "step": 90380 + }, + { + "epoch": 0.38806316169083743, + "grad_norm": 0.007105580065399408, + "learning_rate": 6.145796504057328e-05, + "loss": 0.1273871660232544, + "step": 90390 + }, + { + "epoch": 0.3881060937808574, + "grad_norm": 5.178821563720703, + "learning_rate": 6.145365332045566e-05, + "loss": 0.3393435001373291, + "step": 90400 + }, + { + "epoch": 0.38814902587087746, + "grad_norm": 0.008485247381031513, + "learning_rate": 6.144934160033804e-05, + "loss": 0.2772747278213501, + "step": 90410 + }, + { + "epoch": 0.38819195796089745, + "grad_norm": 2.125918388366699, + "learning_rate": 6.144502988022041e-05, + "loss": 0.6011878967285156, + "step": 90420 + }, + { + "epoch": 0.38823489005091744, + "grad_norm": 3.208209753036499, + "learning_rate": 6.144071816010279e-05, + "loss": 0.14225001335144044, + "step": 90430 + }, + { + "epoch": 0.3882778221409375, + "grad_norm": 2.0369322299957275, + "learning_rate": 6.143640643998517e-05, + "loss": 0.2426603078842163, + "step": 90440 + }, + { + "epoch": 0.38832075423095747, + "grad_norm": 6.514814853668213, + "learning_rate": 6.143209471986755e-05, + "loss": 0.2078092575073242, + "step": 90450 + }, + { + "epoch": 0.38836368632097745, + "grad_norm": 0.1408424824476242, + "learning_rate": 6.142778299974992e-05, + "loss": 0.22487127780914307, + "step": 90460 + }, + { + "epoch": 0.3884066184109975, + "grad_norm": 0.9192783236503601, + "learning_rate": 6.142347127963229e-05, + "loss": 0.32507715225219724, + "step": 90470 + }, + { + "epoch": 0.3884495505010175, + "grad_norm": 0.02033153735101223, + "learning_rate": 6.141915955951468e-05, + "loss": 0.09576951265335083, + "step": 90480 + }, + { + "epoch": 0.3884924825910375, + "grad_norm": 0.005774068646132946, + "learning_rate": 6.141484783939706e-05, + "loss": 0.32229766845703123, + "step": 90490 + }, + { + "epoch": 0.3885354146810575, + "grad_norm": 2.699652910232544, + "learning_rate": 6.141053611927943e-05, + "loss": 0.34198732376098634, + "step": 90500 + }, + { + "epoch": 0.3885783467710775, + "grad_norm": 0.06830363720655441, + "learning_rate": 6.140622439916181e-05, + "loss": 0.23384521007537842, + "step": 90510 + }, + { + "epoch": 0.38862127886109754, + "grad_norm": 1.3438990116119385, + "learning_rate": 6.140191267904419e-05, + "loss": 0.35765848159790037, + "step": 90520 + }, + { + "epoch": 0.38866421095111753, + "grad_norm": 1.0712690353393555, + "learning_rate": 6.139760095892657e-05, + "loss": 0.2655247926712036, + "step": 90530 + }, + { + "epoch": 0.3887071430411375, + "grad_norm": 0.019178811460733414, + "learning_rate": 6.139328923880894e-05, + "loss": 0.0837427020072937, + "step": 90540 + }, + { + "epoch": 0.38875007513115756, + "grad_norm": 1.6567952632904053, + "learning_rate": 6.138897751869132e-05, + "loss": 0.29982790946960447, + "step": 90550 + }, + { + "epoch": 0.38879300722117754, + "grad_norm": 1.4304605722427368, + "learning_rate": 6.138466579857368e-05, + "loss": 0.3659378528594971, + "step": 90560 + }, + { + "epoch": 0.38883593931119753, + "grad_norm": 0.2591516673564911, + "learning_rate": 6.138035407845606e-05, + "loss": 0.20934972763061524, + "step": 90570 + }, + { + "epoch": 0.3888788714012176, + "grad_norm": 43.77493667602539, + "learning_rate": 6.137604235833844e-05, + "loss": 0.2530338764190674, + "step": 90580 + }, + { + "epoch": 0.38892180349123756, + "grad_norm": 1.3299733400344849, + "learning_rate": 6.137173063822082e-05, + "loss": 0.16540772914886476, + "step": 90590 + }, + { + "epoch": 0.38896473558125755, + "grad_norm": 1.6994363069534302, + "learning_rate": 6.136741891810319e-05, + "loss": 0.3406243324279785, + "step": 90600 + }, + { + "epoch": 0.3890076676712776, + "grad_norm": 1.222379207611084, + "learning_rate": 6.136310719798557e-05, + "loss": 0.2493795394897461, + "step": 90610 + }, + { + "epoch": 0.3890505997612976, + "grad_norm": 1.4869258403778076, + "learning_rate": 6.135879547786795e-05, + "loss": 0.3182868242263794, + "step": 90620 + }, + { + "epoch": 0.38909353185131756, + "grad_norm": 0.08125866949558258, + "learning_rate": 6.135448375775033e-05, + "loss": 0.03489781022071838, + "step": 90630 + }, + { + "epoch": 0.3891364639413376, + "grad_norm": 1.8703261613845825, + "learning_rate": 6.135017203763269e-05, + "loss": 0.2631047487258911, + "step": 90640 + }, + { + "epoch": 0.3891793960313576, + "grad_norm": 0.9088125228881836, + "learning_rate": 6.134586031751507e-05, + "loss": 0.2439821720123291, + "step": 90650 + }, + { + "epoch": 0.3892223281213776, + "grad_norm": 4.052822113037109, + "learning_rate": 6.134154859739744e-05, + "loss": 0.1447490930557251, + "step": 90660 + }, + { + "epoch": 0.3892652602113976, + "grad_norm": 0.35810771584510803, + "learning_rate": 6.133723687727982e-05, + "loss": 0.2471384048461914, + "step": 90670 + }, + { + "epoch": 0.3893081923014176, + "grad_norm": 0.06162268668413162, + "learning_rate": 6.13329251571622e-05, + "loss": 0.1883344292640686, + "step": 90680 + }, + { + "epoch": 0.3893511243914376, + "grad_norm": 1.449040412902832, + "learning_rate": 6.132861343704458e-05, + "loss": 0.18545243740081788, + "step": 90690 + }, + { + "epoch": 0.38939405648145764, + "grad_norm": 0.01574113965034485, + "learning_rate": 6.132430171692695e-05, + "loss": 0.27572879791259763, + "step": 90700 + }, + { + "epoch": 0.3894369885714776, + "grad_norm": 0.47327515482902527, + "learning_rate": 6.131998999680933e-05, + "loss": 0.06660547256469726, + "step": 90710 + }, + { + "epoch": 0.38947992066149767, + "grad_norm": 2.003725290298462, + "learning_rate": 6.131567827669171e-05, + "loss": 0.23638532161712647, + "step": 90720 + }, + { + "epoch": 0.38952285275151766, + "grad_norm": 0.05184811353683472, + "learning_rate": 6.131136655657408e-05, + "loss": 0.23077709674835206, + "step": 90730 + }, + { + "epoch": 0.38956578484153764, + "grad_norm": 0.01105536986142397, + "learning_rate": 6.130705483645646e-05, + "loss": 0.3030509948730469, + "step": 90740 + }, + { + "epoch": 0.3896087169315577, + "grad_norm": 0.7250270247459412, + "learning_rate": 6.130274311633884e-05, + "loss": 0.11001847982406616, + "step": 90750 + }, + { + "epoch": 0.3896516490215777, + "grad_norm": 7.975006580352783, + "learning_rate": 6.129843139622122e-05, + "loss": 0.3358157634735107, + "step": 90760 + }, + { + "epoch": 0.38969458111159766, + "grad_norm": 0.15005670487880707, + "learning_rate": 6.12941196761036e-05, + "loss": 0.19826846122741698, + "step": 90770 + }, + { + "epoch": 0.3897375132016177, + "grad_norm": 0.12587501108646393, + "learning_rate": 6.128980795598597e-05, + "loss": 0.17790353298187256, + "step": 90780 + }, + { + "epoch": 0.3897804452916377, + "grad_norm": 0.02037876844406128, + "learning_rate": 6.128549623586835e-05, + "loss": 0.20419929027557374, + "step": 90790 + }, + { + "epoch": 0.3898233773816577, + "grad_norm": 0.016147956252098083, + "learning_rate": 6.128118451575071e-05, + "loss": 0.1969937801361084, + "step": 90800 + }, + { + "epoch": 0.3898663094716777, + "grad_norm": 0.711434006690979, + "learning_rate": 6.127687279563309e-05, + "loss": 0.17485193014144898, + "step": 90810 + }, + { + "epoch": 0.3899092415616977, + "grad_norm": 0.005552069749683142, + "learning_rate": 6.127256107551547e-05, + "loss": 0.33175613880157473, + "step": 90820 + }, + { + "epoch": 0.3899521736517177, + "grad_norm": 0.00681539298966527, + "learning_rate": 6.126824935539784e-05, + "loss": 0.11351083517074585, + "step": 90830 + }, + { + "epoch": 0.38999510574173774, + "grad_norm": 0.006817379966378212, + "learning_rate": 6.126393763528022e-05, + "loss": 0.25123019218444825, + "step": 90840 + }, + { + "epoch": 0.3900380378317577, + "grad_norm": 0.02623910829424858, + "learning_rate": 6.12596259151626e-05, + "loss": 0.26213512420654295, + "step": 90850 + }, + { + "epoch": 0.3900809699217777, + "grad_norm": 21.62917137145996, + "learning_rate": 6.125531419504498e-05, + "loss": 0.27692012786865233, + "step": 90860 + }, + { + "epoch": 0.39012390201179775, + "grad_norm": 0.008832416497170925, + "learning_rate": 6.125100247492735e-05, + "loss": 0.2713014602661133, + "step": 90870 + }, + { + "epoch": 0.39016683410181774, + "grad_norm": 0.0016400536987930536, + "learning_rate": 6.124669075480972e-05, + "loss": 0.17731016874313354, + "step": 90880 + }, + { + "epoch": 0.3902097661918377, + "grad_norm": 0.11445105075836182, + "learning_rate": 6.12423790346921e-05, + "loss": 0.3271883487701416, + "step": 90890 + }, + { + "epoch": 0.39025269828185777, + "grad_norm": 0.06182974949479103, + "learning_rate": 6.123806731457447e-05, + "loss": 0.09882626533508301, + "step": 90900 + }, + { + "epoch": 0.39029563037187776, + "grad_norm": 0.010938971303403378, + "learning_rate": 6.123375559445685e-05, + "loss": 0.19311379194259642, + "step": 90910 + }, + { + "epoch": 0.3903385624618978, + "grad_norm": 0.3110086917877197, + "learning_rate": 6.122944387433923e-05, + "loss": 0.2579014301300049, + "step": 90920 + }, + { + "epoch": 0.3903814945519178, + "grad_norm": 0.0062742093577980995, + "learning_rate": 6.12251321542216e-05, + "loss": 0.12250322103500366, + "step": 90930 + }, + { + "epoch": 0.39042442664193777, + "grad_norm": 0.3339044153690338, + "learning_rate": 6.122082043410398e-05, + "loss": 0.38106014728546145, + "step": 90940 + }, + { + "epoch": 0.3904673587319578, + "grad_norm": 0.5673826932907104, + "learning_rate": 6.121650871398636e-05, + "loss": 0.26010856628417967, + "step": 90950 + }, + { + "epoch": 0.3905102908219778, + "grad_norm": 0.878169596195221, + "learning_rate": 6.121219699386874e-05, + "loss": 0.27169249057769773, + "step": 90960 + }, + { + "epoch": 0.3905532229119978, + "grad_norm": 0.04031570255756378, + "learning_rate": 6.120788527375111e-05, + "loss": 0.05566856265068054, + "step": 90970 + }, + { + "epoch": 0.39059615500201783, + "grad_norm": 0.007886065170168877, + "learning_rate": 6.120357355363349e-05, + "loss": 0.004305092990398407, + "step": 90980 + }, + { + "epoch": 0.3906390870920378, + "grad_norm": 0.11100518703460693, + "learning_rate": 6.119926183351587e-05, + "loss": 0.09936892390251159, + "step": 90990 + }, + { + "epoch": 0.3906820191820578, + "grad_norm": 1.3127731084823608, + "learning_rate": 6.119495011339825e-05, + "loss": 0.18303449153900148, + "step": 91000 + }, + { + "epoch": 0.3906820191820578, + "eval_loss": 0.42876288294792175, + "eval_runtime": 27.0908, + "eval_samples_per_second": 3.691, + "eval_steps_per_second": 3.691, + "step": 91000 + }, + { + "epoch": 0.39072495127207785, + "grad_norm": 0.011186370626091957, + "learning_rate": 6.119063839328062e-05, + "loss": 0.1739388585090637, + "step": 91010 + }, + { + "epoch": 0.39076788336209783, + "grad_norm": 0.18774914741516113, + "learning_rate": 6.1186326673163e-05, + "loss": 0.23218233585357667, + "step": 91020 + }, + { + "epoch": 0.3908108154521178, + "grad_norm": 0.02326073683798313, + "learning_rate": 6.118201495304538e-05, + "loss": 0.18793890476226807, + "step": 91030 + }, + { + "epoch": 0.39085374754213786, + "grad_norm": 0.026896804571151733, + "learning_rate": 6.117770323292776e-05, + "loss": 0.11842405796051025, + "step": 91040 + }, + { + "epoch": 0.39089667963215785, + "grad_norm": 5.239316940307617, + "learning_rate": 6.117339151281012e-05, + "loss": 0.3570317983627319, + "step": 91050 + }, + { + "epoch": 0.39093961172217784, + "grad_norm": 0.013899068348109722, + "learning_rate": 6.11690797926925e-05, + "loss": 0.09054339528083802, + "step": 91060 + }, + { + "epoch": 0.3909825438121979, + "grad_norm": 0.0057420190423727036, + "learning_rate": 6.116476807257487e-05, + "loss": 0.045521339774131774, + "step": 91070 + }, + { + "epoch": 0.39102547590221787, + "grad_norm": 0.033044878393411636, + "learning_rate": 6.116045635245725e-05, + "loss": 0.08407972455024719, + "step": 91080 + }, + { + "epoch": 0.39106840799223785, + "grad_norm": 0.008280238136649132, + "learning_rate": 6.115614463233963e-05, + "loss": 0.1877911329269409, + "step": 91090 + }, + { + "epoch": 0.3911113400822579, + "grad_norm": 0.0190195944160223, + "learning_rate": 6.1151832912222e-05, + "loss": 0.12840522527694703, + "step": 91100 + }, + { + "epoch": 0.3911542721722779, + "grad_norm": 0.006106968969106674, + "learning_rate": 6.114752119210438e-05, + "loss": 0.09369821548461914, + "step": 91110 + }, + { + "epoch": 0.39119720426229787, + "grad_norm": 46.82925796508789, + "learning_rate": 6.114320947198676e-05, + "loss": 0.1664884328842163, + "step": 91120 + }, + { + "epoch": 0.3912401363523179, + "grad_norm": 0.0006028416682966053, + "learning_rate": 6.113889775186912e-05, + "loss": 0.20093607902526855, + "step": 91130 + }, + { + "epoch": 0.3912830684423379, + "grad_norm": 3.5816397666931152, + "learning_rate": 6.11345860317515e-05, + "loss": 0.3592935800552368, + "step": 91140 + }, + { + "epoch": 0.39132600053235794, + "grad_norm": 2.122619867324829, + "learning_rate": 6.113027431163388e-05, + "loss": 0.14562952518463135, + "step": 91150 + }, + { + "epoch": 0.39136893262237793, + "grad_norm": 0.0007185607682913542, + "learning_rate": 6.112596259151626e-05, + "loss": 0.1464229106903076, + "step": 91160 + }, + { + "epoch": 0.3914118647123979, + "grad_norm": 0.10905706137418747, + "learning_rate": 6.112165087139863e-05, + "loss": 0.11112669706344605, + "step": 91170 + }, + { + "epoch": 0.39145479680241796, + "grad_norm": 0.8562988638877869, + "learning_rate": 6.111733915128101e-05, + "loss": 0.2025907516479492, + "step": 91180 + }, + { + "epoch": 0.39149772889243795, + "grad_norm": 0.01761394925415516, + "learning_rate": 6.111302743116339e-05, + "loss": 0.10299869775772094, + "step": 91190 + }, + { + "epoch": 0.39154066098245793, + "grad_norm": 2.9216980934143066, + "learning_rate": 6.110871571104577e-05, + "loss": 0.25521130561828614, + "step": 91200 + }, + { + "epoch": 0.391583593072478, + "grad_norm": 0.001002555713057518, + "learning_rate": 6.110440399092814e-05, + "loss": 0.15010665655136107, + "step": 91210 + }, + { + "epoch": 0.39162652516249796, + "grad_norm": 0.005722669418901205, + "learning_rate": 6.110009227081052e-05, + "loss": 0.2383742332458496, + "step": 91220 + }, + { + "epoch": 0.39166945725251795, + "grad_norm": 1.7847390174865723, + "learning_rate": 6.10957805506929e-05, + "loss": 0.19723060131072997, + "step": 91230 + }, + { + "epoch": 0.391712389342538, + "grad_norm": 0.0781240239739418, + "learning_rate": 6.109146883057528e-05, + "loss": 0.2860633373260498, + "step": 91240 + }, + { + "epoch": 0.391755321432558, + "grad_norm": 3.915970802307129, + "learning_rate": 6.108715711045765e-05, + "loss": 0.12680907249450685, + "step": 91250 + }, + { + "epoch": 0.39179825352257797, + "grad_norm": 0.037515632808208466, + "learning_rate": 6.108284539034003e-05, + "loss": 0.3318891763687134, + "step": 91260 + }, + { + "epoch": 0.391841185612598, + "grad_norm": 1.438825249671936, + "learning_rate": 6.107853367022241e-05, + "loss": 0.27229154109954834, + "step": 91270 + }, + { + "epoch": 0.391884117702618, + "grad_norm": 1.8069391250610352, + "learning_rate": 6.107422195010478e-05, + "loss": 0.135508930683136, + "step": 91280 + }, + { + "epoch": 0.391927049792638, + "grad_norm": 0.00529401283711195, + "learning_rate": 6.106991022998716e-05, + "loss": 0.3083492755889893, + "step": 91290 + }, + { + "epoch": 0.391969981882658, + "grad_norm": 0.45584940910339355, + "learning_rate": 6.106559850986953e-05, + "loss": 0.25160813331604004, + "step": 91300 + }, + { + "epoch": 0.392012913972678, + "grad_norm": 0.9806048274040222, + "learning_rate": 6.10612867897519e-05, + "loss": 0.20746960639953613, + "step": 91310 + }, + { + "epoch": 0.392055846062698, + "grad_norm": 0.007097348570823669, + "learning_rate": 6.105697506963428e-05, + "loss": 0.18413705825805665, + "step": 91320 + }, + { + "epoch": 0.39209877815271804, + "grad_norm": 0.017154572531580925, + "learning_rate": 6.105266334951666e-05, + "loss": 0.17170287370681764, + "step": 91330 + }, + { + "epoch": 0.39214171024273803, + "grad_norm": 5.822543621063232, + "learning_rate": 6.104835162939904e-05, + "loss": 0.15484896898269654, + "step": 91340 + }, + { + "epoch": 0.39218464233275807, + "grad_norm": 0.002759866416454315, + "learning_rate": 6.104403990928141e-05, + "loss": 0.0738214910030365, + "step": 91350 + }, + { + "epoch": 0.39222757442277806, + "grad_norm": 0.6158252358436584, + "learning_rate": 6.103972818916379e-05, + "loss": 0.08194655179977417, + "step": 91360 + }, + { + "epoch": 0.39227050651279805, + "grad_norm": 0.1930149346590042, + "learning_rate": 6.103541646904617e-05, + "loss": 0.15161114931106567, + "step": 91370 + }, + { + "epoch": 0.3923134386028181, + "grad_norm": 1.6960185766220093, + "learning_rate": 6.103110474892854e-05, + "loss": 0.2454068899154663, + "step": 91380 + }, + { + "epoch": 0.3923563706928381, + "grad_norm": 0.28160202503204346, + "learning_rate": 6.1026793028810915e-05, + "loss": 0.158779513835907, + "step": 91390 + }, + { + "epoch": 0.39239930278285806, + "grad_norm": 0.050931766629219055, + "learning_rate": 6.102248130869329e-05, + "loss": 0.12442408800125122, + "step": 91400 + }, + { + "epoch": 0.3924422348728781, + "grad_norm": 1.0036672353744507, + "learning_rate": 6.101816958857567e-05, + "loss": 0.1712067246437073, + "step": 91410 + }, + { + "epoch": 0.3924851669628981, + "grad_norm": 0.1027325913310051, + "learning_rate": 6.101385786845805e-05, + "loss": 0.4437578201293945, + "step": 91420 + }, + { + "epoch": 0.3925280990529181, + "grad_norm": 1.9295845031738281, + "learning_rate": 6.1009546148340424e-05, + "loss": 0.4104002475738525, + "step": 91430 + }, + { + "epoch": 0.3925710311429381, + "grad_norm": 0.3841754198074341, + "learning_rate": 6.10052344282228e-05, + "loss": 0.37738420963287356, + "step": 91440 + }, + { + "epoch": 0.3926139632329581, + "grad_norm": 0.07138053327798843, + "learning_rate": 6.100092270810518e-05, + "loss": 0.06622268557548523, + "step": 91450 + }, + { + "epoch": 0.3926568953229781, + "grad_norm": 0.012696065939962864, + "learning_rate": 6.099661098798754e-05, + "loss": 0.1565432906150818, + "step": 91460 + }, + { + "epoch": 0.39269982741299814, + "grad_norm": 8.017718315124512, + "learning_rate": 6.099229926786992e-05, + "loss": 0.20162699222564698, + "step": 91470 + }, + { + "epoch": 0.3927427595030181, + "grad_norm": 0.8579626679420471, + "learning_rate": 6.0987987547752304e-05, + "loss": 0.29092042446136473, + "step": 91480 + }, + { + "epoch": 0.3927856915930381, + "grad_norm": 1.805871605873108, + "learning_rate": 6.098367582763468e-05, + "loss": 0.34696040153503416, + "step": 91490 + }, + { + "epoch": 0.39282862368305815, + "grad_norm": 0.008364181965589523, + "learning_rate": 6.097936410751706e-05, + "loss": 0.24159841537475585, + "step": 91500 + }, + { + "epoch": 0.39287155577307814, + "grad_norm": 0.0062538920901715755, + "learning_rate": 6.0975052387399436e-05, + "loss": 0.211861252784729, + "step": 91510 + }, + { + "epoch": 0.3929144878630981, + "grad_norm": 3.8626821041107178, + "learning_rate": 6.0970740667281814e-05, + "loss": 0.23252692222595214, + "step": 91520 + }, + { + "epoch": 0.39295741995311817, + "grad_norm": 0.014011452905833721, + "learning_rate": 6.096642894716419e-05, + "loss": 0.12460508346557617, + "step": 91530 + }, + { + "epoch": 0.39300035204313816, + "grad_norm": 0.009509447030723095, + "learning_rate": 6.0962117227046555e-05, + "loss": 0.21465637683868408, + "step": 91540 + }, + { + "epoch": 0.39304328413315814, + "grad_norm": 0.0024375561624765396, + "learning_rate": 6.095780550692893e-05, + "loss": 0.21100099086761476, + "step": 91550 + }, + { + "epoch": 0.3930862162231782, + "grad_norm": 4.144960403442383, + "learning_rate": 6.095349378681131e-05, + "loss": 0.2823643684387207, + "step": 91560 + }, + { + "epoch": 0.3931291483131982, + "grad_norm": 1.322467565536499, + "learning_rate": 6.094918206669369e-05, + "loss": 0.08026805520057678, + "step": 91570 + }, + { + "epoch": 0.3931720804032182, + "grad_norm": 0.06816992163658142, + "learning_rate": 6.0944870346576064e-05, + "loss": 0.06737584471702576, + "step": 91580 + }, + { + "epoch": 0.3932150124932382, + "grad_norm": 0.3935220539569855, + "learning_rate": 6.094055862645844e-05, + "loss": 0.12359728813171386, + "step": 91590 + }, + { + "epoch": 0.3932579445832582, + "grad_norm": 2.4706647396087646, + "learning_rate": 6.093624690634082e-05, + "loss": 0.3537791967391968, + "step": 91600 + }, + { + "epoch": 0.39330087667327823, + "grad_norm": 1.1445716619491577, + "learning_rate": 6.0931935186223196e-05, + "loss": 0.1657669425010681, + "step": 91610 + }, + { + "epoch": 0.3933438087632982, + "grad_norm": 0.05122748389840126, + "learning_rate": 6.092762346610557e-05, + "loss": 0.20379807949066162, + "step": 91620 + }, + { + "epoch": 0.3933867408533182, + "grad_norm": 0.1313648372888565, + "learning_rate": 6.0923311745987944e-05, + "loss": 0.09172605872154235, + "step": 91630 + }, + { + "epoch": 0.39342967294333825, + "grad_norm": 0.07003146409988403, + "learning_rate": 6.091900002587032e-05, + "loss": 0.22285842895507812, + "step": 91640 + }, + { + "epoch": 0.39347260503335824, + "grad_norm": 1.9692494869232178, + "learning_rate": 6.09146883057527e-05, + "loss": 0.18839340209960936, + "step": 91650 + }, + { + "epoch": 0.3935155371233782, + "grad_norm": 0.0328078456223011, + "learning_rate": 6.0910376585635076e-05, + "loss": 0.32558040618896483, + "step": 91660 + }, + { + "epoch": 0.39355846921339827, + "grad_norm": 0.007657351437956095, + "learning_rate": 6.0906064865517454e-05, + "loss": 0.16526817083358764, + "step": 91670 + }, + { + "epoch": 0.39360140130341825, + "grad_norm": 0.055285923182964325, + "learning_rate": 6.090175314539983e-05, + "loss": 0.27820913791656493, + "step": 91680 + }, + { + "epoch": 0.39364433339343824, + "grad_norm": 0.49281400442123413, + "learning_rate": 6.089744142528221e-05, + "loss": 0.09620480537414551, + "step": 91690 + }, + { + "epoch": 0.3936872654834583, + "grad_norm": 4.030596733093262, + "learning_rate": 6.0893129705164586e-05, + "loss": 0.35482666492462156, + "step": 91700 + }, + { + "epoch": 0.39373019757347827, + "grad_norm": 2.9866480827331543, + "learning_rate": 6.0888817985046956e-05, + "loss": 0.0979809284210205, + "step": 91710 + }, + { + "epoch": 0.39377312966349826, + "grad_norm": 0.010686771012842655, + "learning_rate": 6.0884506264929333e-05, + "loss": 0.34946651458740235, + "step": 91720 + }, + { + "epoch": 0.3938160617535183, + "grad_norm": 0.002895164769142866, + "learning_rate": 6.088019454481171e-05, + "loss": 0.09829755425453186, + "step": 91730 + }, + { + "epoch": 0.3938589938435383, + "grad_norm": 0.07746174186468124, + "learning_rate": 6.087588282469409e-05, + "loss": 0.3091606378555298, + "step": 91740 + }, + { + "epoch": 0.3939019259335583, + "grad_norm": 0.1524367332458496, + "learning_rate": 6.0871571104576466e-05, + "loss": 0.1508937358856201, + "step": 91750 + }, + { + "epoch": 0.3939448580235783, + "grad_norm": 0.011452741920948029, + "learning_rate": 6.086725938445884e-05, + "loss": 0.317603611946106, + "step": 91760 + }, + { + "epoch": 0.3939877901135983, + "grad_norm": 0.004681314807385206, + "learning_rate": 6.086294766434122e-05, + "loss": 0.291134238243103, + "step": 91770 + }, + { + "epoch": 0.3940307222036183, + "grad_norm": 0.022366631776094437, + "learning_rate": 6.08586359442236e-05, + "loss": 0.2740320682525635, + "step": 91780 + }, + { + "epoch": 0.39407365429363833, + "grad_norm": 3.361569404602051, + "learning_rate": 6.085432422410596e-05, + "loss": 0.4950369358062744, + "step": 91790 + }, + { + "epoch": 0.3941165863836583, + "grad_norm": 0.0028315645176917315, + "learning_rate": 6.085001250398834e-05, + "loss": 0.1323166847229004, + "step": 91800 + }, + { + "epoch": 0.39415951847367836, + "grad_norm": 1.0123827457427979, + "learning_rate": 6.0845700783870716e-05, + "loss": 0.38972649574279783, + "step": 91810 + }, + { + "epoch": 0.39420245056369835, + "grad_norm": 0.039406854659318924, + "learning_rate": 6.084138906375309e-05, + "loss": 0.1629611849784851, + "step": 91820 + }, + { + "epoch": 0.39424538265371833, + "grad_norm": 1.5213974714279175, + "learning_rate": 6.083707734363547e-05, + "loss": 0.4453934669494629, + "step": 91830 + }, + { + "epoch": 0.3942883147437384, + "grad_norm": 2.449429750442505, + "learning_rate": 6.083276562351785e-05, + "loss": 0.32040867805480955, + "step": 91840 + }, + { + "epoch": 0.39433124683375836, + "grad_norm": 0.08724970370531082, + "learning_rate": 6.0828453903400225e-05, + "loss": 0.07561442852020264, + "step": 91850 + }, + { + "epoch": 0.39437417892377835, + "grad_norm": 0.008718972094357014, + "learning_rate": 6.082414218328261e-05, + "loss": 0.30009486675262453, + "step": 91860 + }, + { + "epoch": 0.3944171110137984, + "grad_norm": 0.21262004971504211, + "learning_rate": 6.081983046316497e-05, + "loss": 0.30516757965087893, + "step": 91870 + }, + { + "epoch": 0.3944600431038184, + "grad_norm": 0.03169046714901924, + "learning_rate": 6.081551874304735e-05, + "loss": 0.29369001388549804, + "step": 91880 + }, + { + "epoch": 0.39450297519383837, + "grad_norm": 6.781734943389893, + "learning_rate": 6.081120702292973e-05, + "loss": 0.2241981029510498, + "step": 91890 + }, + { + "epoch": 0.3945459072838584, + "grad_norm": 0.7466356158256531, + "learning_rate": 6.0806895302812105e-05, + "loss": 0.012508706748485565, + "step": 91900 + }, + { + "epoch": 0.3945888393738784, + "grad_norm": 0.011181380599737167, + "learning_rate": 6.080258358269448e-05, + "loss": 0.3503504037857056, + "step": 91910 + }, + { + "epoch": 0.3946317714638984, + "grad_norm": 0.03524165600538254, + "learning_rate": 6.079827186257686e-05, + "loss": 0.006903672963380814, + "step": 91920 + }, + { + "epoch": 0.3946747035539184, + "grad_norm": 0.06213952973484993, + "learning_rate": 6.079396014245924e-05, + "loss": 0.07258799076080322, + "step": 91930 + }, + { + "epoch": 0.3947176356439384, + "grad_norm": 1.010565996170044, + "learning_rate": 6.0789648422341615e-05, + "loss": 0.17450504302978515, + "step": 91940 + }, + { + "epoch": 0.3947605677339584, + "grad_norm": 0.02723545767366886, + "learning_rate": 6.0785336702223985e-05, + "loss": 0.2947016000747681, + "step": 91950 + }, + { + "epoch": 0.39480349982397844, + "grad_norm": 0.00735612353309989, + "learning_rate": 6.078102498210636e-05, + "loss": 0.13675864934921264, + "step": 91960 + }, + { + "epoch": 0.39484643191399843, + "grad_norm": 0.020295286551117897, + "learning_rate": 6.077671326198874e-05, + "loss": 0.15206470489501953, + "step": 91970 + }, + { + "epoch": 0.3948893640040184, + "grad_norm": 0.10978235304355621, + "learning_rate": 6.077240154187112e-05, + "loss": 0.21314818859100343, + "step": 91980 + }, + { + "epoch": 0.39493229609403846, + "grad_norm": 0.025493459776043892, + "learning_rate": 6.0768089821753495e-05, + "loss": 0.03302266895771026, + "step": 91990 + }, + { + "epoch": 0.39497522818405845, + "grad_norm": 0.07763177156448364, + "learning_rate": 6.076377810163587e-05, + "loss": 0.15637160539627076, + "step": 92000 + }, + { + "epoch": 0.39497522818405845, + "eval_loss": 0.43165695667266846, + "eval_runtime": 27.0763, + "eval_samples_per_second": 3.693, + "eval_steps_per_second": 3.693, + "step": 92000 + }, + { + "epoch": 0.3950181602740785, + "grad_norm": 20.20836067199707, + "learning_rate": 6.075946638151825e-05, + "loss": 0.27847981452941895, + "step": 92010 + }, + { + "epoch": 0.3950610923640985, + "grad_norm": 0.032609887421131134, + "learning_rate": 6.075515466140063e-05, + "loss": 0.26609823703765867, + "step": 92020 + }, + { + "epoch": 0.39510402445411846, + "grad_norm": 0.07131468504667282, + "learning_rate": 6.0750842941283004e-05, + "loss": 0.15527513027191162, + "step": 92030 + }, + { + "epoch": 0.3951469565441385, + "grad_norm": 2.1843528747558594, + "learning_rate": 6.074653122116537e-05, + "loss": 0.22598354816436766, + "step": 92040 + }, + { + "epoch": 0.3951898886341585, + "grad_norm": 1.2208198308944702, + "learning_rate": 6.0742219501047745e-05, + "loss": 0.19779176712036134, + "step": 92050 + }, + { + "epoch": 0.3952328207241785, + "grad_norm": 0.03513918071985245, + "learning_rate": 6.073790778093012e-05, + "loss": 0.08396974802017212, + "step": 92060 + }, + { + "epoch": 0.3952757528141985, + "grad_norm": 0.7641962766647339, + "learning_rate": 6.07335960608125e-05, + "loss": 0.2717055082321167, + "step": 92070 + }, + { + "epoch": 0.3953186849042185, + "grad_norm": 1.8674321174621582, + "learning_rate": 6.0729284340694884e-05, + "loss": 0.24090301990509033, + "step": 92080 + }, + { + "epoch": 0.3953616169942385, + "grad_norm": 1.9578057527542114, + "learning_rate": 6.072497262057726e-05, + "loss": 0.24970598220825196, + "step": 92090 + }, + { + "epoch": 0.39540454908425854, + "grad_norm": 2.490677833557129, + "learning_rate": 6.072066090045964e-05, + "loss": 0.25661425590515136, + "step": 92100 + }, + { + "epoch": 0.3954474811742785, + "grad_norm": 19.66390037536621, + "learning_rate": 6.0716349180342016e-05, + "loss": 0.16211674213409424, + "step": 92110 + }, + { + "epoch": 0.3954904132642985, + "grad_norm": 0.046901851892471313, + "learning_rate": 6.071203746022438e-05, + "loss": 0.05299463272094727, + "step": 92120 + }, + { + "epoch": 0.39553334535431856, + "grad_norm": 1.2311347723007202, + "learning_rate": 6.070772574010676e-05, + "loss": 0.1265595555305481, + "step": 92130 + }, + { + "epoch": 0.39557627744433854, + "grad_norm": 1.5069876909255981, + "learning_rate": 6.0703414019989134e-05, + "loss": 0.25828940868377687, + "step": 92140 + }, + { + "epoch": 0.39561920953435853, + "grad_norm": 0.444572776556015, + "learning_rate": 6.069910229987151e-05, + "loss": 0.09917322397232056, + "step": 92150 + }, + { + "epoch": 0.39566214162437857, + "grad_norm": 0.0789375901222229, + "learning_rate": 6.069479057975389e-05, + "loss": 0.29941983222961427, + "step": 92160 + }, + { + "epoch": 0.39570507371439856, + "grad_norm": 1.7957231998443604, + "learning_rate": 6.0690478859636266e-05, + "loss": 0.2737236976623535, + "step": 92170 + }, + { + "epoch": 0.39574800580441855, + "grad_norm": 0.04896816238760948, + "learning_rate": 6.0686167139518644e-05, + "loss": 0.08509291410446167, + "step": 92180 + }, + { + "epoch": 0.3957909378944386, + "grad_norm": 0.020770037546753883, + "learning_rate": 6.068185541940102e-05, + "loss": 0.4327548980712891, + "step": 92190 + }, + { + "epoch": 0.3958338699844586, + "grad_norm": 0.5426750779151917, + "learning_rate": 6.067754369928339e-05, + "loss": 0.3243721008300781, + "step": 92200 + }, + { + "epoch": 0.39587680207447856, + "grad_norm": 16.816232681274414, + "learning_rate": 6.067323197916577e-05, + "loss": 0.18375918865203858, + "step": 92210 + }, + { + "epoch": 0.3959197341644986, + "grad_norm": 0.09073749929666519, + "learning_rate": 6.0668920259048146e-05, + "loss": 0.0876370370388031, + "step": 92220 + }, + { + "epoch": 0.3959626662545186, + "grad_norm": 0.7398306727409363, + "learning_rate": 6.0664608538930524e-05, + "loss": 0.18924959897994995, + "step": 92230 + }, + { + "epoch": 0.39600559834453863, + "grad_norm": 0.16764898598194122, + "learning_rate": 6.06602968188129e-05, + "loss": 0.16392149925231933, + "step": 92240 + }, + { + "epoch": 0.3960485304345586, + "grad_norm": 0.027810113504529, + "learning_rate": 6.065598509869528e-05, + "loss": 0.3249415636062622, + "step": 92250 + }, + { + "epoch": 0.3960914625245786, + "grad_norm": 0.11741064488887787, + "learning_rate": 6.0651673378577656e-05, + "loss": 0.17678941488265992, + "step": 92260 + }, + { + "epoch": 0.39613439461459865, + "grad_norm": 0.01126841176301241, + "learning_rate": 6.064736165846003e-05, + "loss": 0.1792654275894165, + "step": 92270 + }, + { + "epoch": 0.39617732670461864, + "grad_norm": 0.0017827929696068168, + "learning_rate": 6.06430499383424e-05, + "loss": 0.19189642667770385, + "step": 92280 + }, + { + "epoch": 0.3962202587946386, + "grad_norm": 2.9198524951934814, + "learning_rate": 6.0638738218224774e-05, + "loss": 0.2880232334136963, + "step": 92290 + }, + { + "epoch": 0.39626319088465867, + "grad_norm": 0.021255875006318092, + "learning_rate": 6.063442649810716e-05, + "loss": 0.1363054871559143, + "step": 92300 + }, + { + "epoch": 0.39630612297467865, + "grad_norm": 0.002346013905480504, + "learning_rate": 6.0630114777989536e-05, + "loss": 0.31574110984802245, + "step": 92310 + }, + { + "epoch": 0.39634905506469864, + "grad_norm": 1.1382430791854858, + "learning_rate": 6.062580305787191e-05, + "loss": 0.2634714365005493, + "step": 92320 + }, + { + "epoch": 0.3963919871547187, + "grad_norm": 0.002902445150539279, + "learning_rate": 6.062149133775429e-05, + "loss": 0.24987235069274902, + "step": 92330 + }, + { + "epoch": 0.39643491924473867, + "grad_norm": 0.0020218868739902973, + "learning_rate": 6.061717961763667e-05, + "loss": 0.22082626819610596, + "step": 92340 + }, + { + "epoch": 0.39647785133475866, + "grad_norm": 0.3430553078651428, + "learning_rate": 6.0612867897519045e-05, + "loss": 0.3638188362121582, + "step": 92350 + }, + { + "epoch": 0.3965207834247787, + "grad_norm": 0.0021895577665418386, + "learning_rate": 6.060855617740141e-05, + "loss": 0.11037677526473999, + "step": 92360 + }, + { + "epoch": 0.3965637155147987, + "grad_norm": 0.8205850124359131, + "learning_rate": 6.0604244457283786e-05, + "loss": 0.2192380666732788, + "step": 92370 + }, + { + "epoch": 0.3966066476048187, + "grad_norm": 2.2314140796661377, + "learning_rate": 6.0599932737166164e-05, + "loss": 0.20187749862670898, + "step": 92380 + }, + { + "epoch": 0.3966495796948387, + "grad_norm": 3.9422032833099365, + "learning_rate": 6.059562101704854e-05, + "loss": 0.15174542665481566, + "step": 92390 + }, + { + "epoch": 0.3966925117848587, + "grad_norm": 0.10271573066711426, + "learning_rate": 6.059130929693092e-05, + "loss": 0.22555267810821533, + "step": 92400 + }, + { + "epoch": 0.3967354438748787, + "grad_norm": 0.29038718342781067, + "learning_rate": 6.0586997576813296e-05, + "loss": 0.2885429382324219, + "step": 92410 + }, + { + "epoch": 0.39677837596489873, + "grad_norm": 0.03839349001646042, + "learning_rate": 6.058268585669567e-05, + "loss": 0.25131521224975584, + "step": 92420 + }, + { + "epoch": 0.3968213080549187, + "grad_norm": 0.4567015767097473, + "learning_rate": 6.057837413657805e-05, + "loss": 0.1675976276397705, + "step": 92430 + }, + { + "epoch": 0.39686424014493876, + "grad_norm": 1.6474500894546509, + "learning_rate": 6.057406241646043e-05, + "loss": 0.247251296043396, + "step": 92440 + }, + { + "epoch": 0.39690717223495875, + "grad_norm": 3.3086657524108887, + "learning_rate": 6.05697506963428e-05, + "loss": 0.2922791004180908, + "step": 92450 + }, + { + "epoch": 0.39695010432497874, + "grad_norm": 1.4948768615722656, + "learning_rate": 6.0565438976225176e-05, + "loss": 0.34301207065582273, + "step": 92460 + }, + { + "epoch": 0.3969930364149988, + "grad_norm": 0.05721181258559227, + "learning_rate": 6.056112725610755e-05, + "loss": 0.32348856925964353, + "step": 92470 + }, + { + "epoch": 0.39703596850501877, + "grad_norm": 0.10996733605861664, + "learning_rate": 6.055681553598993e-05, + "loss": 0.23233428001403808, + "step": 92480 + }, + { + "epoch": 0.39707890059503875, + "grad_norm": 0.6285014152526855, + "learning_rate": 6.055250381587231e-05, + "loss": 0.28221845626831055, + "step": 92490 + }, + { + "epoch": 0.3971218326850588, + "grad_norm": 7.208746910095215, + "learning_rate": 6.0548192095754685e-05, + "loss": 0.48242778778076173, + "step": 92500 + }, + { + "epoch": 0.3971647647750788, + "grad_norm": 6.577877044677734, + "learning_rate": 6.054388037563706e-05, + "loss": 0.25366084575653075, + "step": 92510 + }, + { + "epoch": 0.39720769686509877, + "grad_norm": 0.10521818697452545, + "learning_rate": 6.053956865551944e-05, + "loss": 0.3314534664154053, + "step": 92520 + }, + { + "epoch": 0.3972506289551188, + "grad_norm": 1.7988253831863403, + "learning_rate": 6.053525693540181e-05, + "loss": 0.4622535228729248, + "step": 92530 + }, + { + "epoch": 0.3972935610451388, + "grad_norm": 0.1328478753566742, + "learning_rate": 6.053094521528419e-05, + "loss": 0.18051735162734986, + "step": 92540 + }, + { + "epoch": 0.3973364931351588, + "grad_norm": 1.4796525239944458, + "learning_rate": 6.0526633495166565e-05, + "loss": 0.20069704055786133, + "step": 92550 + }, + { + "epoch": 0.39737942522517883, + "grad_norm": 0.2786750793457031, + "learning_rate": 6.052232177504894e-05, + "loss": 0.32162978649139407, + "step": 92560 + }, + { + "epoch": 0.3974223573151988, + "grad_norm": 0.107948899269104, + "learning_rate": 6.051801005493132e-05, + "loss": 0.3327937602996826, + "step": 92570 + }, + { + "epoch": 0.3974652894052188, + "grad_norm": 0.012413183227181435, + "learning_rate": 6.05136983348137e-05, + "loss": 0.2087266206741333, + "step": 92580 + }, + { + "epoch": 0.39750822149523884, + "grad_norm": 0.7624533176422119, + "learning_rate": 6.0509386614696074e-05, + "loss": 0.302428150177002, + "step": 92590 + }, + { + "epoch": 0.39755115358525883, + "grad_norm": 0.001710769603960216, + "learning_rate": 6.050507489457845e-05, + "loss": 0.053492683172225955, + "step": 92600 + }, + { + "epoch": 0.3975940856752788, + "grad_norm": 0.037044957280159, + "learning_rate": 6.0500763174460815e-05, + "loss": 0.27043685913085935, + "step": 92610 + }, + { + "epoch": 0.39763701776529886, + "grad_norm": 0.39931926131248474, + "learning_rate": 6.049645145434319e-05, + "loss": 0.2734602689743042, + "step": 92620 + }, + { + "epoch": 0.39767994985531885, + "grad_norm": 0.019506726413965225, + "learning_rate": 6.049213973422557e-05, + "loss": 0.1141431450843811, + "step": 92630 + }, + { + "epoch": 0.39772288194533884, + "grad_norm": 0.001194458338432014, + "learning_rate": 6.048782801410795e-05, + "loss": 0.09379836320877075, + "step": 92640 + }, + { + "epoch": 0.3977658140353589, + "grad_norm": 0.011507615447044373, + "learning_rate": 6.0483516293990325e-05, + "loss": 0.10641751289367676, + "step": 92650 + }, + { + "epoch": 0.39780874612537886, + "grad_norm": 0.9543510675430298, + "learning_rate": 6.04792045738727e-05, + "loss": 0.3608994483947754, + "step": 92660 + }, + { + "epoch": 0.3978516782153989, + "grad_norm": 0.00274080503731966, + "learning_rate": 6.0474892853755086e-05, + "loss": 0.20310893058776855, + "step": 92670 + }, + { + "epoch": 0.3978946103054189, + "grad_norm": 0.0007183632696978748, + "learning_rate": 6.0470581133637464e-05, + "loss": 0.36690948009490965, + "step": 92680 + }, + { + "epoch": 0.3979375423954389, + "grad_norm": 0.03603541851043701, + "learning_rate": 6.046626941351983e-05, + "loss": 0.1928205370903015, + "step": 92690 + }, + { + "epoch": 0.3979804744854589, + "grad_norm": 3.2046563625335693, + "learning_rate": 6.0461957693402205e-05, + "loss": 0.2937966823577881, + "step": 92700 + }, + { + "epoch": 0.3980234065754789, + "grad_norm": 3.496809959411621, + "learning_rate": 6.045764597328458e-05, + "loss": 0.27285568714141845, + "step": 92710 + }, + { + "epoch": 0.3980663386654989, + "grad_norm": 0.005234590731561184, + "learning_rate": 6.045333425316696e-05, + "loss": 0.31596553325653076, + "step": 92720 + }, + { + "epoch": 0.39810927075551894, + "grad_norm": 0.9226164817810059, + "learning_rate": 6.044902253304934e-05, + "loss": 0.2640789747238159, + "step": 92730 + }, + { + "epoch": 0.3981522028455389, + "grad_norm": 0.05384889245033264, + "learning_rate": 6.0444710812931714e-05, + "loss": 0.18334288597106935, + "step": 92740 + }, + { + "epoch": 0.3981951349355589, + "grad_norm": 1.8284244537353516, + "learning_rate": 6.044039909281409e-05, + "loss": 0.267243480682373, + "step": 92750 + }, + { + "epoch": 0.39823806702557896, + "grad_norm": 0.17038202285766602, + "learning_rate": 6.043608737269647e-05, + "loss": 0.25903208255767823, + "step": 92760 + }, + { + "epoch": 0.39828099911559894, + "grad_norm": 0.10753041505813599, + "learning_rate": 6.0431775652578846e-05, + "loss": 0.10099961757659912, + "step": 92770 + }, + { + "epoch": 0.39832393120561893, + "grad_norm": 0.1653003692626953, + "learning_rate": 6.042746393246122e-05, + "loss": 0.17818518877029418, + "step": 92780 + }, + { + "epoch": 0.398366863295639, + "grad_norm": 5.727450847625732, + "learning_rate": 6.0423152212343594e-05, + "loss": 0.21343677043914794, + "step": 92790 + }, + { + "epoch": 0.39840979538565896, + "grad_norm": 0.0009643832454457879, + "learning_rate": 6.041884049222597e-05, + "loss": 0.24222161769866943, + "step": 92800 + }, + { + "epoch": 0.39845272747567895, + "grad_norm": 1.5220520496368408, + "learning_rate": 6.041452877210835e-05, + "loss": 0.11254243850708008, + "step": 92810 + }, + { + "epoch": 0.398495659565699, + "grad_norm": 0.029565613716840744, + "learning_rate": 6.0410217051990726e-05, + "loss": 0.011484414339065552, + "step": 92820 + }, + { + "epoch": 0.398538591655719, + "grad_norm": 0.001762153347954154, + "learning_rate": 6.04059053318731e-05, + "loss": 0.2063227891921997, + "step": 92830 + }, + { + "epoch": 0.39858152374573896, + "grad_norm": 0.02957475185394287, + "learning_rate": 6.040159361175548e-05, + "loss": 0.0242899626493454, + "step": 92840 + }, + { + "epoch": 0.398624455835759, + "grad_norm": 0.0029483300168067217, + "learning_rate": 6.039728189163786e-05, + "loss": 0.3035967588424683, + "step": 92850 + }, + { + "epoch": 0.398667387925779, + "grad_norm": 0.05529223382472992, + "learning_rate": 6.039297017152022e-05, + "loss": 0.30932021141052246, + "step": 92860 + }, + { + "epoch": 0.39871032001579904, + "grad_norm": 14.569122314453125, + "learning_rate": 6.03886584514026e-05, + "loss": 0.13557947874069215, + "step": 92870 + }, + { + "epoch": 0.398753252105819, + "grad_norm": 2.028317451477051, + "learning_rate": 6.0384346731284977e-05, + "loss": 0.41254119873046874, + "step": 92880 + }, + { + "epoch": 0.398796184195839, + "grad_norm": 5.376577854156494, + "learning_rate": 6.038003501116736e-05, + "loss": 0.47383694648742675, + "step": 92890 + }, + { + "epoch": 0.39883911628585905, + "grad_norm": 47.30210876464844, + "learning_rate": 6.037572329104974e-05, + "loss": 0.23314952850341797, + "step": 92900 + }, + { + "epoch": 0.39888204837587904, + "grad_norm": 0.0007397299632430077, + "learning_rate": 6.0371411570932115e-05, + "loss": 0.10217365026473998, + "step": 92910 + }, + { + "epoch": 0.398924980465899, + "grad_norm": 0.045869532972574234, + "learning_rate": 6.036709985081449e-05, + "loss": 0.13935530185699463, + "step": 92920 + }, + { + "epoch": 0.39896791255591907, + "grad_norm": 0.3745325803756714, + "learning_rate": 6.036278813069687e-05, + "loss": 0.3490712881088257, + "step": 92930 + }, + { + "epoch": 0.39901084464593906, + "grad_norm": 1.1765798330307007, + "learning_rate": 6.0358476410579234e-05, + "loss": 0.4222440242767334, + "step": 92940 + }, + { + "epoch": 0.39905377673595904, + "grad_norm": 4.688851833343506, + "learning_rate": 6.035416469046161e-05, + "loss": 0.37330188751220705, + "step": 92950 + }, + { + "epoch": 0.3990967088259791, + "grad_norm": 0.005485413130372763, + "learning_rate": 6.034985297034399e-05, + "loss": 0.2636329412460327, + "step": 92960 + }, + { + "epoch": 0.39913964091599907, + "grad_norm": 0.1594797968864441, + "learning_rate": 6.0345541250226366e-05, + "loss": 0.22802400588989258, + "step": 92970 + }, + { + "epoch": 0.39918257300601906, + "grad_norm": 0.018702786415815353, + "learning_rate": 6.034122953010874e-05, + "loss": 0.19402458667755126, + "step": 92980 + }, + { + "epoch": 0.3992255050960391, + "grad_norm": 0.49566909670829773, + "learning_rate": 6.033691780999112e-05, + "loss": 0.1691906452178955, + "step": 92990 + }, + { + "epoch": 0.3992684371860591, + "grad_norm": 0.5254350304603577, + "learning_rate": 6.03326060898735e-05, + "loss": 0.20444183349609374, + "step": 93000 + }, + { + "epoch": 0.3992684371860591, + "eval_loss": 0.42532363533973694, + "eval_runtime": 27.1264, + "eval_samples_per_second": 3.686, + "eval_steps_per_second": 3.686, + "step": 93000 + }, + { + "epoch": 0.3993113692760791, + "grad_norm": 0.021253425627946854, + "learning_rate": 6.0328294369755875e-05, + "loss": 0.18103508949279784, + "step": 93010 + }, + { + "epoch": 0.3993543013660991, + "grad_norm": 0.04606667160987854, + "learning_rate": 6.0323982649638246e-05, + "loss": 0.00736895427107811, + "step": 93020 + }, + { + "epoch": 0.3993972334561191, + "grad_norm": 2.99416446685791, + "learning_rate": 6.031967092952062e-05, + "loss": 0.19407373666763306, + "step": 93030 + }, + { + "epoch": 0.3994401655461391, + "grad_norm": 4.798525333404541, + "learning_rate": 6.0315359209403e-05, + "loss": 0.20663731098175048, + "step": 93040 + }, + { + "epoch": 0.39948309763615913, + "grad_norm": 0.004676021169871092, + "learning_rate": 6.031104748928538e-05, + "loss": 0.22965459823608397, + "step": 93050 + }, + { + "epoch": 0.3995260297261791, + "grad_norm": 0.027508612722158432, + "learning_rate": 6.0306735769167755e-05, + "loss": 0.3717710018157959, + "step": 93060 + }, + { + "epoch": 0.3995689618161991, + "grad_norm": 0.7421900629997253, + "learning_rate": 6.030242404905013e-05, + "loss": 0.18571395874023439, + "step": 93070 + }, + { + "epoch": 0.39961189390621915, + "grad_norm": 1.7135603427886963, + "learning_rate": 6.029811232893251e-05, + "loss": 0.13353700637817384, + "step": 93080 + }, + { + "epoch": 0.39965482599623914, + "grad_norm": 0.02101057395339012, + "learning_rate": 6.029380060881489e-05, + "loss": 0.19278452396392823, + "step": 93090 + }, + { + "epoch": 0.3996977580862592, + "grad_norm": 1.4164752960205078, + "learning_rate": 6.0289488888697265e-05, + "loss": 0.2833911657333374, + "step": 93100 + }, + { + "epoch": 0.39974069017627917, + "grad_norm": 0.5472753643989563, + "learning_rate": 6.0285177168579635e-05, + "loss": 0.3730371713638306, + "step": 93110 + }, + { + "epoch": 0.39978362226629915, + "grad_norm": 4.260237216949463, + "learning_rate": 6.028086544846201e-05, + "loss": 0.0819745659828186, + "step": 93120 + }, + { + "epoch": 0.3998265543563192, + "grad_norm": 0.03258398547768593, + "learning_rate": 6.027655372834439e-05, + "loss": 0.41202602386474607, + "step": 93130 + }, + { + "epoch": 0.3998694864463392, + "grad_norm": 0.012231654487550259, + "learning_rate": 6.027224200822677e-05, + "loss": 0.054266971349716184, + "step": 93140 + }, + { + "epoch": 0.39991241853635917, + "grad_norm": 0.02671731263399124, + "learning_rate": 6.0267930288109144e-05, + "loss": 0.13896874189376832, + "step": 93150 + }, + { + "epoch": 0.3999553506263792, + "grad_norm": 0.008304188027977943, + "learning_rate": 6.026361856799152e-05, + "loss": 0.3450723171234131, + "step": 93160 + }, + { + "epoch": 0.3999982827163992, + "grad_norm": 0.0143406568095088, + "learning_rate": 6.02593068478739e-05, + "loss": 0.13900293111801149, + "step": 93170 + }, + { + "epoch": 0.4000412148064192, + "grad_norm": 0.0057134004309773445, + "learning_rate": 6.0254995127756276e-05, + "loss": 0.21512312889099122, + "step": 93180 + }, + { + "epoch": 0.40008414689643923, + "grad_norm": 1.5521478652954102, + "learning_rate": 6.025068340763864e-05, + "loss": 0.20870733261108398, + "step": 93190 + }, + { + "epoch": 0.4001270789864592, + "grad_norm": 0.022138668224215508, + "learning_rate": 6.024637168752102e-05, + "loss": 0.34106059074401857, + "step": 93200 + }, + { + "epoch": 0.4001700110764792, + "grad_norm": 0.04387650266289711, + "learning_rate": 6.0242059967403395e-05, + "loss": 0.0750705897808075, + "step": 93210 + }, + { + "epoch": 0.40021294316649925, + "grad_norm": 1.2559690475463867, + "learning_rate": 6.023774824728577e-05, + "loss": 0.3357060432434082, + "step": 93220 + }, + { + "epoch": 0.40025587525651923, + "grad_norm": 0.3811001479625702, + "learning_rate": 6.023343652716815e-05, + "loss": 0.009803864359855651, + "step": 93230 + }, + { + "epoch": 0.4002988073465392, + "grad_norm": 9.359700202941895, + "learning_rate": 6.022912480705053e-05, + "loss": 0.2810043811798096, + "step": 93240 + }, + { + "epoch": 0.40034173943655926, + "grad_norm": 0.04405032843351364, + "learning_rate": 6.0224813086932904e-05, + "loss": 0.4757356643676758, + "step": 93250 + }, + { + "epoch": 0.40038467152657925, + "grad_norm": 0.004825854208320379, + "learning_rate": 6.022050136681529e-05, + "loss": 0.06741368770599365, + "step": 93260 + }, + { + "epoch": 0.40042760361659924, + "grad_norm": 0.03525272756814957, + "learning_rate": 6.021618964669765e-05, + "loss": 0.1233631730079651, + "step": 93270 + }, + { + "epoch": 0.4004705357066193, + "grad_norm": 9.752348899841309, + "learning_rate": 6.021187792658003e-05, + "loss": 0.3055421829223633, + "step": 93280 + }, + { + "epoch": 0.40051346779663927, + "grad_norm": 0.08953861892223358, + "learning_rate": 6.020756620646241e-05, + "loss": 0.12309778928756714, + "step": 93290 + }, + { + "epoch": 0.4005563998866593, + "grad_norm": 4.488521575927734, + "learning_rate": 6.0203254486344784e-05, + "loss": 0.2833517789840698, + "step": 93300 + }, + { + "epoch": 0.4005993319766793, + "grad_norm": 6.068156719207764, + "learning_rate": 6.019894276622716e-05, + "loss": 0.1711531400680542, + "step": 93310 + }, + { + "epoch": 0.4006422640666993, + "grad_norm": 3.3427748680114746, + "learning_rate": 6.019463104610954e-05, + "loss": 0.21299960613250732, + "step": 93320 + }, + { + "epoch": 0.4006851961567193, + "grad_norm": 1.6745113134384155, + "learning_rate": 6.0190319325991916e-05, + "loss": 0.1890343427658081, + "step": 93330 + }, + { + "epoch": 0.4007281282467393, + "grad_norm": 0.08644956350326538, + "learning_rate": 6.0186007605874294e-05, + "loss": 0.15870969295501708, + "step": 93340 + }, + { + "epoch": 0.4007710603367593, + "grad_norm": 0.10776764899492264, + "learning_rate": 6.0181695885756664e-05, + "loss": 0.24700050354003905, + "step": 93350 + }, + { + "epoch": 0.40081399242677934, + "grad_norm": 0.008111716248095036, + "learning_rate": 6.017738416563904e-05, + "loss": 0.20516328811645507, + "step": 93360 + }, + { + "epoch": 0.40085692451679933, + "grad_norm": 0.1302001029253006, + "learning_rate": 6.017307244552142e-05, + "loss": 0.21231431961059571, + "step": 93370 + }, + { + "epoch": 0.4008998566068193, + "grad_norm": 0.0018937139539048076, + "learning_rate": 6.0168760725403796e-05, + "loss": 0.1605125904083252, + "step": 93380 + }, + { + "epoch": 0.40094278869683936, + "grad_norm": 0.510441243648529, + "learning_rate": 6.0164449005286174e-05, + "loss": 0.24825668334960938, + "step": 93390 + }, + { + "epoch": 0.40098572078685935, + "grad_norm": 0.051314398646354675, + "learning_rate": 6.016013728516855e-05, + "loss": 0.2400137424468994, + "step": 93400 + }, + { + "epoch": 0.40102865287687933, + "grad_norm": 0.021915512159466743, + "learning_rate": 6.015582556505093e-05, + "loss": 0.14309494495391845, + "step": 93410 + }, + { + "epoch": 0.4010715849668994, + "grad_norm": 0.7100977897644043, + "learning_rate": 6.0151513844933306e-05, + "loss": 0.3975811243057251, + "step": 93420 + }, + { + "epoch": 0.40111451705691936, + "grad_norm": 0.010824207216501236, + "learning_rate": 6.014720212481567e-05, + "loss": 0.3093941450119019, + "step": 93430 + }, + { + "epoch": 0.40115744914693935, + "grad_norm": 0.038194987922906876, + "learning_rate": 6.014289040469805e-05, + "loss": 0.34295101165771485, + "step": 93440 + }, + { + "epoch": 0.4012003812369594, + "grad_norm": 0.03417959436774254, + "learning_rate": 6.0138578684580424e-05, + "loss": 0.03890349268913269, + "step": 93450 + }, + { + "epoch": 0.4012433133269794, + "grad_norm": 0.007218822371214628, + "learning_rate": 6.01342669644628e-05, + "loss": 0.04127689599990845, + "step": 93460 + }, + { + "epoch": 0.40128624541699937, + "grad_norm": 1.0656627416610718, + "learning_rate": 6.012995524434518e-05, + "loss": 0.24317021369934083, + "step": 93470 + }, + { + "epoch": 0.4013291775070194, + "grad_norm": 0.0067567359656095505, + "learning_rate": 6.012564352422756e-05, + "loss": 0.25871007442474364, + "step": 93480 + }, + { + "epoch": 0.4013721095970394, + "grad_norm": 0.0030029634945094585, + "learning_rate": 6.012133180410994e-05, + "loss": 0.19204816818237305, + "step": 93490 + }, + { + "epoch": 0.4014150416870594, + "grad_norm": 4.520451068878174, + "learning_rate": 6.011702008399232e-05, + "loss": 0.22543861865997314, + "step": 93500 + }, + { + "epoch": 0.4014579737770794, + "grad_norm": 0.6664943099021912, + "learning_rate": 6.0112708363874695e-05, + "loss": 0.09824045896530151, + "step": 93510 + }, + { + "epoch": 0.4015009058670994, + "grad_norm": 3.1957309246063232, + "learning_rate": 6.010839664375706e-05, + "loss": 0.26783323287963867, + "step": 93520 + }, + { + "epoch": 0.40154383795711945, + "grad_norm": 2.948349952697754, + "learning_rate": 6.0104084923639436e-05, + "loss": 0.1502614974975586, + "step": 93530 + }, + { + "epoch": 0.40158677004713944, + "grad_norm": 0.17878828942775726, + "learning_rate": 6.009977320352181e-05, + "loss": 0.0759764313697815, + "step": 93540 + }, + { + "epoch": 0.40162970213715943, + "grad_norm": 1.7953228950500488, + "learning_rate": 6.009546148340419e-05, + "loss": 0.3401905059814453, + "step": 93550 + }, + { + "epoch": 0.40167263422717947, + "grad_norm": 0.6826522946357727, + "learning_rate": 6.009114976328657e-05, + "loss": 0.21881649494171143, + "step": 93560 + }, + { + "epoch": 0.40171556631719946, + "grad_norm": 0.2658919095993042, + "learning_rate": 6.0086838043168945e-05, + "loss": 0.04690901637077331, + "step": 93570 + }, + { + "epoch": 0.40175849840721944, + "grad_norm": 1.6003676652908325, + "learning_rate": 6.008252632305132e-05, + "loss": 0.33685760498046874, + "step": 93580 + }, + { + "epoch": 0.4018014304972395, + "grad_norm": 0.014093033969402313, + "learning_rate": 6.00782146029337e-05, + "loss": 0.06862253546714783, + "step": 93590 + }, + { + "epoch": 0.4018443625872595, + "grad_norm": 0.03570076823234558, + "learning_rate": 6.007390288281607e-05, + "loss": 0.14127914905548095, + "step": 93600 + }, + { + "epoch": 0.40188729467727946, + "grad_norm": 1.6219851970672607, + "learning_rate": 6.006959116269845e-05, + "loss": 0.2750361919403076, + "step": 93610 + }, + { + "epoch": 0.4019302267672995, + "grad_norm": 0.13300123810768127, + "learning_rate": 6.0065279442580825e-05, + "loss": 0.18249515295028687, + "step": 93620 + }, + { + "epoch": 0.4019731588573195, + "grad_norm": 0.015724139288067818, + "learning_rate": 6.00609677224632e-05, + "loss": 0.14590342044830323, + "step": 93630 + }, + { + "epoch": 0.4020160909473395, + "grad_norm": 0.49315378069877625, + "learning_rate": 6.005665600234558e-05, + "loss": 0.13075822591781616, + "step": 93640 + }, + { + "epoch": 0.4020590230373595, + "grad_norm": 0.05315891280770302, + "learning_rate": 6.005234428222796e-05, + "loss": 0.19189344644546508, + "step": 93650 + }, + { + "epoch": 0.4021019551273795, + "grad_norm": 1.898503303527832, + "learning_rate": 6.0048032562110335e-05, + "loss": 0.10279072523117065, + "step": 93660 + }, + { + "epoch": 0.4021448872173995, + "grad_norm": 3.9733712673187256, + "learning_rate": 6.004372084199271e-05, + "loss": 0.3545947551727295, + "step": 93670 + }, + { + "epoch": 0.40218781930741954, + "grad_norm": 3.26906681060791, + "learning_rate": 6.0039409121875076e-05, + "loss": 0.5459408283233642, + "step": 93680 + }, + { + "epoch": 0.4022307513974395, + "grad_norm": 1.936118483543396, + "learning_rate": 6.003509740175745e-05, + "loss": 0.18651505708694457, + "step": 93690 + }, + { + "epoch": 0.4022736834874595, + "grad_norm": 4.495886325836182, + "learning_rate": 6.003078568163984e-05, + "loss": 0.23359956741333007, + "step": 93700 + }, + { + "epoch": 0.40231661557747955, + "grad_norm": 0.06830295920372009, + "learning_rate": 6.0026473961522215e-05, + "loss": 0.28993537425994875, + "step": 93710 + }, + { + "epoch": 0.40235954766749954, + "grad_norm": 0.10418267548084259, + "learning_rate": 6.002216224140459e-05, + "loss": 0.2117173671722412, + "step": 93720 + }, + { + "epoch": 0.4024024797575196, + "grad_norm": 5.263204097747803, + "learning_rate": 6.001785052128697e-05, + "loss": 0.097961688041687, + "step": 93730 + }, + { + "epoch": 0.40244541184753957, + "grad_norm": 0.30054160952568054, + "learning_rate": 6.001353880116935e-05, + "loss": 0.1039130449295044, + "step": 93740 + }, + { + "epoch": 0.40248834393755956, + "grad_norm": 1.7757928371429443, + "learning_rate": 6.0009227081051724e-05, + "loss": 0.13803932666778565, + "step": 93750 + }, + { + "epoch": 0.4025312760275796, + "grad_norm": 0.01852819323539734, + "learning_rate": 6.000491536093409e-05, + "loss": 0.10215220451354981, + "step": 93760 + }, + { + "epoch": 0.4025742081175996, + "grad_norm": 2.4717369079589844, + "learning_rate": 6.0000603640816465e-05, + "loss": 0.344983434677124, + "step": 93770 + }, + { + "epoch": 0.4026171402076196, + "grad_norm": 0.09923583269119263, + "learning_rate": 5.999629192069884e-05, + "loss": 0.2237044095993042, + "step": 93780 + }, + { + "epoch": 0.4026600722976396, + "grad_norm": 2.792940139770508, + "learning_rate": 5.999198020058122e-05, + "loss": 0.3041109800338745, + "step": 93790 + }, + { + "epoch": 0.4027030043876596, + "grad_norm": 1.725498080253601, + "learning_rate": 5.99876684804636e-05, + "loss": 0.056520164012908936, + "step": 93800 + }, + { + "epoch": 0.4027459364776796, + "grad_norm": 0.30796095728874207, + "learning_rate": 5.9983356760345975e-05, + "loss": 0.2576672792434692, + "step": 93810 + }, + { + "epoch": 0.40278886856769963, + "grad_norm": 1.5174881219863892, + "learning_rate": 5.997904504022835e-05, + "loss": 0.23054022789001466, + "step": 93820 + }, + { + "epoch": 0.4028318006577196, + "grad_norm": 0.016648801043629646, + "learning_rate": 5.997473332011073e-05, + "loss": 0.3647672891616821, + "step": 93830 + }, + { + "epoch": 0.4028747327477396, + "grad_norm": 1.8972567319869995, + "learning_rate": 5.9970421599993107e-05, + "loss": 0.24868409633636473, + "step": 93840 + }, + { + "epoch": 0.40291766483775965, + "grad_norm": 1.3365458250045776, + "learning_rate": 5.996610987987548e-05, + "loss": 0.36391315460205076, + "step": 93850 + }, + { + "epoch": 0.40296059692777964, + "grad_norm": 6.127025604248047, + "learning_rate": 5.9961798159757854e-05, + "loss": 0.1571225643157959, + "step": 93860 + }, + { + "epoch": 0.4030035290177996, + "grad_norm": 0.01666446588933468, + "learning_rate": 5.995748643964023e-05, + "loss": 0.18446390628814696, + "step": 93870 + }, + { + "epoch": 0.40304646110781966, + "grad_norm": 0.014780756086111069, + "learning_rate": 5.995317471952261e-05, + "loss": 0.1700269341468811, + "step": 93880 + }, + { + "epoch": 0.40308939319783965, + "grad_norm": 0.3110774755477905, + "learning_rate": 5.9948862999404987e-05, + "loss": 0.2906758785247803, + "step": 93890 + }, + { + "epoch": 0.40313232528785964, + "grad_norm": 0.0019415556453168392, + "learning_rate": 5.9944551279287364e-05, + "loss": 0.11864016056060792, + "step": 93900 + }, + { + "epoch": 0.4031752573778797, + "grad_norm": 0.12294873595237732, + "learning_rate": 5.994023955916974e-05, + "loss": 0.2842595100402832, + "step": 93910 + }, + { + "epoch": 0.40321818946789967, + "grad_norm": 1.0812486410140991, + "learning_rate": 5.993592783905212e-05, + "loss": 0.3086820602416992, + "step": 93920 + }, + { + "epoch": 0.40326112155791966, + "grad_norm": 0.0038773079868406057, + "learning_rate": 5.993161611893449e-05, + "loss": 0.18593962192535402, + "step": 93930 + }, + { + "epoch": 0.4033040536479397, + "grad_norm": 1.0432664155960083, + "learning_rate": 5.9927304398816866e-05, + "loss": 0.37398381233215333, + "step": 93940 + }, + { + "epoch": 0.4033469857379597, + "grad_norm": 2.64205002784729, + "learning_rate": 5.9922992678699244e-05, + "loss": 0.2709501266479492, + "step": 93950 + }, + { + "epoch": 0.4033899178279797, + "grad_norm": 1.6943469047546387, + "learning_rate": 5.991868095858162e-05, + "loss": 0.15391974449157714, + "step": 93960 + }, + { + "epoch": 0.4034328499179997, + "grad_norm": 0.0060857306234538555, + "learning_rate": 5.9914369238464e-05, + "loss": 0.17449193000793456, + "step": 93970 + }, + { + "epoch": 0.4034757820080197, + "grad_norm": 0.9577922224998474, + "learning_rate": 5.9910057518346376e-05, + "loss": 0.057951831817626955, + "step": 93980 + }, + { + "epoch": 0.40351871409803974, + "grad_norm": 1.1919249296188354, + "learning_rate": 5.990574579822875e-05, + "loss": 0.16909236907958985, + "step": 93990 + }, + { + "epoch": 0.40356164618805973, + "grad_norm": 4.153521537780762, + "learning_rate": 5.990143407811113e-05, + "loss": 0.43216743469238283, + "step": 94000 + }, + { + "epoch": 0.40356164618805973, + "eval_loss": 0.4400307834148407, + "eval_runtime": 27.1541, + "eval_samples_per_second": 3.683, + "eval_steps_per_second": 3.683, + "step": 94000 + }, + { + "epoch": 0.4036045782780797, + "grad_norm": 0.017568619921803474, + "learning_rate": 5.9897122357993494e-05, + "loss": 0.2042163848876953, + "step": 94010 + }, + { + "epoch": 0.40364751036809976, + "grad_norm": 0.0071970331482589245, + "learning_rate": 5.989281063787587e-05, + "loss": 0.17381359338760377, + "step": 94020 + }, + { + "epoch": 0.40369044245811975, + "grad_norm": 1.0322858095169067, + "learning_rate": 5.988849891775825e-05, + "loss": 0.21790926456451415, + "step": 94030 + }, + { + "epoch": 0.40373337454813973, + "grad_norm": 1.535051941871643, + "learning_rate": 5.9884187197640626e-05, + "loss": 0.3981826066970825, + "step": 94040 + }, + { + "epoch": 0.4037763066381598, + "grad_norm": 0.05718700960278511, + "learning_rate": 5.9879875477523004e-05, + "loss": 0.26243913173675537, + "step": 94050 + }, + { + "epoch": 0.40381923872817976, + "grad_norm": 0.01205611601471901, + "learning_rate": 5.987556375740538e-05, + "loss": 0.16048460006713866, + "step": 94060 + }, + { + "epoch": 0.40386217081819975, + "grad_norm": 1.6763887405395508, + "learning_rate": 5.987125203728776e-05, + "loss": 0.13859224319458008, + "step": 94070 + }, + { + "epoch": 0.4039051029082198, + "grad_norm": 0.006425794214010239, + "learning_rate": 5.986694031717014e-05, + "loss": 0.1734892249107361, + "step": 94080 + }, + { + "epoch": 0.4039480349982398, + "grad_norm": 0.09611303359270096, + "learning_rate": 5.9862628597052506e-05, + "loss": 0.2162872314453125, + "step": 94090 + }, + { + "epoch": 0.40399096708825977, + "grad_norm": 17.159536361694336, + "learning_rate": 5.9858316876934884e-05, + "loss": 0.13703900575637817, + "step": 94100 + }, + { + "epoch": 0.4040338991782798, + "grad_norm": 1.338617205619812, + "learning_rate": 5.985400515681726e-05, + "loss": 0.4566403865814209, + "step": 94110 + }, + { + "epoch": 0.4040768312682998, + "grad_norm": 0.023423107340931892, + "learning_rate": 5.984969343669964e-05, + "loss": 0.22699213027954102, + "step": 94120 + }, + { + "epoch": 0.4041197633583198, + "grad_norm": 2.107382297515869, + "learning_rate": 5.9845381716582016e-05, + "loss": 0.3840765953063965, + "step": 94130 + }, + { + "epoch": 0.4041626954483398, + "grad_norm": 2.3195486068725586, + "learning_rate": 5.984106999646439e-05, + "loss": 0.19965558052062987, + "step": 94140 + }, + { + "epoch": 0.4042056275383598, + "grad_norm": 0.16824714839458466, + "learning_rate": 5.983675827634677e-05, + "loss": 0.08387279510498047, + "step": 94150 + }, + { + "epoch": 0.40424855962837986, + "grad_norm": 0.07217488437891006, + "learning_rate": 5.983244655622915e-05, + "loss": 0.23426990509033202, + "step": 94160 + }, + { + "epoch": 0.40429149171839984, + "grad_norm": 0.016545869410037994, + "learning_rate": 5.982813483611152e-05, + "loss": 0.12855671644210814, + "step": 94170 + }, + { + "epoch": 0.40433442380841983, + "grad_norm": 0.38350731134414673, + "learning_rate": 5.9823823115993896e-05, + "loss": 0.29489014148712156, + "step": 94180 + }, + { + "epoch": 0.40437735589843987, + "grad_norm": 2.4663097858428955, + "learning_rate": 5.981951139587627e-05, + "loss": 0.30193476676940917, + "step": 94190 + }, + { + "epoch": 0.40442028798845986, + "grad_norm": 0.07116344571113586, + "learning_rate": 5.981519967575865e-05, + "loss": 0.25567481517791746, + "step": 94200 + }, + { + "epoch": 0.40446322007847985, + "grad_norm": 0.08018473535776138, + "learning_rate": 5.981088795564103e-05, + "loss": 0.23793158531188965, + "step": 94210 + }, + { + "epoch": 0.4045061521684999, + "grad_norm": 6.177248477935791, + "learning_rate": 5.9806576235523405e-05, + "loss": 0.14144766330718994, + "step": 94220 + }, + { + "epoch": 0.4045490842585199, + "grad_norm": 0.1142013818025589, + "learning_rate": 5.980226451540578e-05, + "loss": 0.14134806394577026, + "step": 94230 + }, + { + "epoch": 0.40459201634853986, + "grad_norm": 1.9092756509780884, + "learning_rate": 5.979795279528816e-05, + "loss": 0.1723836898803711, + "step": 94240 + }, + { + "epoch": 0.4046349484385599, + "grad_norm": 0.13086585700511932, + "learning_rate": 5.979364107517054e-05, + "loss": 0.25963706970214845, + "step": 94250 + }, + { + "epoch": 0.4046778805285799, + "grad_norm": 0.13701823353767395, + "learning_rate": 5.97893293550529e-05, + "loss": 0.2061779022216797, + "step": 94260 + }, + { + "epoch": 0.4047208126185999, + "grad_norm": 0.02567523717880249, + "learning_rate": 5.978501763493528e-05, + "loss": 0.09171283841133118, + "step": 94270 + }, + { + "epoch": 0.4047637447086199, + "grad_norm": 0.053751952946186066, + "learning_rate": 5.9780705914817655e-05, + "loss": 0.38718242645263673, + "step": 94280 + }, + { + "epoch": 0.4048066767986399, + "grad_norm": 0.13871848583221436, + "learning_rate": 5.977639419470003e-05, + "loss": 0.1819378614425659, + "step": 94290 + }, + { + "epoch": 0.4048496088886599, + "grad_norm": 0.02344353497028351, + "learning_rate": 5.977208247458242e-05, + "loss": 0.15964871644973755, + "step": 94300 + }, + { + "epoch": 0.40489254097867994, + "grad_norm": 0.005774588789790869, + "learning_rate": 5.9767770754464794e-05, + "loss": 0.10087209939956665, + "step": 94310 + }, + { + "epoch": 0.4049354730686999, + "grad_norm": 0.001752114505507052, + "learning_rate": 5.976345903434717e-05, + "loss": 0.11794418096542358, + "step": 94320 + }, + { + "epoch": 0.4049784051587199, + "grad_norm": 0.23419494926929474, + "learning_rate": 5.975914731422955e-05, + "loss": 0.17973036766052247, + "step": 94330 + }, + { + "epoch": 0.40502133724873995, + "grad_norm": 0.2058352679014206, + "learning_rate": 5.975483559411191e-05, + "loss": 0.29049394130706785, + "step": 94340 + }, + { + "epoch": 0.40506426933875994, + "grad_norm": 5.32999324798584, + "learning_rate": 5.975052387399429e-05, + "loss": 0.29587693214416505, + "step": 94350 + }, + { + "epoch": 0.40510720142877993, + "grad_norm": 0.037094105035066605, + "learning_rate": 5.974621215387667e-05, + "loss": 0.17742139101028442, + "step": 94360 + }, + { + "epoch": 0.40515013351879997, + "grad_norm": 0.015149621292948723, + "learning_rate": 5.9741900433759045e-05, + "loss": 0.19037646055221558, + "step": 94370 + }, + { + "epoch": 0.40519306560881996, + "grad_norm": 7.238763809204102, + "learning_rate": 5.973758871364142e-05, + "loss": 0.16131314039230346, + "step": 94380 + }, + { + "epoch": 0.40523599769884, + "grad_norm": 0.09510552138090134, + "learning_rate": 5.97332769935238e-05, + "loss": 0.10398712158203124, + "step": 94390 + }, + { + "epoch": 0.40527892978886, + "grad_norm": 0.030364129692316055, + "learning_rate": 5.972896527340618e-05, + "loss": 0.0476971834897995, + "step": 94400 + }, + { + "epoch": 0.40532186187888, + "grad_norm": 0.006177715957164764, + "learning_rate": 5.9724653553288554e-05, + "loss": 0.1935707688331604, + "step": 94410 + }, + { + "epoch": 0.4053647939689, + "grad_norm": 0.022529790177941322, + "learning_rate": 5.9720341833170925e-05, + "loss": 0.18455986976623534, + "step": 94420 + }, + { + "epoch": 0.40540772605892, + "grad_norm": 0.0013470775447785854, + "learning_rate": 5.97160301130533e-05, + "loss": 0.13666459321975707, + "step": 94430 + }, + { + "epoch": 0.40545065814894, + "grad_norm": 0.05355757102370262, + "learning_rate": 5.971171839293568e-05, + "loss": 0.12402044534683228, + "step": 94440 + }, + { + "epoch": 0.40549359023896003, + "grad_norm": 3.122445821762085, + "learning_rate": 5.970740667281806e-05, + "loss": 0.3857335090637207, + "step": 94450 + }, + { + "epoch": 0.40553652232898, + "grad_norm": 0.004708373919129372, + "learning_rate": 5.9703094952700434e-05, + "loss": 0.19442360401153563, + "step": 94460 + }, + { + "epoch": 0.405579454419, + "grad_norm": 0.008243762888014317, + "learning_rate": 5.969878323258281e-05, + "loss": 0.2473069190979004, + "step": 94470 + }, + { + "epoch": 0.40562238650902005, + "grad_norm": 0.0011318456381559372, + "learning_rate": 5.969447151246519e-05, + "loss": 0.04169896841049194, + "step": 94480 + }, + { + "epoch": 0.40566531859904004, + "grad_norm": 3.41719388961792, + "learning_rate": 5.9690159792347566e-05, + "loss": 0.12270849943161011, + "step": 94490 + }, + { + "epoch": 0.40570825068906, + "grad_norm": 0.03121386095881462, + "learning_rate": 5.968584807222993e-05, + "loss": 0.23815858364105225, + "step": 94500 + }, + { + "epoch": 0.40575118277908007, + "grad_norm": 22.28656768798828, + "learning_rate": 5.968153635211231e-05, + "loss": 0.2772752523422241, + "step": 94510 + }, + { + "epoch": 0.40579411486910005, + "grad_norm": 3.8086886405944824, + "learning_rate": 5.967722463199469e-05, + "loss": 0.2415098190307617, + "step": 94520 + }, + { + "epoch": 0.40583704695912004, + "grad_norm": 0.04217388480901718, + "learning_rate": 5.967291291187707e-05, + "loss": 0.17928813695907592, + "step": 94530 + }, + { + "epoch": 0.4058799790491401, + "grad_norm": 2.3201487064361572, + "learning_rate": 5.9668601191759446e-05, + "loss": 0.232806396484375, + "step": 94540 + }, + { + "epoch": 0.40592291113916007, + "grad_norm": 25.63894271850586, + "learning_rate": 5.966428947164182e-05, + "loss": 0.29534387588500977, + "step": 94550 + }, + { + "epoch": 0.40596584322918006, + "grad_norm": 0.24793045222759247, + "learning_rate": 5.96599777515242e-05, + "loss": 0.18396825790405275, + "step": 94560 + }, + { + "epoch": 0.4060087753192001, + "grad_norm": 0.7542315721511841, + "learning_rate": 5.965566603140658e-05, + "loss": 0.2694231510162354, + "step": 94570 + }, + { + "epoch": 0.4060517074092201, + "grad_norm": 1.3585526943206787, + "learning_rate": 5.9651354311288955e-05, + "loss": 0.30352447032928465, + "step": 94580 + }, + { + "epoch": 0.40609463949924013, + "grad_norm": 5.630841255187988, + "learning_rate": 5.964704259117132e-05, + "loss": 0.2450554370880127, + "step": 94590 + }, + { + "epoch": 0.4061375715892601, + "grad_norm": 0.04023003205657005, + "learning_rate": 5.9642730871053697e-05, + "loss": 0.11513957977294922, + "step": 94600 + }, + { + "epoch": 0.4061805036792801, + "grad_norm": 0.9769871830940247, + "learning_rate": 5.9638419150936074e-05, + "loss": 0.08849529027938843, + "step": 94610 + }, + { + "epoch": 0.40622343576930015, + "grad_norm": 0.661646842956543, + "learning_rate": 5.963410743081845e-05, + "loss": 0.4164388656616211, + "step": 94620 + }, + { + "epoch": 0.40626636785932013, + "grad_norm": 9.560547828674316, + "learning_rate": 5.962979571070083e-05, + "loss": 0.2010348320007324, + "step": 94630 + }, + { + "epoch": 0.4063092999493401, + "grad_norm": 2.1238622665405273, + "learning_rate": 5.9625483990583206e-05, + "loss": 0.07822906374931335, + "step": 94640 + }, + { + "epoch": 0.40635223203936016, + "grad_norm": 0.9077053070068359, + "learning_rate": 5.962117227046558e-05, + "loss": 0.40604825019836427, + "step": 94650 + }, + { + "epoch": 0.40639516412938015, + "grad_norm": 0.06327524781227112, + "learning_rate": 5.961686055034796e-05, + "loss": 0.09599577188491822, + "step": 94660 + }, + { + "epoch": 0.40643809621940014, + "grad_norm": 0.869484007358551, + "learning_rate": 5.961254883023033e-05, + "loss": 0.14397214651107787, + "step": 94670 + }, + { + "epoch": 0.4064810283094202, + "grad_norm": 0.007432250771671534, + "learning_rate": 5.960823711011271e-05, + "loss": 0.24939417839050293, + "step": 94680 + }, + { + "epoch": 0.40652396039944017, + "grad_norm": 1.876278042793274, + "learning_rate": 5.9603925389995086e-05, + "loss": 0.10063667297363281, + "step": 94690 + }, + { + "epoch": 0.40656689248946015, + "grad_norm": 1.803780198097229, + "learning_rate": 5.959961366987746e-05, + "loss": 0.28752832412719725, + "step": 94700 + }, + { + "epoch": 0.4066098245794802, + "grad_norm": 0.41878649592399597, + "learning_rate": 5.959530194975984e-05, + "loss": 0.14709892272949218, + "step": 94710 + }, + { + "epoch": 0.4066527566695002, + "grad_norm": 0.00410389993339777, + "learning_rate": 5.959099022964222e-05, + "loss": 0.0757517397403717, + "step": 94720 + }, + { + "epoch": 0.40669568875952017, + "grad_norm": 0.4839799106121063, + "learning_rate": 5.9586678509524595e-05, + "loss": 0.21969540119171144, + "step": 94730 + }, + { + "epoch": 0.4067386208495402, + "grad_norm": 0.0009927289793267846, + "learning_rate": 5.958236678940697e-05, + "loss": 0.08641666769981385, + "step": 94740 + }, + { + "epoch": 0.4067815529395602, + "grad_norm": 0.07800525426864624, + "learning_rate": 5.957805506928934e-05, + "loss": 0.21268873214721679, + "step": 94750 + }, + { + "epoch": 0.4068244850295802, + "grad_norm": 0.2969856560230255, + "learning_rate": 5.957374334917172e-05, + "loss": 0.20551798343658448, + "step": 94760 + }, + { + "epoch": 0.4068674171196002, + "grad_norm": 3.2476003170013428, + "learning_rate": 5.95694316290541e-05, + "loss": 0.49648799896240237, + "step": 94770 + }, + { + "epoch": 0.4069103492096202, + "grad_norm": 0.14998094737529755, + "learning_rate": 5.9565119908936475e-05, + "loss": 0.10299062728881836, + "step": 94780 + }, + { + "epoch": 0.4069532812996402, + "grad_norm": 0.001216806354932487, + "learning_rate": 5.956080818881885e-05, + "loss": 0.3082596778869629, + "step": 94790 + }, + { + "epoch": 0.40699621338966024, + "grad_norm": 0.010087787173688412, + "learning_rate": 5.955649646870123e-05, + "loss": 0.2633021593093872, + "step": 94800 + }, + { + "epoch": 0.40703914547968023, + "grad_norm": 0.008466735482215881, + "learning_rate": 5.955218474858361e-05, + "loss": 0.22031636238098146, + "step": 94810 + }, + { + "epoch": 0.4070820775697003, + "grad_norm": 2.877070903778076, + "learning_rate": 5.9547873028465985e-05, + "loss": 0.21617393493652343, + "step": 94820 + }, + { + "epoch": 0.40712500965972026, + "grad_norm": 0.013431715779006481, + "learning_rate": 5.954356130834835e-05, + "loss": 0.39271633625030516, + "step": 94830 + }, + { + "epoch": 0.40716794174974025, + "grad_norm": 0.016712641343474388, + "learning_rate": 5.9539249588230726e-05, + "loss": 0.055677926540374754, + "step": 94840 + }, + { + "epoch": 0.4072108738397603, + "grad_norm": 0.01818036660552025, + "learning_rate": 5.95349378681131e-05, + "loss": 0.21650080680847167, + "step": 94850 + }, + { + "epoch": 0.4072538059297803, + "grad_norm": 0.501991868019104, + "learning_rate": 5.953062614799548e-05, + "loss": 0.19998964071273803, + "step": 94860 + }, + { + "epoch": 0.40729673801980026, + "grad_norm": 0.8588781356811523, + "learning_rate": 5.952631442787786e-05, + "loss": 0.36342079639434816, + "step": 94870 + }, + { + "epoch": 0.4073396701098203, + "grad_norm": 0.020705191418528557, + "learning_rate": 5.9522002707760235e-05, + "loss": 0.1025011658668518, + "step": 94880 + }, + { + "epoch": 0.4073826021998403, + "grad_norm": 0.8543851971626282, + "learning_rate": 5.951769098764262e-05, + "loss": 0.2673808574676514, + "step": 94890 + }, + { + "epoch": 0.4074255342898603, + "grad_norm": 1.015897512435913, + "learning_rate": 5.9513379267524997e-05, + "loss": 0.375284481048584, + "step": 94900 + }, + { + "epoch": 0.4074684663798803, + "grad_norm": 3.083343982696533, + "learning_rate": 5.950906754740736e-05, + "loss": 0.23698792457580567, + "step": 94910 + }, + { + "epoch": 0.4075113984699003, + "grad_norm": 0.2520955801010132, + "learning_rate": 5.950475582728974e-05, + "loss": 0.29218809604644774, + "step": 94920 + }, + { + "epoch": 0.4075543305599203, + "grad_norm": 1.5698250532150269, + "learning_rate": 5.9500444107172115e-05, + "loss": 0.17973952293395995, + "step": 94930 + }, + { + "epoch": 0.40759726264994034, + "grad_norm": 1.5335873365402222, + "learning_rate": 5.949613238705449e-05, + "loss": 0.10807563066482544, + "step": 94940 + }, + { + "epoch": 0.4076401947399603, + "grad_norm": 0.9593386054039001, + "learning_rate": 5.949182066693687e-05, + "loss": 0.04367157220840454, + "step": 94950 + }, + { + "epoch": 0.4076831268299803, + "grad_norm": 0.003926532808691263, + "learning_rate": 5.948750894681925e-05, + "loss": 0.19878900051116943, + "step": 94960 + }, + { + "epoch": 0.40772605892000036, + "grad_norm": 0.010859617032110691, + "learning_rate": 5.9483197226701624e-05, + "loss": 0.2264415979385376, + "step": 94970 + }, + { + "epoch": 0.40776899101002034, + "grad_norm": 0.06168222054839134, + "learning_rate": 5.9478885506584e-05, + "loss": 0.1437380075454712, + "step": 94980 + }, + { + "epoch": 0.40781192310004033, + "grad_norm": 0.030141912400722504, + "learning_rate": 5.947457378646638e-05, + "loss": 0.3269296407699585, + "step": 94990 + }, + { + "epoch": 0.4078548551900604, + "grad_norm": 2.8415722846984863, + "learning_rate": 5.947026206634875e-05, + "loss": 0.16797289848327637, + "step": 95000 + }, + { + "epoch": 0.4078548551900604, + "eval_loss": 0.42135003209114075, + "eval_runtime": 27.162, + "eval_samples_per_second": 3.682, + "eval_steps_per_second": 3.682, + "step": 95000 + }, + { + "epoch": 0.40789778728008036, + "grad_norm": 0.0548337884247303, + "learning_rate": 5.946595034623113e-05, + "loss": 0.1138761043548584, + "step": 95010 + }, + { + "epoch": 0.4079407193701004, + "grad_norm": 0.05829789489507675, + "learning_rate": 5.9461638626113504e-05, + "loss": 0.22611532211303711, + "step": 95020 + }, + { + "epoch": 0.4079836514601204, + "grad_norm": 8.076669692993164, + "learning_rate": 5.945732690599588e-05, + "loss": 0.2649090766906738, + "step": 95030 + }, + { + "epoch": 0.4080265835501404, + "grad_norm": 1.6947511434555054, + "learning_rate": 5.945301518587826e-05, + "loss": 0.14479960203170777, + "step": 95040 + }, + { + "epoch": 0.4080695156401604, + "grad_norm": 1.4850653409957886, + "learning_rate": 5.9448703465760636e-05, + "loss": 0.5386390209197998, + "step": 95050 + }, + { + "epoch": 0.4081124477301804, + "grad_norm": 1.1699714660644531, + "learning_rate": 5.9444391745643014e-05, + "loss": 0.3699991226196289, + "step": 95060 + }, + { + "epoch": 0.4081553798202004, + "grad_norm": 0.0662231370806694, + "learning_rate": 5.944008002552539e-05, + "loss": 0.20802528858184816, + "step": 95070 + }, + { + "epoch": 0.40819831191022043, + "grad_norm": 5.796939373016357, + "learning_rate": 5.9435768305407755e-05, + "loss": 0.26230273246765134, + "step": 95080 + }, + { + "epoch": 0.4082412440002404, + "grad_norm": 0.015105532482266426, + "learning_rate": 5.943145658529013e-05, + "loss": 0.18045105934143066, + "step": 95090 + }, + { + "epoch": 0.4082841760902604, + "grad_norm": 0.05598805844783783, + "learning_rate": 5.942714486517251e-05, + "loss": 0.10713293552398681, + "step": 95100 + }, + { + "epoch": 0.40832710818028045, + "grad_norm": 0.004577355924993753, + "learning_rate": 5.9422833145054894e-05, + "loss": 0.2067034959793091, + "step": 95110 + }, + { + "epoch": 0.40837004027030044, + "grad_norm": 0.0111719761043787, + "learning_rate": 5.941852142493727e-05, + "loss": 0.1303783416748047, + "step": 95120 + }, + { + "epoch": 0.4084129723603204, + "grad_norm": 2.2829782962799072, + "learning_rate": 5.941420970481965e-05, + "loss": 0.15735230445861817, + "step": 95130 + }, + { + "epoch": 0.40845590445034047, + "grad_norm": 0.0009828386828303337, + "learning_rate": 5.9409897984702026e-05, + "loss": 0.34636309146881106, + "step": 95140 + }, + { + "epoch": 0.40849883654036045, + "grad_norm": 0.04014360159635544, + "learning_rate": 5.94055862645844e-05, + "loss": 0.41129145622253416, + "step": 95150 + }, + { + "epoch": 0.40854176863038044, + "grad_norm": 2.1843440532684326, + "learning_rate": 5.940127454446677e-05, + "loss": 0.25074067115783694, + "step": 95160 + }, + { + "epoch": 0.4085847007204005, + "grad_norm": 0.015182511880993843, + "learning_rate": 5.9396962824349144e-05, + "loss": 0.18507274389266967, + "step": 95170 + }, + { + "epoch": 0.40862763281042047, + "grad_norm": 3.183384418487549, + "learning_rate": 5.939265110423152e-05, + "loss": 0.25386662483215333, + "step": 95180 + }, + { + "epoch": 0.40867056490044046, + "grad_norm": 9.70214557647705, + "learning_rate": 5.93883393841139e-05, + "loss": 0.29502627849578855, + "step": 95190 + }, + { + "epoch": 0.4087134969904605, + "grad_norm": 2.1986756324768066, + "learning_rate": 5.9384027663996276e-05, + "loss": 0.4716000556945801, + "step": 95200 + }, + { + "epoch": 0.4087564290804805, + "grad_norm": 2.2466917037963867, + "learning_rate": 5.9379715943878653e-05, + "loss": 0.3876519203186035, + "step": 95210 + }, + { + "epoch": 0.4087993611705005, + "grad_norm": 1.9093042612075806, + "learning_rate": 5.937540422376103e-05, + "loss": 0.3541229009628296, + "step": 95220 + }, + { + "epoch": 0.4088422932605205, + "grad_norm": 0.0312630720436573, + "learning_rate": 5.937109250364341e-05, + "loss": 0.2550151824951172, + "step": 95230 + }, + { + "epoch": 0.4088852253505405, + "grad_norm": 0.13463670015335083, + "learning_rate": 5.936678078352578e-05, + "loss": 0.1566945195198059, + "step": 95240 + }, + { + "epoch": 0.40892815744056055, + "grad_norm": 0.04153773561120033, + "learning_rate": 5.9362469063408156e-05, + "loss": 0.1386013150215149, + "step": 95250 + }, + { + "epoch": 0.40897108953058053, + "grad_norm": 1.534032940864563, + "learning_rate": 5.9358157343290533e-05, + "loss": 0.13509927988052367, + "step": 95260 + }, + { + "epoch": 0.4090140216206005, + "grad_norm": 0.010034897364675999, + "learning_rate": 5.935384562317291e-05, + "loss": 0.2785799503326416, + "step": 95270 + }, + { + "epoch": 0.40905695371062056, + "grad_norm": 0.5196402668952942, + "learning_rate": 5.934953390305529e-05, + "loss": 0.2804682731628418, + "step": 95280 + }, + { + "epoch": 0.40909988580064055, + "grad_norm": 0.3092677593231201, + "learning_rate": 5.9345222182937665e-05, + "loss": 0.14761451482772828, + "step": 95290 + }, + { + "epoch": 0.40914281789066054, + "grad_norm": 0.0070295692421495914, + "learning_rate": 5.934091046282004e-05, + "loss": 0.09876445531845093, + "step": 95300 + }, + { + "epoch": 0.4091857499806806, + "grad_norm": 0.0008774647722020745, + "learning_rate": 5.933659874270242e-05, + "loss": 0.04889726340770721, + "step": 95310 + }, + { + "epoch": 0.40922868207070057, + "grad_norm": 0.04171142727136612, + "learning_rate": 5.93322870225848e-05, + "loss": 0.06542414426803589, + "step": 95320 + }, + { + "epoch": 0.40927161416072055, + "grad_norm": 0.9298404455184937, + "learning_rate": 5.932797530246717e-05, + "loss": 0.2098308563232422, + "step": 95330 + }, + { + "epoch": 0.4093145462507406, + "grad_norm": 6.4145660400390625, + "learning_rate": 5.9323663582349545e-05, + "loss": 0.40647087097167967, + "step": 95340 + }, + { + "epoch": 0.4093574783407606, + "grad_norm": 0.0009694885229691863, + "learning_rate": 5.931935186223192e-05, + "loss": 0.2989876508712769, + "step": 95350 + }, + { + "epoch": 0.40940041043078057, + "grad_norm": 0.12345188856124878, + "learning_rate": 5.93150401421143e-05, + "loss": 0.2628939628601074, + "step": 95360 + }, + { + "epoch": 0.4094433425208006, + "grad_norm": 1.712976336479187, + "learning_rate": 5.931072842199668e-05, + "loss": 0.2650137424468994, + "step": 95370 + }, + { + "epoch": 0.4094862746108206, + "grad_norm": 1.8961756229400635, + "learning_rate": 5.9306416701879055e-05, + "loss": 0.3032198905944824, + "step": 95380 + }, + { + "epoch": 0.4095292067008406, + "grad_norm": 4.40653657913208, + "learning_rate": 5.930210498176143e-05, + "loss": 0.1689983367919922, + "step": 95390 + }, + { + "epoch": 0.40957213879086063, + "grad_norm": 0.022152697667479515, + "learning_rate": 5.929779326164381e-05, + "loss": 0.13820401430130005, + "step": 95400 + }, + { + "epoch": 0.4096150708808806, + "grad_norm": 2.1956160068511963, + "learning_rate": 5.929348154152617e-05, + "loss": 0.19749439954757692, + "step": 95410 + }, + { + "epoch": 0.4096580029709006, + "grad_norm": 0.05495762452483177, + "learning_rate": 5.928916982140855e-05, + "loss": 0.35087130069732664, + "step": 95420 + }, + { + "epoch": 0.40970093506092065, + "grad_norm": 1.406990647315979, + "learning_rate": 5.928485810129093e-05, + "loss": 0.28234546184539794, + "step": 95430 + }, + { + "epoch": 0.40974386715094063, + "grad_norm": 1.1641638278961182, + "learning_rate": 5.9280546381173305e-05, + "loss": 0.3042471885681152, + "step": 95440 + }, + { + "epoch": 0.4097867992409607, + "grad_norm": 1.1633776426315308, + "learning_rate": 5.927623466105568e-05, + "loss": 0.2691806793212891, + "step": 95450 + }, + { + "epoch": 0.40982973133098066, + "grad_norm": 0.018605902791023254, + "learning_rate": 5.927192294093806e-05, + "loss": 0.0012621838599443437, + "step": 95460 + }, + { + "epoch": 0.40987266342100065, + "grad_norm": 0.018871862441301346, + "learning_rate": 5.926761122082044e-05, + "loss": 0.30165996551513674, + "step": 95470 + }, + { + "epoch": 0.4099155955110207, + "grad_norm": 0.6964154839515686, + "learning_rate": 5.9263299500702815e-05, + "loss": 0.19526594877243042, + "step": 95480 + }, + { + "epoch": 0.4099585276010407, + "grad_norm": 0.030805258080363274, + "learning_rate": 5.9258987780585185e-05, + "loss": 0.39353554248809813, + "step": 95490 + }, + { + "epoch": 0.41000145969106067, + "grad_norm": 2.8309528827667236, + "learning_rate": 5.925467606046756e-05, + "loss": 0.21136395931243895, + "step": 95500 + }, + { + "epoch": 0.4100443917810807, + "grad_norm": 0.00797537062317133, + "learning_rate": 5.925036434034994e-05, + "loss": 0.39774174690246583, + "step": 95510 + }, + { + "epoch": 0.4100873238711007, + "grad_norm": 0.0260153915733099, + "learning_rate": 5.924605262023232e-05, + "loss": 0.05480254292488098, + "step": 95520 + }, + { + "epoch": 0.4101302559611207, + "grad_norm": 0.008767174556851387, + "learning_rate": 5.9241740900114695e-05, + "loss": 0.16443458795547486, + "step": 95530 + }, + { + "epoch": 0.4101731880511407, + "grad_norm": 1.7948821783065796, + "learning_rate": 5.923742917999707e-05, + "loss": 0.3718928337097168, + "step": 95540 + }, + { + "epoch": 0.4102161201411607, + "grad_norm": 0.07295206189155579, + "learning_rate": 5.923311745987945e-05, + "loss": 0.15985893011093139, + "step": 95550 + }, + { + "epoch": 0.4102590522311807, + "grad_norm": 2.3034417629241943, + "learning_rate": 5.9228805739761827e-05, + "loss": 0.3065182685852051, + "step": 95560 + }, + { + "epoch": 0.41030198432120074, + "grad_norm": 0.009524974972009659, + "learning_rate": 5.92244940196442e-05, + "loss": 0.21257288455963136, + "step": 95570 + }, + { + "epoch": 0.41034491641122073, + "grad_norm": 0.3217892348766327, + "learning_rate": 5.9220182299526575e-05, + "loss": 0.07395639419555664, + "step": 95580 + }, + { + "epoch": 0.4103878485012407, + "grad_norm": 0.016713928431272507, + "learning_rate": 5.921587057940895e-05, + "loss": 0.1688373327255249, + "step": 95590 + }, + { + "epoch": 0.41043078059126076, + "grad_norm": 1.5730935335159302, + "learning_rate": 5.921155885929133e-05, + "loss": 0.35242772102355957, + "step": 95600 + }, + { + "epoch": 0.41047371268128074, + "grad_norm": 0.1674475222826004, + "learning_rate": 5.9207247139173707e-05, + "loss": 0.1739656448364258, + "step": 95610 + }, + { + "epoch": 0.41051664477130073, + "grad_norm": 0.7928306460380554, + "learning_rate": 5.9202935419056084e-05, + "loss": 0.23594801425933837, + "step": 95620 + }, + { + "epoch": 0.4105595768613208, + "grad_norm": 2.0737826824188232, + "learning_rate": 5.919862369893846e-05, + "loss": 0.21393814086914062, + "step": 95630 + }, + { + "epoch": 0.41060250895134076, + "grad_norm": 0.039834145456552505, + "learning_rate": 5.919431197882084e-05, + "loss": 0.24037718772888184, + "step": 95640 + }, + { + "epoch": 0.41064544104136075, + "grad_norm": 0.0070899007841944695, + "learning_rate": 5.9190000258703216e-05, + "loss": 0.11014029979705811, + "step": 95650 + }, + { + "epoch": 0.4106883731313808, + "grad_norm": 0.39396798610687256, + "learning_rate": 5.918568853858558e-05, + "loss": 0.21011085510253907, + "step": 95660 + }, + { + "epoch": 0.4107313052214008, + "grad_norm": 1.3160746097564697, + "learning_rate": 5.918137681846796e-05, + "loss": 0.27565855979919435, + "step": 95670 + }, + { + "epoch": 0.4107742373114208, + "grad_norm": 2.246863603591919, + "learning_rate": 5.9177065098350334e-05, + "loss": 0.4046793937683105, + "step": 95680 + }, + { + "epoch": 0.4108171694014408, + "grad_norm": 2.2735283374786377, + "learning_rate": 5.917275337823271e-05, + "loss": 0.08375327587127686, + "step": 95690 + }, + { + "epoch": 0.4108601014914608, + "grad_norm": 0.06356479972600937, + "learning_rate": 5.916844165811509e-05, + "loss": 0.12829431295394897, + "step": 95700 + }, + { + "epoch": 0.41090303358148084, + "grad_norm": 0.8090630173683167, + "learning_rate": 5.916412993799747e-05, + "loss": 0.20278749465942383, + "step": 95710 + }, + { + "epoch": 0.4109459656715008, + "grad_norm": 3.7976224422454834, + "learning_rate": 5.915981821787985e-05, + "loss": 0.37968668937683103, + "step": 95720 + }, + { + "epoch": 0.4109888977615208, + "grad_norm": 0.013438595458865166, + "learning_rate": 5.915550649776223e-05, + "loss": 0.2663418292999268, + "step": 95730 + }, + { + "epoch": 0.41103182985154085, + "grad_norm": 0.00032999878749251366, + "learning_rate": 5.915119477764459e-05, + "loss": 0.2249680519104004, + "step": 95740 + }, + { + "epoch": 0.41107476194156084, + "grad_norm": 1.3823471069335938, + "learning_rate": 5.914688305752697e-05, + "loss": 0.17618353366851808, + "step": 95750 + }, + { + "epoch": 0.4111176940315808, + "grad_norm": 1.6660122871398926, + "learning_rate": 5.9142571337409346e-05, + "loss": 0.23736302852630614, + "step": 95760 + }, + { + "epoch": 0.41116062612160087, + "grad_norm": 0.014189798384904861, + "learning_rate": 5.9138259617291724e-05, + "loss": 0.209371018409729, + "step": 95770 + }, + { + "epoch": 0.41120355821162086, + "grad_norm": 0.003713384736329317, + "learning_rate": 5.91339478971741e-05, + "loss": 0.1889081835746765, + "step": 95780 + }, + { + "epoch": 0.41124649030164084, + "grad_norm": 0.03438572585582733, + "learning_rate": 5.912963617705648e-05, + "loss": 0.07586662769317627, + "step": 95790 + }, + { + "epoch": 0.4112894223916609, + "grad_norm": 0.8181175589561462, + "learning_rate": 5.9125324456938856e-05, + "loss": 0.276668381690979, + "step": 95800 + }, + { + "epoch": 0.4113323544816809, + "grad_norm": 0.10987686365842819, + "learning_rate": 5.912101273682123e-05, + "loss": 0.01944877505302429, + "step": 95810 + }, + { + "epoch": 0.41137528657170086, + "grad_norm": 0.28580111265182495, + "learning_rate": 5.9116701016703604e-05, + "loss": 0.40301976203918455, + "step": 95820 + }, + { + "epoch": 0.4114182186617209, + "grad_norm": 6.955211162567139, + "learning_rate": 5.911238929658598e-05, + "loss": 0.2764917850494385, + "step": 95830 + }, + { + "epoch": 0.4114611507517409, + "grad_norm": 3.055581569671631, + "learning_rate": 5.910807757646836e-05, + "loss": 0.33856046199798584, + "step": 95840 + }, + { + "epoch": 0.4115040828417609, + "grad_norm": 1.8550695180892944, + "learning_rate": 5.9103765856350736e-05, + "loss": 0.26514904499053954, + "step": 95850 + }, + { + "epoch": 0.4115470149317809, + "grad_norm": 0.23635394871234894, + "learning_rate": 5.909945413623311e-05, + "loss": 0.3311293601989746, + "step": 95860 + }, + { + "epoch": 0.4115899470218009, + "grad_norm": 0.018756048753857613, + "learning_rate": 5.909514241611549e-05, + "loss": 0.1716057538986206, + "step": 95870 + }, + { + "epoch": 0.41163287911182095, + "grad_norm": 0.006415795534849167, + "learning_rate": 5.909083069599787e-05, + "loss": 0.07712118029594421, + "step": 95880 + }, + { + "epoch": 0.41167581120184094, + "grad_norm": 0.05445994809269905, + "learning_rate": 5.9086518975880245e-05, + "loss": 0.2775907516479492, + "step": 95890 + }, + { + "epoch": 0.4117187432918609, + "grad_norm": 0.17175628244876862, + "learning_rate": 5.908220725576261e-05, + "loss": 0.2930448055267334, + "step": 95900 + }, + { + "epoch": 0.41176167538188096, + "grad_norm": 0.004957165569067001, + "learning_rate": 5.9077895535644986e-05, + "loss": 0.2307873010635376, + "step": 95910 + }, + { + "epoch": 0.41180460747190095, + "grad_norm": 0.0006048490176908672, + "learning_rate": 5.9073583815527364e-05, + "loss": 0.27237510681152344, + "step": 95920 + }, + { + "epoch": 0.41184753956192094, + "grad_norm": 1.6527477502822876, + "learning_rate": 5.906927209540975e-05, + "loss": 0.2211667776107788, + "step": 95930 + }, + { + "epoch": 0.411890471651941, + "grad_norm": 0.8207837343215942, + "learning_rate": 5.9064960375292125e-05, + "loss": 0.18572959899902344, + "step": 95940 + }, + { + "epoch": 0.41193340374196097, + "grad_norm": 2.7235312461853027, + "learning_rate": 5.90606486551745e-05, + "loss": 0.3151681423187256, + "step": 95950 + }, + { + "epoch": 0.41197633583198096, + "grad_norm": 0.0987667366862297, + "learning_rate": 5.905633693505688e-05, + "loss": 0.27903921604156495, + "step": 95960 + }, + { + "epoch": 0.412019267922001, + "grad_norm": 0.03398576378822327, + "learning_rate": 5.905202521493926e-05, + "loss": 0.09625001549720764, + "step": 95970 + }, + { + "epoch": 0.412062200012021, + "grad_norm": 0.6507900357246399, + "learning_rate": 5.904771349482162e-05, + "loss": 0.11882567405700684, + "step": 95980 + }, + { + "epoch": 0.41210513210204097, + "grad_norm": 1.7718161344528198, + "learning_rate": 5.9043401774704e-05, + "loss": 0.259985089302063, + "step": 95990 + }, + { + "epoch": 0.412148064192061, + "grad_norm": 0.06375284492969513, + "learning_rate": 5.9039090054586375e-05, + "loss": 0.3681522846221924, + "step": 96000 + }, + { + "epoch": 0.412148064192061, + "eval_loss": 0.4350675940513611, + "eval_runtime": 27.252, + "eval_samples_per_second": 3.669, + "eval_steps_per_second": 3.669, + "step": 96000 + }, + { + "epoch": 0.412190996282081, + "grad_norm": 22.450407028198242, + "learning_rate": 5.903477833446875e-05, + "loss": 0.2040266752243042, + "step": 96010 + }, + { + "epoch": 0.412233928372101, + "grad_norm": 0.30055972933769226, + "learning_rate": 5.903046661435113e-05, + "loss": 0.14661065340042115, + "step": 96020 + }, + { + "epoch": 0.41227686046212103, + "grad_norm": 0.19807764887809753, + "learning_rate": 5.902615489423351e-05, + "loss": 0.3193605661392212, + "step": 96030 + }, + { + "epoch": 0.412319792552141, + "grad_norm": 0.03959206864237785, + "learning_rate": 5.9021843174115885e-05, + "loss": 0.24810662269592285, + "step": 96040 + }, + { + "epoch": 0.412362724642161, + "grad_norm": 0.11264007538557053, + "learning_rate": 5.901753145399826e-05, + "loss": 0.3553786039352417, + "step": 96050 + }, + { + "epoch": 0.41240565673218105, + "grad_norm": 0.16534768044948578, + "learning_rate": 5.901321973388064e-05, + "loss": 0.13765889406204224, + "step": 96060 + }, + { + "epoch": 0.41244858882220103, + "grad_norm": 3.1089465618133545, + "learning_rate": 5.900890801376301e-05, + "loss": 0.5068204879760743, + "step": 96070 + }, + { + "epoch": 0.412491520912221, + "grad_norm": 0.051101043820381165, + "learning_rate": 5.900459629364539e-05, + "loss": 0.22767856121063232, + "step": 96080 + }, + { + "epoch": 0.41253445300224106, + "grad_norm": 0.2743438482284546, + "learning_rate": 5.9000284573527765e-05, + "loss": 0.0870218575000763, + "step": 96090 + }, + { + "epoch": 0.41257738509226105, + "grad_norm": 1.2850000858306885, + "learning_rate": 5.899597285341014e-05, + "loss": 0.1808406949043274, + "step": 96100 + }, + { + "epoch": 0.4126203171822811, + "grad_norm": 1.991323709487915, + "learning_rate": 5.899166113329252e-05, + "loss": 0.21970710754394532, + "step": 96110 + }, + { + "epoch": 0.4126632492723011, + "grad_norm": 1.307265043258667, + "learning_rate": 5.89873494131749e-05, + "loss": 0.2848405599594116, + "step": 96120 + }, + { + "epoch": 0.41270618136232107, + "grad_norm": 0.2070932537317276, + "learning_rate": 5.8983037693057274e-05, + "loss": 0.25841717720031737, + "step": 96130 + }, + { + "epoch": 0.4127491134523411, + "grad_norm": 0.18773120641708374, + "learning_rate": 5.897872597293965e-05, + "loss": 0.07489589452743531, + "step": 96140 + }, + { + "epoch": 0.4127920455423611, + "grad_norm": 0.01186260674148798, + "learning_rate": 5.897441425282202e-05, + "loss": 0.22005856037139893, + "step": 96150 + }, + { + "epoch": 0.4128349776323811, + "grad_norm": 20.258750915527344, + "learning_rate": 5.89701025327044e-05, + "loss": 0.2249882698059082, + "step": 96160 + }, + { + "epoch": 0.4128779097224011, + "grad_norm": 50.0006217956543, + "learning_rate": 5.896579081258678e-05, + "loss": 0.0812033772468567, + "step": 96170 + }, + { + "epoch": 0.4129208418124211, + "grad_norm": 0.009035931900143623, + "learning_rate": 5.8961479092469154e-05, + "loss": 0.19636874198913573, + "step": 96180 + }, + { + "epoch": 0.4129637739024411, + "grad_norm": 0.030025752261281013, + "learning_rate": 5.895716737235153e-05, + "loss": 0.030920588970184328, + "step": 96190 + }, + { + "epoch": 0.41300670599246114, + "grad_norm": 0.6336904764175415, + "learning_rate": 5.895285565223391e-05, + "loss": 0.26514263153076173, + "step": 96200 + }, + { + "epoch": 0.41304963808248113, + "grad_norm": 0.5668049454689026, + "learning_rate": 5.8948543932116286e-05, + "loss": 0.44312324523925783, + "step": 96210 + }, + { + "epoch": 0.4130925701725011, + "grad_norm": 1.6628981828689575, + "learning_rate": 5.8944232211998663e-05, + "loss": 0.09441558122634888, + "step": 96220 + }, + { + "epoch": 0.41313550226252116, + "grad_norm": 0.16483436524868011, + "learning_rate": 5.893992049188103e-05, + "loss": 0.3925278663635254, + "step": 96230 + }, + { + "epoch": 0.41317843435254115, + "grad_norm": 0.0029956032522022724, + "learning_rate": 5.8935608771763405e-05, + "loss": 0.1006605863571167, + "step": 96240 + }, + { + "epoch": 0.41322136644256113, + "grad_norm": 2.036149263381958, + "learning_rate": 5.893129705164578e-05, + "loss": 0.29947011470794677, + "step": 96250 + }, + { + "epoch": 0.4132642985325812, + "grad_norm": 2.5441179275512695, + "learning_rate": 5.892698533152816e-05, + "loss": 0.49998650550842283, + "step": 96260 + }, + { + "epoch": 0.41330723062260116, + "grad_norm": 0.021060464903712273, + "learning_rate": 5.892267361141054e-05, + "loss": 0.1435585379600525, + "step": 96270 + }, + { + "epoch": 0.41335016271262115, + "grad_norm": 0.011224511079490185, + "learning_rate": 5.8918361891292914e-05, + "loss": 0.29232838153839114, + "step": 96280 + }, + { + "epoch": 0.4133930948026412, + "grad_norm": 5.085150241851807, + "learning_rate": 5.891405017117529e-05, + "loss": 0.3620931625366211, + "step": 96290 + }, + { + "epoch": 0.4134360268926612, + "grad_norm": 1.911069393157959, + "learning_rate": 5.8909738451057675e-05, + "loss": 0.45119342803955076, + "step": 96300 + }, + { + "epoch": 0.4134789589826812, + "grad_norm": 2.3250467777252197, + "learning_rate": 5.890542673094004e-05, + "loss": 0.21939477920532227, + "step": 96310 + }, + { + "epoch": 0.4135218910727012, + "grad_norm": 0.4628746211528778, + "learning_rate": 5.8901115010822417e-05, + "loss": 0.07731515765190125, + "step": 96320 + }, + { + "epoch": 0.4135648231627212, + "grad_norm": 0.017649687826633453, + "learning_rate": 5.8896803290704794e-05, + "loss": 0.27119874954223633, + "step": 96330 + }, + { + "epoch": 0.41360775525274124, + "grad_norm": 0.002815448446199298, + "learning_rate": 5.889249157058717e-05, + "loss": 0.1231507658958435, + "step": 96340 + }, + { + "epoch": 0.4136506873427612, + "grad_norm": 0.06919790059328079, + "learning_rate": 5.888817985046955e-05, + "loss": 0.0627961814403534, + "step": 96350 + }, + { + "epoch": 0.4136936194327812, + "grad_norm": 0.21034856140613556, + "learning_rate": 5.8883868130351926e-05, + "loss": 0.025909900665283203, + "step": 96360 + }, + { + "epoch": 0.41373655152280125, + "grad_norm": 0.1855197697877884, + "learning_rate": 5.88795564102343e-05, + "loss": 0.2717406749725342, + "step": 96370 + }, + { + "epoch": 0.41377948361282124, + "grad_norm": 2.263113498687744, + "learning_rate": 5.887524469011668e-05, + "loss": 0.19362281560897826, + "step": 96380 + }, + { + "epoch": 0.41382241570284123, + "grad_norm": 0.03105918876826763, + "learning_rate": 5.887093296999906e-05, + "loss": 0.450689697265625, + "step": 96390 + }, + { + "epoch": 0.41386534779286127, + "grad_norm": 0.005474720615893602, + "learning_rate": 5.886662124988143e-05, + "loss": 0.29455156326293946, + "step": 96400 + }, + { + "epoch": 0.41390827988288126, + "grad_norm": 74.08353424072266, + "learning_rate": 5.8862309529763806e-05, + "loss": 0.2952181577682495, + "step": 96410 + }, + { + "epoch": 0.41395121197290125, + "grad_norm": 0.029871761798858643, + "learning_rate": 5.885799780964618e-05, + "loss": 0.09040093421936035, + "step": 96420 + }, + { + "epoch": 0.4139941440629213, + "grad_norm": 2.837618589401245, + "learning_rate": 5.885368608952856e-05, + "loss": 0.2539191722869873, + "step": 96430 + }, + { + "epoch": 0.4140370761529413, + "grad_norm": 0.24232065677642822, + "learning_rate": 5.884937436941094e-05, + "loss": 0.1582349181175232, + "step": 96440 + }, + { + "epoch": 0.41408000824296126, + "grad_norm": 3.9438517093658447, + "learning_rate": 5.8845062649293315e-05, + "loss": 0.16419055461883544, + "step": 96450 + }, + { + "epoch": 0.4141229403329813, + "grad_norm": 3.0583577156066895, + "learning_rate": 5.884075092917569e-05, + "loss": 0.10935056209564209, + "step": 96460 + }, + { + "epoch": 0.4141658724230013, + "grad_norm": 0.005584963131695986, + "learning_rate": 5.883643920905807e-05, + "loss": 0.05950572490692139, + "step": 96470 + }, + { + "epoch": 0.4142088045130213, + "grad_norm": 0.0063299755565822124, + "learning_rate": 5.8832127488940434e-05, + "loss": 0.1775718331336975, + "step": 96480 + }, + { + "epoch": 0.4142517366030413, + "grad_norm": 3.7204408645629883, + "learning_rate": 5.882781576882281e-05, + "loss": 0.365419864654541, + "step": 96490 + }, + { + "epoch": 0.4142946686930613, + "grad_norm": 0.05920688807964325, + "learning_rate": 5.882350404870519e-05, + "loss": 0.21246140003204345, + "step": 96500 + }, + { + "epoch": 0.4143376007830813, + "grad_norm": 1.7804783582687378, + "learning_rate": 5.8819192328587566e-05, + "loss": 0.13792389631271362, + "step": 96510 + }, + { + "epoch": 0.41438053287310134, + "grad_norm": 5.373781204223633, + "learning_rate": 5.881488060846995e-05, + "loss": 0.40992259979248047, + "step": 96520 + }, + { + "epoch": 0.4144234649631213, + "grad_norm": 6.659917831420898, + "learning_rate": 5.881056888835233e-05, + "loss": 0.42495036125183105, + "step": 96530 + }, + { + "epoch": 0.41446639705314137, + "grad_norm": 0.08701377362012863, + "learning_rate": 5.8806257168234705e-05, + "loss": 0.10634886026382447, + "step": 96540 + }, + { + "epoch": 0.41450932914316135, + "grad_norm": 0.10711979866027832, + "learning_rate": 5.880194544811708e-05, + "loss": 0.1699918031692505, + "step": 96550 + }, + { + "epoch": 0.41455226123318134, + "grad_norm": 0.0040659112855792046, + "learning_rate": 5.8797633727999446e-05, + "loss": 0.08797727227210998, + "step": 96560 + }, + { + "epoch": 0.4145951933232014, + "grad_norm": 0.21539410948753357, + "learning_rate": 5.879332200788182e-05, + "loss": 0.1780964970588684, + "step": 96570 + }, + { + "epoch": 0.41463812541322137, + "grad_norm": 1.781267762184143, + "learning_rate": 5.87890102877642e-05, + "loss": 0.18408920764923095, + "step": 96580 + }, + { + "epoch": 0.41468105750324136, + "grad_norm": 0.0017236159183084965, + "learning_rate": 5.878469856764658e-05, + "loss": 0.2172388792037964, + "step": 96590 + }, + { + "epoch": 0.4147239895932614, + "grad_norm": 0.13159137964248657, + "learning_rate": 5.8780386847528955e-05, + "loss": 0.2666104078292847, + "step": 96600 + }, + { + "epoch": 0.4147669216832814, + "grad_norm": 0.0033857496455311775, + "learning_rate": 5.877607512741133e-05, + "loss": 0.308839225769043, + "step": 96610 + }, + { + "epoch": 0.4148098537733014, + "grad_norm": 5.20973014831543, + "learning_rate": 5.877176340729371e-05, + "loss": 0.15720083713531494, + "step": 96620 + }, + { + "epoch": 0.4148527858633214, + "grad_norm": 0.9707467555999756, + "learning_rate": 5.876745168717609e-05, + "loss": 0.21422302722930908, + "step": 96630 + }, + { + "epoch": 0.4148957179533414, + "grad_norm": 1.6356661319732666, + "learning_rate": 5.876313996705846e-05, + "loss": 0.14773917198181152, + "step": 96640 + }, + { + "epoch": 0.4149386500433614, + "grad_norm": 0.115482859313488, + "learning_rate": 5.8758828246940835e-05, + "loss": 0.2064431428909302, + "step": 96650 + }, + { + "epoch": 0.41498158213338143, + "grad_norm": 0.043903812766075134, + "learning_rate": 5.875451652682321e-05, + "loss": 0.3388453245162964, + "step": 96660 + }, + { + "epoch": 0.4150245142234014, + "grad_norm": 0.010513490065932274, + "learning_rate": 5.875020480670559e-05, + "loss": 0.13178856372833253, + "step": 96670 + }, + { + "epoch": 0.4150674463134214, + "grad_norm": 1.041566014289856, + "learning_rate": 5.874589308658797e-05, + "loss": 0.13429534435272217, + "step": 96680 + }, + { + "epoch": 0.41511037840344145, + "grad_norm": 0.06681652367115021, + "learning_rate": 5.8741581366470344e-05, + "loss": 0.13894530534744262, + "step": 96690 + }, + { + "epoch": 0.41515331049346144, + "grad_norm": 1.6779704093933105, + "learning_rate": 5.873726964635272e-05, + "loss": 0.31052098274230955, + "step": 96700 + }, + { + "epoch": 0.4151962425834814, + "grad_norm": 2.150348424911499, + "learning_rate": 5.87329579262351e-05, + "loss": 0.18519182205200196, + "step": 96710 + }, + { + "epoch": 0.41523917467350147, + "grad_norm": 10.719417572021484, + "learning_rate": 5.872864620611746e-05, + "loss": 0.2912954807281494, + "step": 96720 + }, + { + "epoch": 0.41528210676352145, + "grad_norm": 1.351884126663208, + "learning_rate": 5.872433448599984e-05, + "loss": 0.3322085618972778, + "step": 96730 + }, + { + "epoch": 0.4153250388535415, + "grad_norm": 4.0024189949035645, + "learning_rate": 5.8720022765882224e-05, + "loss": 0.0987987458705902, + "step": 96740 + }, + { + "epoch": 0.4153679709435615, + "grad_norm": 0.0032169087789952755, + "learning_rate": 5.87157110457646e-05, + "loss": 0.29510352611541746, + "step": 96750 + }, + { + "epoch": 0.41541090303358147, + "grad_norm": 1.3937362432479858, + "learning_rate": 5.871139932564698e-05, + "loss": 0.412169885635376, + "step": 96760 + }, + { + "epoch": 0.4154538351236015, + "grad_norm": 0.1955924928188324, + "learning_rate": 5.8707087605529356e-05, + "loss": 0.2589648008346558, + "step": 96770 + }, + { + "epoch": 0.4154967672136215, + "grad_norm": 0.37724238634109497, + "learning_rate": 5.8702775885411734e-05, + "loss": 0.20537447929382324, + "step": 96780 + }, + { + "epoch": 0.4155396993036415, + "grad_norm": 1.6324776411056519, + "learning_rate": 5.869846416529411e-05, + "loss": 0.40856242179870605, + "step": 96790 + }, + { + "epoch": 0.41558263139366153, + "grad_norm": 0.4038811922073364, + "learning_rate": 5.869415244517649e-05, + "loss": 0.20042769908905028, + "step": 96800 + }, + { + "epoch": 0.4156255634836815, + "grad_norm": 3.4731292724609375, + "learning_rate": 5.868984072505885e-05, + "loss": 0.36867854595184324, + "step": 96810 + }, + { + "epoch": 0.4156684955737015, + "grad_norm": 1.9070109128952026, + "learning_rate": 5.868552900494123e-05, + "loss": 0.25679750442504884, + "step": 96820 + }, + { + "epoch": 0.41571142766372154, + "grad_norm": 0.04523250088095665, + "learning_rate": 5.868121728482361e-05, + "loss": 0.12389757633209228, + "step": 96830 + }, + { + "epoch": 0.41575435975374153, + "grad_norm": 2.1402366161346436, + "learning_rate": 5.8676905564705984e-05, + "loss": 0.46518592834472655, + "step": 96840 + }, + { + "epoch": 0.4157972918437615, + "grad_norm": 3.220548629760742, + "learning_rate": 5.867259384458836e-05, + "loss": 0.09400500059127807, + "step": 96850 + }, + { + "epoch": 0.41584022393378156, + "grad_norm": 0.0733582079410553, + "learning_rate": 5.866828212447074e-05, + "loss": 0.18302125930786134, + "step": 96860 + }, + { + "epoch": 0.41588315602380155, + "grad_norm": 0.02238384075462818, + "learning_rate": 5.8663970404353116e-05, + "loss": 0.24416136741638184, + "step": 96870 + }, + { + "epoch": 0.41592608811382153, + "grad_norm": 1.7831960916519165, + "learning_rate": 5.8659658684235494e-05, + "loss": 0.2511852025985718, + "step": 96880 + }, + { + "epoch": 0.4159690202038416, + "grad_norm": 0.06032947823405266, + "learning_rate": 5.8655346964117864e-05, + "loss": 0.2421018123626709, + "step": 96890 + }, + { + "epoch": 0.41601195229386156, + "grad_norm": 0.001062124385498464, + "learning_rate": 5.865103524400024e-05, + "loss": 0.07756858468055725, + "step": 96900 + }, + { + "epoch": 0.41605488438388155, + "grad_norm": 2.137861490249634, + "learning_rate": 5.864672352388262e-05, + "loss": 0.17247992753982544, + "step": 96910 + }, + { + "epoch": 0.4160978164739016, + "grad_norm": 0.13371747732162476, + "learning_rate": 5.8642411803764996e-05, + "loss": 0.36290051937103274, + "step": 96920 + }, + { + "epoch": 0.4161407485639216, + "grad_norm": 0.030226441100239754, + "learning_rate": 5.8638100083647373e-05, + "loss": 0.15298424959182738, + "step": 96930 + }, + { + "epoch": 0.41618368065394157, + "grad_norm": 0.024282341822981834, + "learning_rate": 5.863378836352975e-05, + "loss": 0.1876778483390808, + "step": 96940 + }, + { + "epoch": 0.4162266127439616, + "grad_norm": 0.005532603710889816, + "learning_rate": 5.862947664341213e-05, + "loss": 0.10570700168609619, + "step": 96950 + }, + { + "epoch": 0.4162695448339816, + "grad_norm": 0.0017162829171866179, + "learning_rate": 5.8625164923294506e-05, + "loss": 0.22160534858703612, + "step": 96960 + }, + { + "epoch": 0.41631247692400164, + "grad_norm": 1.481353759765625, + "learning_rate": 5.8620853203176876e-05, + "loss": 0.4609670162200928, + "step": 96970 + }, + { + "epoch": 0.4163554090140216, + "grad_norm": 0.036570560187101364, + "learning_rate": 5.8616541483059253e-05, + "loss": 0.33320496082305906, + "step": 96980 + }, + { + "epoch": 0.4163983411040416, + "grad_norm": 1.344350814819336, + "learning_rate": 5.861222976294163e-05, + "loss": 0.16209814548492432, + "step": 96990 + }, + { + "epoch": 0.41644127319406166, + "grad_norm": 0.002683450933545828, + "learning_rate": 5.860791804282401e-05, + "loss": 0.17441989183425904, + "step": 97000 + }, + { + "epoch": 0.41644127319406166, + "eval_loss": 0.4123704433441162, + "eval_runtime": 27.0919, + "eval_samples_per_second": 3.691, + "eval_steps_per_second": 3.691, + "step": 97000 + }, + { + "epoch": 0.41648420528408164, + "grad_norm": 0.1961876004934311, + "learning_rate": 5.8603606322706385e-05, + "loss": 0.07715705633163453, + "step": 97010 + }, + { + "epoch": 0.41652713737410163, + "grad_norm": 0.01836606301367283, + "learning_rate": 5.859929460258876e-05, + "loss": 0.33584434986114503, + "step": 97020 + }, + { + "epoch": 0.4165700694641217, + "grad_norm": 2.514364719390869, + "learning_rate": 5.859498288247114e-05, + "loss": 0.1604735016822815, + "step": 97030 + }, + { + "epoch": 0.41661300155414166, + "grad_norm": 0.13551877439022064, + "learning_rate": 5.859067116235352e-05, + "loss": 0.05233732461929321, + "step": 97040 + }, + { + "epoch": 0.41665593364416165, + "grad_norm": 0.32872146368026733, + "learning_rate": 5.858635944223588e-05, + "loss": 0.18588415384292603, + "step": 97050 + }, + { + "epoch": 0.4166988657341817, + "grad_norm": 0.009478704072535038, + "learning_rate": 5.858204772211826e-05, + "loss": 0.32800989151000975, + "step": 97060 + }, + { + "epoch": 0.4167417978242017, + "grad_norm": 2.2177982330322266, + "learning_rate": 5.8577736002000636e-05, + "loss": 0.40303688049316405, + "step": 97070 + }, + { + "epoch": 0.41678472991422166, + "grad_norm": 2.8492250442504883, + "learning_rate": 5.857342428188301e-05, + "loss": 0.22970712184906006, + "step": 97080 + }, + { + "epoch": 0.4168276620042417, + "grad_norm": 5.243490219116211, + "learning_rate": 5.856911256176539e-05, + "loss": 0.2636042833328247, + "step": 97090 + }, + { + "epoch": 0.4168705940942617, + "grad_norm": 0.758879542350769, + "learning_rate": 5.856480084164777e-05, + "loss": 0.13969314098358154, + "step": 97100 + }, + { + "epoch": 0.4169135261842817, + "grad_norm": 0.11793681234121323, + "learning_rate": 5.856048912153015e-05, + "loss": 0.13169124126434326, + "step": 97110 + }, + { + "epoch": 0.4169564582743017, + "grad_norm": 0.13415993750095367, + "learning_rate": 5.855617740141253e-05, + "loss": 0.1481427550315857, + "step": 97120 + }, + { + "epoch": 0.4169993903643217, + "grad_norm": 1.363560676574707, + "learning_rate": 5.855186568129491e-05, + "loss": 0.22657833099365235, + "step": 97130 + }, + { + "epoch": 0.4170423224543417, + "grad_norm": 2.9042086601257324, + "learning_rate": 5.854755396117727e-05, + "loss": 0.14560015201568605, + "step": 97140 + }, + { + "epoch": 0.41708525454436174, + "grad_norm": 1.215306282043457, + "learning_rate": 5.854324224105965e-05, + "loss": 0.36920900344848634, + "step": 97150 + }, + { + "epoch": 0.4171281866343817, + "grad_norm": 1.1477103233337402, + "learning_rate": 5.8538930520942025e-05, + "loss": 0.16628166437149047, + "step": 97160 + }, + { + "epoch": 0.41717111872440177, + "grad_norm": 3.602811574935913, + "learning_rate": 5.85346188008244e-05, + "loss": 0.2822127819061279, + "step": 97170 + }, + { + "epoch": 0.41721405081442176, + "grad_norm": 0.006275674793869257, + "learning_rate": 5.853030708070678e-05, + "loss": 0.1495327353477478, + "step": 97180 + }, + { + "epoch": 0.41725698290444174, + "grad_norm": 0.0992029532790184, + "learning_rate": 5.852599536058916e-05, + "loss": 0.2783435106277466, + "step": 97190 + }, + { + "epoch": 0.4172999149944618, + "grad_norm": 1.2494653463363647, + "learning_rate": 5.8521683640471535e-05, + "loss": 0.07135303020477295, + "step": 97200 + }, + { + "epoch": 0.41734284708448177, + "grad_norm": 1.0946789979934692, + "learning_rate": 5.851737192035391e-05, + "loss": 0.25925579071044924, + "step": 97210 + }, + { + "epoch": 0.41738577917450176, + "grad_norm": 1.9514451026916504, + "learning_rate": 5.851306020023628e-05, + "loss": 0.4609402656555176, + "step": 97220 + }, + { + "epoch": 0.4174287112645218, + "grad_norm": 0.012375736609101295, + "learning_rate": 5.850874848011866e-05, + "loss": 0.3152902603149414, + "step": 97230 + }, + { + "epoch": 0.4174716433545418, + "grad_norm": 0.3962627649307251, + "learning_rate": 5.850443676000104e-05, + "loss": 0.2831923007965088, + "step": 97240 + }, + { + "epoch": 0.4175145754445618, + "grad_norm": 0.19984892010688782, + "learning_rate": 5.8500125039883415e-05, + "loss": 0.010816796123981476, + "step": 97250 + }, + { + "epoch": 0.4175575075345818, + "grad_norm": 0.09338950365781784, + "learning_rate": 5.849581331976579e-05, + "loss": 0.3447253704071045, + "step": 97260 + }, + { + "epoch": 0.4176004396246018, + "grad_norm": 0.032562434673309326, + "learning_rate": 5.849150159964817e-05, + "loss": 0.14689211845397948, + "step": 97270 + }, + { + "epoch": 0.4176433717146218, + "grad_norm": 0.1437419056892395, + "learning_rate": 5.848718987953055e-05, + "loss": 0.1887149214744568, + "step": 97280 + }, + { + "epoch": 0.41768630380464183, + "grad_norm": 0.8966065049171448, + "learning_rate": 5.8482878159412924e-05, + "loss": 0.11084823608398438, + "step": 97290 + }, + { + "epoch": 0.4177292358946618, + "grad_norm": 0.007662808522582054, + "learning_rate": 5.847856643929529e-05, + "loss": 0.32145259380340574, + "step": 97300 + }, + { + "epoch": 0.4177721679846818, + "grad_norm": 0.02117268368601799, + "learning_rate": 5.8474254719177665e-05, + "loss": 0.044441819190979004, + "step": 97310 + }, + { + "epoch": 0.41781510007470185, + "grad_norm": 0.01486627385020256, + "learning_rate": 5.846994299906004e-05, + "loss": 0.5016047477722168, + "step": 97320 + }, + { + "epoch": 0.41785803216472184, + "grad_norm": 0.8546322584152222, + "learning_rate": 5.8465631278942427e-05, + "loss": 0.32282209396362305, + "step": 97330 + }, + { + "epoch": 0.4179009642547418, + "grad_norm": 22.95284080505371, + "learning_rate": 5.8461319558824804e-05, + "loss": 0.21673707962036132, + "step": 97340 + }, + { + "epoch": 0.41794389634476187, + "grad_norm": 0.05270211771130562, + "learning_rate": 5.845700783870718e-05, + "loss": 0.06792616844177246, + "step": 97350 + }, + { + "epoch": 0.41798682843478185, + "grad_norm": 0.16774402558803558, + "learning_rate": 5.845269611858956e-05, + "loss": 0.33600101470947263, + "step": 97360 + }, + { + "epoch": 0.41802976052480184, + "grad_norm": 0.011165248230099678, + "learning_rate": 5.8448384398471936e-05, + "loss": 0.1859840750694275, + "step": 97370 + }, + { + "epoch": 0.4180726926148219, + "grad_norm": 1.0918728113174438, + "learning_rate": 5.84440726783543e-05, + "loss": 0.16099945306777955, + "step": 97380 + }, + { + "epoch": 0.41811562470484187, + "grad_norm": 1.3140350580215454, + "learning_rate": 5.843976095823668e-05, + "loss": 0.29539175033569337, + "step": 97390 + }, + { + "epoch": 0.4181585567948619, + "grad_norm": 0.12994621694087982, + "learning_rate": 5.8435449238119054e-05, + "loss": 0.38184683322906493, + "step": 97400 + }, + { + "epoch": 0.4182014888848819, + "grad_norm": 0.04722442105412483, + "learning_rate": 5.843113751800143e-05, + "loss": 0.25283255577087405, + "step": 97410 + }, + { + "epoch": 0.4182444209749019, + "grad_norm": 0.008985154330730438, + "learning_rate": 5.842682579788381e-05, + "loss": 0.22941651344299316, + "step": 97420 + }, + { + "epoch": 0.41828735306492193, + "grad_norm": 0.011790527030825615, + "learning_rate": 5.8422514077766186e-05, + "loss": 0.2190561532974243, + "step": 97430 + }, + { + "epoch": 0.4183302851549419, + "grad_norm": 0.33150047063827515, + "learning_rate": 5.8418202357648564e-05, + "loss": 0.26463005542755125, + "step": 97440 + }, + { + "epoch": 0.4183732172449619, + "grad_norm": 0.11258593201637268, + "learning_rate": 5.841389063753094e-05, + "loss": 0.07938405871391296, + "step": 97450 + }, + { + "epoch": 0.41841614933498195, + "grad_norm": 0.0037087793461978436, + "learning_rate": 5.840957891741331e-05, + "loss": 0.10488677024841309, + "step": 97460 + }, + { + "epoch": 0.41845908142500193, + "grad_norm": 3.129962682723999, + "learning_rate": 5.840526719729569e-05, + "loss": 0.5519233703613281, + "step": 97470 + }, + { + "epoch": 0.4185020135150219, + "grad_norm": 0.023688072338700294, + "learning_rate": 5.8400955477178066e-05, + "loss": 0.04940264523029327, + "step": 97480 + }, + { + "epoch": 0.41854494560504196, + "grad_norm": 0.45331892371177673, + "learning_rate": 5.8396643757060444e-05, + "loss": 0.23521509170532226, + "step": 97490 + }, + { + "epoch": 0.41858787769506195, + "grad_norm": 1.5123122930526733, + "learning_rate": 5.839233203694282e-05, + "loss": 0.3438755512237549, + "step": 97500 + }, + { + "epoch": 0.41863080978508194, + "grad_norm": 0.016864225268363953, + "learning_rate": 5.83880203168252e-05, + "loss": 0.2781703948974609, + "step": 97510 + }, + { + "epoch": 0.418673741875102, + "grad_norm": 0.37317079305648804, + "learning_rate": 5.8383708596707576e-05, + "loss": 0.3207017183303833, + "step": 97520 + }, + { + "epoch": 0.41871667396512197, + "grad_norm": 1.5420507192611694, + "learning_rate": 5.837939687658995e-05, + "loss": 0.4660184383392334, + "step": 97530 + }, + { + "epoch": 0.41875960605514195, + "grad_norm": 3.5812125205993652, + "learning_rate": 5.837508515647233e-05, + "loss": 0.3520195960998535, + "step": 97540 + }, + { + "epoch": 0.418802538145162, + "grad_norm": 0.13337750732898712, + "learning_rate": 5.83707734363547e-05, + "loss": 0.26060965061187746, + "step": 97550 + }, + { + "epoch": 0.418845470235182, + "grad_norm": 0.35369372367858887, + "learning_rate": 5.836646171623708e-05, + "loss": 0.22286348342895507, + "step": 97560 + }, + { + "epoch": 0.41888840232520197, + "grad_norm": 2.425368070602417, + "learning_rate": 5.8362149996119456e-05, + "loss": 0.17919390201568602, + "step": 97570 + }, + { + "epoch": 0.418931334415222, + "grad_norm": 0.0627279132604599, + "learning_rate": 5.835783827600183e-05, + "loss": 0.26877822875976565, + "step": 97580 + }, + { + "epoch": 0.418974266505242, + "grad_norm": 2.9158642292022705, + "learning_rate": 5.835352655588421e-05, + "loss": 0.10325750112533569, + "step": 97590 + }, + { + "epoch": 0.41901719859526204, + "grad_norm": 0.11321698129177094, + "learning_rate": 5.834921483576659e-05, + "loss": 0.19380873441696167, + "step": 97600 + }, + { + "epoch": 0.41906013068528203, + "grad_norm": 0.023406973108649254, + "learning_rate": 5.8344903115648965e-05, + "loss": 0.17096658945083618, + "step": 97610 + }, + { + "epoch": 0.419103062775302, + "grad_norm": 0.08598892390727997, + "learning_rate": 5.834059139553134e-05, + "loss": 0.08025979399681091, + "step": 97620 + }, + { + "epoch": 0.41914599486532206, + "grad_norm": 0.0026491612661629915, + "learning_rate": 5.8336279675413706e-05, + "loss": 0.16931982040405275, + "step": 97630 + }, + { + "epoch": 0.41918892695534204, + "grad_norm": 0.004537897650152445, + "learning_rate": 5.8331967955296084e-05, + "loss": 0.17108237743377686, + "step": 97640 + }, + { + "epoch": 0.41923185904536203, + "grad_norm": 0.19760261476039886, + "learning_rate": 5.832765623517846e-05, + "loss": 0.21934175491333008, + "step": 97650 + }, + { + "epoch": 0.4192747911353821, + "grad_norm": 1.2101079225540161, + "learning_rate": 5.832334451506084e-05, + "loss": 0.3915103435516357, + "step": 97660 + }, + { + "epoch": 0.41931772322540206, + "grad_norm": 0.015090469270944595, + "learning_rate": 5.8319032794943216e-05, + "loss": 0.13569951057434082, + "step": 97670 + }, + { + "epoch": 0.41936065531542205, + "grad_norm": 0.014168272726237774, + "learning_rate": 5.831472107482559e-05, + "loss": 0.22858223915100098, + "step": 97680 + }, + { + "epoch": 0.4194035874054421, + "grad_norm": 0.03062673658132553, + "learning_rate": 5.831040935470797e-05, + "loss": 0.23994054794311523, + "step": 97690 + }, + { + "epoch": 0.4194465194954621, + "grad_norm": 0.020169761031866074, + "learning_rate": 5.830609763459035e-05, + "loss": 0.22672290802001954, + "step": 97700 + }, + { + "epoch": 0.41948945158548206, + "grad_norm": 6.934675693511963, + "learning_rate": 5.830178591447272e-05, + "loss": 0.3498288869857788, + "step": 97710 + }, + { + "epoch": 0.4195323836755021, + "grad_norm": 0.15747320652008057, + "learning_rate": 5.8297474194355096e-05, + "loss": 0.3790154457092285, + "step": 97720 + }, + { + "epoch": 0.4195753157655221, + "grad_norm": 0.33371075987815857, + "learning_rate": 5.829316247423747e-05, + "loss": 0.14788826704025268, + "step": 97730 + }, + { + "epoch": 0.4196182478555421, + "grad_norm": 0.058594830334186554, + "learning_rate": 5.828885075411985e-05, + "loss": 0.5652408599853516, + "step": 97740 + }, + { + "epoch": 0.4196611799455621, + "grad_norm": 0.15235500037670135, + "learning_rate": 5.828453903400223e-05, + "loss": 0.16906936168670655, + "step": 97750 + }, + { + "epoch": 0.4197041120355821, + "grad_norm": 0.10869899392127991, + "learning_rate": 5.8280227313884605e-05, + "loss": 0.12809311151504515, + "step": 97760 + }, + { + "epoch": 0.4197470441256021, + "grad_norm": 0.16404810547828674, + "learning_rate": 5.827591559376698e-05, + "loss": 0.11475526094436646, + "step": 97770 + }, + { + "epoch": 0.41978997621562214, + "grad_norm": 1.0307124853134155, + "learning_rate": 5.827160387364936e-05, + "loss": 0.22724244594573975, + "step": 97780 + }, + { + "epoch": 0.4198329083056421, + "grad_norm": 0.008609895594418049, + "learning_rate": 5.826729215353173e-05, + "loss": 0.23866071701049804, + "step": 97790 + }, + { + "epoch": 0.4198758403956621, + "grad_norm": 3.9870755672454834, + "learning_rate": 5.826298043341411e-05, + "loss": 0.21454358100891113, + "step": 97800 + }, + { + "epoch": 0.41991877248568216, + "grad_norm": 4.111325740814209, + "learning_rate": 5.8258668713296485e-05, + "loss": 0.10587308406829835, + "step": 97810 + }, + { + "epoch": 0.41996170457570214, + "grad_norm": 0.05987037718296051, + "learning_rate": 5.825435699317886e-05, + "loss": 0.22635457515716553, + "step": 97820 + }, + { + "epoch": 0.4200046366657222, + "grad_norm": 1.7137806415557861, + "learning_rate": 5.825004527306124e-05, + "loss": 0.3221606731414795, + "step": 97830 + }, + { + "epoch": 0.4200475687557422, + "grad_norm": 0.037945330142974854, + "learning_rate": 5.824573355294362e-05, + "loss": 0.026730722188949584, + "step": 97840 + }, + { + "epoch": 0.42009050084576216, + "grad_norm": 0.6939473152160645, + "learning_rate": 5.8241421832825994e-05, + "loss": 0.24455561637878417, + "step": 97850 + }, + { + "epoch": 0.4201334329357822, + "grad_norm": 1.968237042427063, + "learning_rate": 5.823711011270837e-05, + "loss": 0.4216612815856934, + "step": 97860 + }, + { + "epoch": 0.4201763650258022, + "grad_norm": 0.005326097831130028, + "learning_rate": 5.823279839259075e-05, + "loss": 0.1221767544746399, + "step": 97870 + }, + { + "epoch": 0.4202192971158222, + "grad_norm": 4.377943992614746, + "learning_rate": 5.822848667247311e-05, + "loss": 0.3343919277191162, + "step": 97880 + }, + { + "epoch": 0.4202622292058422, + "grad_norm": 2.684236764907837, + "learning_rate": 5.822417495235549e-05, + "loss": 0.25210669040679934, + "step": 97890 + }, + { + "epoch": 0.4203051612958622, + "grad_norm": 0.35695880651474, + "learning_rate": 5.821986323223787e-05, + "loss": 0.11876271963119507, + "step": 97900 + }, + { + "epoch": 0.4203480933858822, + "grad_norm": 0.12343742698431015, + "learning_rate": 5.8215551512120245e-05, + "loss": 0.15900148153305055, + "step": 97910 + }, + { + "epoch": 0.42039102547590224, + "grad_norm": 0.5640662312507629, + "learning_rate": 5.821123979200262e-05, + "loss": 0.11388663053512574, + "step": 97920 + }, + { + "epoch": 0.4204339575659222, + "grad_norm": 3.1699492931365967, + "learning_rate": 5.8206928071885006e-05, + "loss": 0.20103952884674073, + "step": 97930 + }, + { + "epoch": 0.4204768896559422, + "grad_norm": 0.04569260776042938, + "learning_rate": 5.8202616351767383e-05, + "loss": 0.10158834457397461, + "step": 97940 + }, + { + "epoch": 0.42051982174596225, + "grad_norm": 1.2294700145721436, + "learning_rate": 5.819830463164976e-05, + "loss": 0.23906416893005372, + "step": 97950 + }, + { + "epoch": 0.42056275383598224, + "grad_norm": 0.3550843596458435, + "learning_rate": 5.8193992911532125e-05, + "loss": 0.0822509467601776, + "step": 97960 + }, + { + "epoch": 0.4206056859260022, + "grad_norm": 2.002542734146118, + "learning_rate": 5.81896811914145e-05, + "loss": 0.2353053092956543, + "step": 97970 + }, + { + "epoch": 0.42064861801602227, + "grad_norm": 0.06945760548114777, + "learning_rate": 5.818536947129688e-05, + "loss": 0.236065673828125, + "step": 97980 + }, + { + "epoch": 0.42069155010604226, + "grad_norm": 0.017344938591122627, + "learning_rate": 5.818105775117926e-05, + "loss": 0.11889860630035401, + "step": 97990 + }, + { + "epoch": 0.42073448219606224, + "grad_norm": 0.03761398419737816, + "learning_rate": 5.8176746031061634e-05, + "loss": 0.2174436092376709, + "step": 98000 + }, + { + "epoch": 0.42073448219606224, + "eval_loss": 0.4053749442100525, + "eval_runtime": 27.2888, + "eval_samples_per_second": 3.665, + "eval_steps_per_second": 3.665, + "step": 98000 + }, + { + "epoch": 0.4207774142860823, + "grad_norm": 0.5761705040931702, + "learning_rate": 5.817243431094401e-05, + "loss": 0.08439416885375976, + "step": 98010 + }, + { + "epoch": 0.42082034637610227, + "grad_norm": 0.11438914388418198, + "learning_rate": 5.816812259082639e-05, + "loss": 0.08448938131332398, + "step": 98020 + }, + { + "epoch": 0.4208632784661223, + "grad_norm": 0.05054575949907303, + "learning_rate": 5.8163810870708766e-05, + "loss": 0.34284517765045164, + "step": 98030 + }, + { + "epoch": 0.4209062105561423, + "grad_norm": 1.153587818145752, + "learning_rate": 5.8159499150591137e-05, + "loss": 0.23103575706481932, + "step": 98040 + }, + { + "epoch": 0.4209491426461623, + "grad_norm": 0.6570166349411011, + "learning_rate": 5.8155187430473514e-05, + "loss": 0.5498339176177979, + "step": 98050 + }, + { + "epoch": 0.42099207473618233, + "grad_norm": 1.3087819814682007, + "learning_rate": 5.815087571035589e-05, + "loss": 0.23551971912384034, + "step": 98060 + }, + { + "epoch": 0.4210350068262023, + "grad_norm": 0.23769740760326385, + "learning_rate": 5.814656399023827e-05, + "loss": 0.347845721244812, + "step": 98070 + }, + { + "epoch": 0.4210779389162223, + "grad_norm": 1.0044053792953491, + "learning_rate": 5.8142252270120646e-05, + "loss": 0.09060815572738648, + "step": 98080 + }, + { + "epoch": 0.42112087100624235, + "grad_norm": 1.1900349855422974, + "learning_rate": 5.813794055000302e-05, + "loss": 0.2937170028686523, + "step": 98090 + }, + { + "epoch": 0.42116380309626233, + "grad_norm": 0.9972975254058838, + "learning_rate": 5.81336288298854e-05, + "loss": 0.04283437132835388, + "step": 98100 + }, + { + "epoch": 0.4212067351862823, + "grad_norm": 2.356654644012451, + "learning_rate": 5.812931710976778e-05, + "loss": 0.14631413221359252, + "step": 98110 + }, + { + "epoch": 0.42124966727630236, + "grad_norm": 5.286679267883301, + "learning_rate": 5.812500538965014e-05, + "loss": 0.2211087465286255, + "step": 98120 + }, + { + "epoch": 0.42129259936632235, + "grad_norm": 2.404400587081909, + "learning_rate": 5.812069366953252e-05, + "loss": 0.20612568855285646, + "step": 98130 + }, + { + "epoch": 0.42133553145634234, + "grad_norm": 0.02629181742668152, + "learning_rate": 5.8116381949414896e-05, + "loss": 0.05725756883621216, + "step": 98140 + }, + { + "epoch": 0.4213784635463624, + "grad_norm": 1.1704928874969482, + "learning_rate": 5.811207022929728e-05, + "loss": 0.35143492221832273, + "step": 98150 + }, + { + "epoch": 0.42142139563638237, + "grad_norm": 0.5044357180595398, + "learning_rate": 5.810775850917966e-05, + "loss": 0.3639864206314087, + "step": 98160 + }, + { + "epoch": 0.42146432772640235, + "grad_norm": 0.005088315811008215, + "learning_rate": 5.8103446789062035e-05, + "loss": 0.24358859062194824, + "step": 98170 + }, + { + "epoch": 0.4215072598164224, + "grad_norm": 0.05479007959365845, + "learning_rate": 5.809913506894441e-05, + "loss": 0.20665128231048585, + "step": 98180 + }, + { + "epoch": 0.4215501919064424, + "grad_norm": 0.9962042570114136, + "learning_rate": 5.809482334882679e-05, + "loss": 0.29216752052307127, + "step": 98190 + }, + { + "epoch": 0.42159312399646237, + "grad_norm": 0.5849328637123108, + "learning_rate": 5.809051162870917e-05, + "loss": 0.3833322286605835, + "step": 98200 + }, + { + "epoch": 0.4216360560864824, + "grad_norm": 0.5021851658821106, + "learning_rate": 5.808619990859153e-05, + "loss": 0.39357795715332033, + "step": 98210 + }, + { + "epoch": 0.4216789881765024, + "grad_norm": 0.338433176279068, + "learning_rate": 5.808188818847391e-05, + "loss": 0.12214242219924927, + "step": 98220 + }, + { + "epoch": 0.4217219202665224, + "grad_norm": 3.6846652030944824, + "learning_rate": 5.8077576468356286e-05, + "loss": 0.24050602912902833, + "step": 98230 + }, + { + "epoch": 0.42176485235654243, + "grad_norm": 2.1717207431793213, + "learning_rate": 5.807326474823866e-05, + "loss": 0.25009167194366455, + "step": 98240 + }, + { + "epoch": 0.4218077844465624, + "grad_norm": 0.14511777460575104, + "learning_rate": 5.806895302812104e-05, + "loss": 0.12174754142761231, + "step": 98250 + }, + { + "epoch": 0.42185071653658246, + "grad_norm": 0.389323353767395, + "learning_rate": 5.806464130800342e-05, + "loss": 0.16142910718917847, + "step": 98260 + }, + { + "epoch": 0.42189364862660245, + "grad_norm": 2.4261882305145264, + "learning_rate": 5.8060329587885795e-05, + "loss": 0.391825270652771, + "step": 98270 + }, + { + "epoch": 0.42193658071662243, + "grad_norm": 0.28029099106788635, + "learning_rate": 5.805601786776817e-05, + "loss": 0.1543244242668152, + "step": 98280 + }, + { + "epoch": 0.4219795128066425, + "grad_norm": 1.7710871696472168, + "learning_rate": 5.805170614765054e-05, + "loss": 0.37169885635375977, + "step": 98290 + }, + { + "epoch": 0.42202244489666246, + "grad_norm": 0.004360601771622896, + "learning_rate": 5.804739442753292e-05, + "loss": 0.16414964199066162, + "step": 98300 + }, + { + "epoch": 0.42206537698668245, + "grad_norm": 0.01507270336151123, + "learning_rate": 5.80430827074153e-05, + "loss": 0.18333516120910645, + "step": 98310 + }, + { + "epoch": 0.4221083090767025, + "grad_norm": 1.3948227167129517, + "learning_rate": 5.8038770987297675e-05, + "loss": 0.18821496963500978, + "step": 98320 + }, + { + "epoch": 0.4221512411667225, + "grad_norm": 1.3636878728866577, + "learning_rate": 5.803445926718005e-05, + "loss": 0.14322640895843505, + "step": 98330 + }, + { + "epoch": 0.42219417325674247, + "grad_norm": 0.031215589493513107, + "learning_rate": 5.803014754706243e-05, + "loss": 0.1651058316230774, + "step": 98340 + }, + { + "epoch": 0.4222371053467625, + "grad_norm": 2.045180082321167, + "learning_rate": 5.802583582694481e-05, + "loss": 0.16372413635253907, + "step": 98350 + }, + { + "epoch": 0.4222800374367825, + "grad_norm": 6.121435165405273, + "learning_rate": 5.8021524106827184e-05, + "loss": 0.15840452909469604, + "step": 98360 + }, + { + "epoch": 0.4223229695268025, + "grad_norm": 23.101932525634766, + "learning_rate": 5.8017212386709555e-05, + "loss": 0.16662964820861817, + "step": 98370 + }, + { + "epoch": 0.4223659016168225, + "grad_norm": 1.8428597450256348, + "learning_rate": 5.801290066659193e-05, + "loss": 0.3516210079193115, + "step": 98380 + }, + { + "epoch": 0.4224088337068425, + "grad_norm": 0.46591079235076904, + "learning_rate": 5.800858894647431e-05, + "loss": 0.1679096817970276, + "step": 98390 + }, + { + "epoch": 0.4224517657968625, + "grad_norm": 3.5978612899780273, + "learning_rate": 5.800427722635669e-05, + "loss": 0.2919458866119385, + "step": 98400 + }, + { + "epoch": 0.42249469788688254, + "grad_norm": 0.003413701429963112, + "learning_rate": 5.7999965506239064e-05, + "loss": 0.18819609880447388, + "step": 98410 + }, + { + "epoch": 0.42253762997690253, + "grad_norm": 0.2167656570672989, + "learning_rate": 5.799565378612144e-05, + "loss": 0.08948023915290833, + "step": 98420 + }, + { + "epoch": 0.4225805620669225, + "grad_norm": 0.01767050288617611, + "learning_rate": 5.799134206600382e-05, + "loss": 0.40622797012329104, + "step": 98430 + }, + { + "epoch": 0.42262349415694256, + "grad_norm": 0.04388017579913139, + "learning_rate": 5.7987030345886196e-05, + "loss": 0.2401859998703003, + "step": 98440 + }, + { + "epoch": 0.42266642624696255, + "grad_norm": 3.0155787467956543, + "learning_rate": 5.798271862576856e-05, + "loss": 0.3773958683013916, + "step": 98450 + }, + { + "epoch": 0.4227093583369826, + "grad_norm": 1.8744343519210815, + "learning_rate": 5.797840690565094e-05, + "loss": 0.21185708045959473, + "step": 98460 + }, + { + "epoch": 0.4227522904270026, + "grad_norm": 0.007587990257889032, + "learning_rate": 5.7974095185533315e-05, + "loss": 0.2657394647598267, + "step": 98470 + }, + { + "epoch": 0.42279522251702256, + "grad_norm": 0.4172123670578003, + "learning_rate": 5.796978346541569e-05, + "loss": 0.076595139503479, + "step": 98480 + }, + { + "epoch": 0.4228381546070426, + "grad_norm": 1.0345379114151, + "learning_rate": 5.796547174529807e-05, + "loss": 0.3573399782180786, + "step": 98490 + }, + { + "epoch": 0.4228810866970626, + "grad_norm": 0.10183721035718918, + "learning_rate": 5.796116002518045e-05, + "loss": 0.13347251415252687, + "step": 98500 + }, + { + "epoch": 0.4229240187870826, + "grad_norm": 0.05460893362760544, + "learning_rate": 5.7956848305062824e-05, + "loss": 0.0026408961042761804, + "step": 98510 + }, + { + "epoch": 0.4229669508771026, + "grad_norm": 1.4982709884643555, + "learning_rate": 5.795253658494521e-05, + "loss": 0.24210147857666015, + "step": 98520 + }, + { + "epoch": 0.4230098829671226, + "grad_norm": 0.048997290432453156, + "learning_rate": 5.794822486482757e-05, + "loss": 0.1325900673866272, + "step": 98530 + }, + { + "epoch": 0.4230528150571426, + "grad_norm": 2.290670156478882, + "learning_rate": 5.794391314470995e-05, + "loss": 0.08407641649246216, + "step": 98540 + }, + { + "epoch": 0.42309574714716264, + "grad_norm": 0.0597001388669014, + "learning_rate": 5.793960142459233e-05, + "loss": 0.3538869619369507, + "step": 98550 + }, + { + "epoch": 0.4231386792371826, + "grad_norm": 2.8032407760620117, + "learning_rate": 5.7935289704474704e-05, + "loss": 0.41775975227355955, + "step": 98560 + }, + { + "epoch": 0.4231816113272026, + "grad_norm": 0.243858203291893, + "learning_rate": 5.793097798435708e-05, + "loss": 0.2470792770385742, + "step": 98570 + }, + { + "epoch": 0.42322454341722265, + "grad_norm": 0.4460698068141937, + "learning_rate": 5.792666626423946e-05, + "loss": 0.10146148204803467, + "step": 98580 + }, + { + "epoch": 0.42326747550724264, + "grad_norm": 0.24464015662670135, + "learning_rate": 5.7922354544121836e-05, + "loss": 0.19643570184707643, + "step": 98590 + }, + { + "epoch": 0.42331040759726263, + "grad_norm": 1.338011622428894, + "learning_rate": 5.7918042824004214e-05, + "loss": 0.1863824486732483, + "step": 98600 + }, + { + "epoch": 0.42335333968728267, + "grad_norm": 0.011673185974359512, + "learning_rate": 5.791373110388659e-05, + "loss": 0.06954334378242492, + "step": 98610 + }, + { + "epoch": 0.42339627177730266, + "grad_norm": 0.050423551350831985, + "learning_rate": 5.790941938376896e-05, + "loss": 0.28564701080322263, + "step": 98620 + }, + { + "epoch": 0.42343920386732264, + "grad_norm": 6.057043552398682, + "learning_rate": 5.790510766365134e-05, + "loss": 0.3044910192489624, + "step": 98630 + }, + { + "epoch": 0.4234821359573427, + "grad_norm": 1.2167507410049438, + "learning_rate": 5.7900795943533716e-05, + "loss": 0.3324015140533447, + "step": 98640 + }, + { + "epoch": 0.4235250680473627, + "grad_norm": 0.0021167939994484186, + "learning_rate": 5.7896484223416094e-05, + "loss": 0.1250348687171936, + "step": 98650 + }, + { + "epoch": 0.42356800013738266, + "grad_norm": 0.003853866131976247, + "learning_rate": 5.789217250329847e-05, + "loss": 0.21840078830718995, + "step": 98660 + }, + { + "epoch": 0.4236109322274027, + "grad_norm": 8.526963233947754, + "learning_rate": 5.788786078318085e-05, + "loss": 0.4191432952880859, + "step": 98670 + }, + { + "epoch": 0.4236538643174227, + "grad_norm": 0.0189904123544693, + "learning_rate": 5.7883549063063226e-05, + "loss": 0.30071659088134767, + "step": 98680 + }, + { + "epoch": 0.42369679640744273, + "grad_norm": 0.03762015700340271, + "learning_rate": 5.78792373429456e-05, + "loss": 0.23586046695709229, + "step": 98690 + }, + { + "epoch": 0.4237397284974627, + "grad_norm": 0.04850266873836517, + "learning_rate": 5.787492562282797e-05, + "loss": 0.3225547313690186, + "step": 98700 + }, + { + "epoch": 0.4237826605874827, + "grad_norm": 0.03791253641247749, + "learning_rate": 5.7870613902710344e-05, + "loss": 0.11718940734863281, + "step": 98710 + }, + { + "epoch": 0.42382559267750275, + "grad_norm": 0.030987627804279327, + "learning_rate": 5.786630218259272e-05, + "loss": 0.21683728694915771, + "step": 98720 + }, + { + "epoch": 0.42386852476752274, + "grad_norm": 15.335086822509766, + "learning_rate": 5.78619904624751e-05, + "loss": 0.35996179580688475, + "step": 98730 + }, + { + "epoch": 0.4239114568575427, + "grad_norm": 0.002879585837945342, + "learning_rate": 5.785767874235748e-05, + "loss": 0.44658312797546384, + "step": 98740 + }, + { + "epoch": 0.42395438894756277, + "grad_norm": 0.14629849791526794, + "learning_rate": 5.785336702223986e-05, + "loss": 0.0736695647239685, + "step": 98750 + }, + { + "epoch": 0.42399732103758275, + "grad_norm": 0.13867326080799103, + "learning_rate": 5.784905530212224e-05, + "loss": 0.12435241937637329, + "step": 98760 + }, + { + "epoch": 0.42404025312760274, + "grad_norm": 0.03923346847295761, + "learning_rate": 5.7844743582004615e-05, + "loss": 0.21064164638519287, + "step": 98770 + }, + { + "epoch": 0.4240831852176228, + "grad_norm": 0.10467129945755005, + "learning_rate": 5.784043186188698e-05, + "loss": 0.15324407815933228, + "step": 98780 + }, + { + "epoch": 0.42412611730764277, + "grad_norm": 0.024315934628248215, + "learning_rate": 5.7836120141769356e-05, + "loss": 0.15878936052322387, + "step": 98790 + }, + { + "epoch": 0.42416904939766276, + "grad_norm": 0.013896098360419273, + "learning_rate": 5.783180842165173e-05, + "loss": 0.22481341361999513, + "step": 98800 + }, + { + "epoch": 0.4242119814876828, + "grad_norm": 0.004909253679215908, + "learning_rate": 5.782749670153411e-05, + "loss": 0.3054252862930298, + "step": 98810 + }, + { + "epoch": 0.4242549135777028, + "grad_norm": 2.7937729358673096, + "learning_rate": 5.782318498141649e-05, + "loss": 0.3032632350921631, + "step": 98820 + }, + { + "epoch": 0.4242978456677228, + "grad_norm": 2.777571439743042, + "learning_rate": 5.7818873261298865e-05, + "loss": 0.3512873649597168, + "step": 98830 + }, + { + "epoch": 0.4243407777577428, + "grad_norm": 3.122788429260254, + "learning_rate": 5.781456154118124e-05, + "loss": 0.2130906105041504, + "step": 98840 + }, + { + "epoch": 0.4243837098477628, + "grad_norm": 0.04536563903093338, + "learning_rate": 5.781024982106362e-05, + "loss": 0.10688605308532714, + "step": 98850 + }, + { + "epoch": 0.4244266419377828, + "grad_norm": 1.177663803100586, + "learning_rate": 5.780593810094599e-05, + "loss": 0.28160221576690675, + "step": 98860 + }, + { + "epoch": 0.42446957402780283, + "grad_norm": 0.0027043658774346113, + "learning_rate": 5.780162638082837e-05, + "loss": 0.226685094833374, + "step": 98870 + }, + { + "epoch": 0.4245125061178228, + "grad_norm": 2.435696840286255, + "learning_rate": 5.7797314660710745e-05, + "loss": 0.3340808153152466, + "step": 98880 + }, + { + "epoch": 0.42455543820784286, + "grad_norm": 0.0020951908081769943, + "learning_rate": 5.779300294059312e-05, + "loss": 0.1408405900001526, + "step": 98890 + }, + { + "epoch": 0.42459837029786285, + "grad_norm": 2.14007306098938, + "learning_rate": 5.77886912204755e-05, + "loss": 0.2413254737854004, + "step": 98900 + }, + { + "epoch": 0.42464130238788284, + "grad_norm": 0.0012880098074674606, + "learning_rate": 5.778437950035788e-05, + "loss": 0.16362298727035524, + "step": 98910 + }, + { + "epoch": 0.4246842344779029, + "grad_norm": 0.004306933842599392, + "learning_rate": 5.7780067780240255e-05, + "loss": 0.2394641399383545, + "step": 98920 + }, + { + "epoch": 0.42472716656792286, + "grad_norm": 1.6067447662353516, + "learning_rate": 5.777575606012263e-05, + "loss": 0.48149833679199217, + "step": 98930 + }, + { + "epoch": 0.42477009865794285, + "grad_norm": 2.7072231769561768, + "learning_rate": 5.777144434000501e-05, + "loss": 0.17587217092514038, + "step": 98940 + }, + { + "epoch": 0.4248130307479629, + "grad_norm": 4.478725433349609, + "learning_rate": 5.776713261988737e-05, + "loss": 0.30828609466552737, + "step": 98950 + }, + { + "epoch": 0.4248559628379829, + "grad_norm": 0.7346305251121521, + "learning_rate": 5.776282089976976e-05, + "loss": 0.1656672477722168, + "step": 98960 + }, + { + "epoch": 0.42489889492800287, + "grad_norm": 0.003329846076667309, + "learning_rate": 5.7758509179652135e-05, + "loss": 0.37731635570526123, + "step": 98970 + }, + { + "epoch": 0.4249418270180229, + "grad_norm": 0.0670686811208725, + "learning_rate": 5.775419745953451e-05, + "loss": 0.2388993263244629, + "step": 98980 + }, + { + "epoch": 0.4249847591080429, + "grad_norm": 0.21079060435295105, + "learning_rate": 5.774988573941689e-05, + "loss": 0.082888263463974, + "step": 98990 + }, + { + "epoch": 0.4250276911980629, + "grad_norm": 3.8923022747039795, + "learning_rate": 5.774557401929927e-05, + "loss": 0.21767971515655518, + "step": 99000 + }, + { + "epoch": 0.4250276911980629, + "eval_loss": 0.42039725184440613, + "eval_runtime": 27.092, + "eval_samples_per_second": 3.691, + "eval_steps_per_second": 3.691, + "step": 99000 + }, + { + "epoch": 0.4250706232880829, + "grad_norm": 0.049764856696128845, + "learning_rate": 5.7741262299181644e-05, + "loss": 0.1819413423538208, + "step": 99010 + }, + { + "epoch": 0.4251135553781029, + "grad_norm": 0.061445679515600204, + "learning_rate": 5.773695057906402e-05, + "loss": 0.20978860855102538, + "step": 99020 + }, + { + "epoch": 0.4251564874681229, + "grad_norm": 0.15067073702812195, + "learning_rate": 5.7732638858946385e-05, + "loss": 0.25956437587738035, + "step": 99030 + }, + { + "epoch": 0.42519941955814294, + "grad_norm": 0.017767034471035004, + "learning_rate": 5.772832713882876e-05, + "loss": 0.12931965589523314, + "step": 99040 + }, + { + "epoch": 0.42524235164816293, + "grad_norm": 58.04544448852539, + "learning_rate": 5.772401541871114e-05, + "loss": 0.1122699499130249, + "step": 99050 + }, + { + "epoch": 0.4252852837381829, + "grad_norm": 0.08649862557649612, + "learning_rate": 5.771970369859352e-05, + "loss": 0.07707861661911011, + "step": 99060 + }, + { + "epoch": 0.42532821582820296, + "grad_norm": 4.430203914642334, + "learning_rate": 5.7715391978475894e-05, + "loss": 0.1516265392303467, + "step": 99070 + }, + { + "epoch": 0.42537114791822295, + "grad_norm": 0.8073638677597046, + "learning_rate": 5.771108025835827e-05, + "loss": 0.2441352128982544, + "step": 99080 + }, + { + "epoch": 0.42541408000824293, + "grad_norm": 0.08090592175722122, + "learning_rate": 5.770676853824065e-05, + "loss": 0.10868796110153198, + "step": 99090 + }, + { + "epoch": 0.425457012098263, + "grad_norm": 5.911368370056152, + "learning_rate": 5.7702456818123027e-05, + "loss": 0.3562882423400879, + "step": 99100 + }, + { + "epoch": 0.42549994418828296, + "grad_norm": 0.2530055046081543, + "learning_rate": 5.76981450980054e-05, + "loss": 0.19297826290130615, + "step": 99110 + }, + { + "epoch": 0.425542876278303, + "grad_norm": 1.9001591205596924, + "learning_rate": 5.7693833377887774e-05, + "loss": 0.3040750503540039, + "step": 99120 + }, + { + "epoch": 0.425585808368323, + "grad_norm": 0.07195616513490677, + "learning_rate": 5.768952165777015e-05, + "loss": 0.25295019149780273, + "step": 99130 + }, + { + "epoch": 0.425628740458343, + "grad_norm": 2.257978916168213, + "learning_rate": 5.768520993765253e-05, + "loss": 0.45368361473083496, + "step": 99140 + }, + { + "epoch": 0.425671672548363, + "grad_norm": 0.25597015023231506, + "learning_rate": 5.7680898217534906e-05, + "loss": 0.1789316415786743, + "step": 99150 + }, + { + "epoch": 0.425714604638383, + "grad_norm": 0.1592808961868286, + "learning_rate": 5.7676586497417284e-05, + "loss": 0.15700260400772095, + "step": 99160 + }, + { + "epoch": 0.425757536728403, + "grad_norm": 0.3460257649421692, + "learning_rate": 5.767227477729966e-05, + "loss": 0.27511794567108155, + "step": 99170 + }, + { + "epoch": 0.42580046881842304, + "grad_norm": 0.01419067569077015, + "learning_rate": 5.766796305718204e-05, + "loss": 0.20267207622528077, + "step": 99180 + }, + { + "epoch": 0.425843400908443, + "grad_norm": 1.0129761695861816, + "learning_rate": 5.766365133706441e-05, + "loss": 0.2106555700302124, + "step": 99190 + }, + { + "epoch": 0.425886332998463, + "grad_norm": 1.3225549459457397, + "learning_rate": 5.7659339616946786e-05, + "loss": 0.08879272937774658, + "step": 99200 + }, + { + "epoch": 0.42592926508848306, + "grad_norm": 0.10503018647432327, + "learning_rate": 5.7655027896829164e-05, + "loss": 0.08622437715530396, + "step": 99210 + }, + { + "epoch": 0.42597219717850304, + "grad_norm": 2.210472345352173, + "learning_rate": 5.765071617671154e-05, + "loss": 0.2910381555557251, + "step": 99220 + }, + { + "epoch": 0.42601512926852303, + "grad_norm": 0.008246196433901787, + "learning_rate": 5.764640445659392e-05, + "loss": 0.3339352607727051, + "step": 99230 + }, + { + "epoch": 0.42605806135854307, + "grad_norm": 1.0068203210830688, + "learning_rate": 5.7642092736476296e-05, + "loss": 0.2700967311859131, + "step": 99240 + }, + { + "epoch": 0.42610099344856306, + "grad_norm": 0.06918929517269135, + "learning_rate": 5.763778101635867e-05, + "loss": 0.015539254248142242, + "step": 99250 + }, + { + "epoch": 0.42614392553858305, + "grad_norm": 16.384273529052734, + "learning_rate": 5.763346929624105e-05, + "loss": 0.2665158033370972, + "step": 99260 + }, + { + "epoch": 0.4261868576286031, + "grad_norm": 0.05230254307389259, + "learning_rate": 5.7629157576123414e-05, + "loss": 0.3403724908828735, + "step": 99270 + }, + { + "epoch": 0.4262297897186231, + "grad_norm": 0.1716545820236206, + "learning_rate": 5.762484585600579e-05, + "loss": 0.025423717498779298, + "step": 99280 + }, + { + "epoch": 0.42627272180864306, + "grad_norm": 0.3432859778404236, + "learning_rate": 5.762053413588817e-05, + "loss": 0.27904059886932375, + "step": 99290 + }, + { + "epoch": 0.4263156538986631, + "grad_norm": 0.0010938121704384685, + "learning_rate": 5.7616222415770546e-05, + "loss": 0.33778557777404783, + "step": 99300 + }, + { + "epoch": 0.4263585859886831, + "grad_norm": 2.464271306991577, + "learning_rate": 5.7611910695652924e-05, + "loss": 0.26821877956390383, + "step": 99310 + }, + { + "epoch": 0.42640151807870313, + "grad_norm": 0.015062485821545124, + "learning_rate": 5.76075989755353e-05, + "loss": 0.20908112525939943, + "step": 99320 + }, + { + "epoch": 0.4264444501687231, + "grad_norm": 19.407758712768555, + "learning_rate": 5.760328725541768e-05, + "loss": 0.18962712287902833, + "step": 99330 + }, + { + "epoch": 0.4264873822587431, + "grad_norm": 1.7343069314956665, + "learning_rate": 5.759897553530006e-05, + "loss": 0.43570146560668943, + "step": 99340 + }, + { + "epoch": 0.42653031434876315, + "grad_norm": 0.0023813594598323107, + "learning_rate": 5.759466381518244e-05, + "loss": 0.10327715873718261, + "step": 99350 + }, + { + "epoch": 0.42657324643878314, + "grad_norm": 2.541740894317627, + "learning_rate": 5.7590352095064804e-05, + "loss": 0.22181310653686523, + "step": 99360 + }, + { + "epoch": 0.4266161785288031, + "grad_norm": 1.6314506530761719, + "learning_rate": 5.758604037494718e-05, + "loss": 0.35480194091796874, + "step": 99370 + }, + { + "epoch": 0.42665911061882317, + "grad_norm": 1.3725117444992065, + "learning_rate": 5.758172865482956e-05, + "loss": 0.23750679492950438, + "step": 99380 + }, + { + "epoch": 0.42670204270884315, + "grad_norm": 0.8450286984443665, + "learning_rate": 5.7577416934711936e-05, + "loss": 0.25147688388824463, + "step": 99390 + }, + { + "epoch": 0.42674497479886314, + "grad_norm": 1.480108618736267, + "learning_rate": 5.757310521459431e-05, + "loss": 0.20741603374481202, + "step": 99400 + }, + { + "epoch": 0.4267879068888832, + "grad_norm": 2.16528582572937, + "learning_rate": 5.756879349447669e-05, + "loss": 0.21702051162719727, + "step": 99410 + }, + { + "epoch": 0.42683083897890317, + "grad_norm": 1.2046661376953125, + "learning_rate": 5.756448177435907e-05, + "loss": 0.22789404392242432, + "step": 99420 + }, + { + "epoch": 0.42687377106892316, + "grad_norm": 0.09745719283819199, + "learning_rate": 5.7560170054241445e-05, + "loss": 0.23347074985504152, + "step": 99430 + }, + { + "epoch": 0.4269167031589432, + "grad_norm": 6.254867076873779, + "learning_rate": 5.7555858334123816e-05, + "loss": 0.21007938385009767, + "step": 99440 + }, + { + "epoch": 0.4269596352489632, + "grad_norm": 1.8788691759109497, + "learning_rate": 5.755154661400619e-05, + "loss": 0.31277174949645997, + "step": 99450 + }, + { + "epoch": 0.4270025673389832, + "grad_norm": 0.12879228591918945, + "learning_rate": 5.754723489388857e-05, + "loss": 0.29340462684631347, + "step": 99460 + }, + { + "epoch": 0.4270454994290032, + "grad_norm": 0.2092210054397583, + "learning_rate": 5.754292317377095e-05, + "loss": 0.20556654930114746, + "step": 99470 + }, + { + "epoch": 0.4270884315190232, + "grad_norm": 0.1479301005601883, + "learning_rate": 5.7538611453653325e-05, + "loss": 0.18042335510253907, + "step": 99480 + }, + { + "epoch": 0.4271313636090432, + "grad_norm": 0.05552195385098457, + "learning_rate": 5.75342997335357e-05, + "loss": 0.09226502776145935, + "step": 99490 + }, + { + "epoch": 0.42717429569906323, + "grad_norm": 0.001192165189422667, + "learning_rate": 5.752998801341808e-05, + "loss": 0.26993460655212403, + "step": 99500 + }, + { + "epoch": 0.4272172277890832, + "grad_norm": 9.776094436645508, + "learning_rate": 5.752567629330046e-05, + "loss": 0.3855778694152832, + "step": 99510 + }, + { + "epoch": 0.4272601598791032, + "grad_norm": 0.0043556843884289265, + "learning_rate": 5.752136457318282e-05, + "loss": 0.275787353515625, + "step": 99520 + }, + { + "epoch": 0.42730309196912325, + "grad_norm": 0.9076972603797913, + "learning_rate": 5.75170528530652e-05, + "loss": 0.21448926925659179, + "step": 99530 + }, + { + "epoch": 0.42734602405914324, + "grad_norm": 0.17568263411521912, + "learning_rate": 5.7512741132947575e-05, + "loss": 0.13522560596466066, + "step": 99540 + }, + { + "epoch": 0.4273889561491633, + "grad_norm": 0.026702409610152245, + "learning_rate": 5.750842941282995e-05, + "loss": 0.08444958925247192, + "step": 99550 + }, + { + "epoch": 0.42743188823918327, + "grad_norm": 0.08688662946224213, + "learning_rate": 5.750411769271234e-05, + "loss": 0.058204162120819095, + "step": 99560 + }, + { + "epoch": 0.42747482032920325, + "grad_norm": 6.395595550537109, + "learning_rate": 5.7499805972594714e-05, + "loss": 0.28020992279052737, + "step": 99570 + }, + { + "epoch": 0.4275177524192233, + "grad_norm": 0.0059433989226818085, + "learning_rate": 5.749549425247709e-05, + "loss": 0.12280683517456055, + "step": 99580 + }, + { + "epoch": 0.4275606845092433, + "grad_norm": 0.017601149156689644, + "learning_rate": 5.749118253235947e-05, + "loss": 0.2879619836807251, + "step": 99590 + }, + { + "epoch": 0.42760361659926327, + "grad_norm": 0.01707869954407215, + "learning_rate": 5.748687081224183e-05, + "loss": 0.22087881565093995, + "step": 99600 + }, + { + "epoch": 0.4276465486892833, + "grad_norm": 3.0199363231658936, + "learning_rate": 5.748255909212421e-05, + "loss": 0.140790593624115, + "step": 99610 + }, + { + "epoch": 0.4276894807793033, + "grad_norm": 2.2495341300964355, + "learning_rate": 5.747824737200659e-05, + "loss": 0.1820667266845703, + "step": 99620 + }, + { + "epoch": 0.4277324128693233, + "grad_norm": 0.32152339816093445, + "learning_rate": 5.7473935651888965e-05, + "loss": 0.25116183757781985, + "step": 99630 + }, + { + "epoch": 0.42777534495934333, + "grad_norm": 0.04170903190970421, + "learning_rate": 5.746962393177134e-05, + "loss": 0.09960184693336487, + "step": 99640 + }, + { + "epoch": 0.4278182770493633, + "grad_norm": 2.308204412460327, + "learning_rate": 5.746531221165372e-05, + "loss": 0.10755871534347534, + "step": 99650 + }, + { + "epoch": 0.4278612091393833, + "grad_norm": 0.012859346345067024, + "learning_rate": 5.74610004915361e-05, + "loss": 0.08167902231216431, + "step": 99660 + }, + { + "epoch": 0.42790414122940335, + "grad_norm": 20.540184020996094, + "learning_rate": 5.7456688771418474e-05, + "loss": 0.32033615112304686, + "step": 99670 + }, + { + "epoch": 0.42794707331942333, + "grad_norm": 0.052581265568733215, + "learning_rate": 5.745237705130085e-05, + "loss": 0.22763388156890868, + "step": 99680 + }, + { + "epoch": 0.4279900054094433, + "grad_norm": 0.024297883734107018, + "learning_rate": 5.744806533118322e-05, + "loss": 0.13195170164108277, + "step": 99690 + }, + { + "epoch": 0.42803293749946336, + "grad_norm": 0.0014119200641289353, + "learning_rate": 5.74437536110656e-05, + "loss": 0.22947165966033936, + "step": 99700 + }, + { + "epoch": 0.42807586958948335, + "grad_norm": 1.5014564990997314, + "learning_rate": 5.743944189094798e-05, + "loss": 0.16373069286346437, + "step": 99710 + }, + { + "epoch": 0.42811880167950334, + "grad_norm": 0.0006927020149305463, + "learning_rate": 5.7435130170830354e-05, + "loss": 0.08164035081863404, + "step": 99720 + }, + { + "epoch": 0.4281617337695234, + "grad_norm": 0.10113275796175003, + "learning_rate": 5.743081845071273e-05, + "loss": 0.2783978939056396, + "step": 99730 + }, + { + "epoch": 0.42820466585954337, + "grad_norm": 1.616166353225708, + "learning_rate": 5.742650673059511e-05, + "loss": 0.27019352912902833, + "step": 99740 + }, + { + "epoch": 0.4282475979495634, + "grad_norm": 5.727255344390869, + "learning_rate": 5.7422195010477486e-05, + "loss": 0.2755590438842773, + "step": 99750 + }, + { + "epoch": 0.4282905300395834, + "grad_norm": 0.010672148317098618, + "learning_rate": 5.7417883290359863e-05, + "loss": 0.28162527084350586, + "step": 99760 + }, + { + "epoch": 0.4283334621296034, + "grad_norm": 10.59469223022461, + "learning_rate": 5.741357157024223e-05, + "loss": 0.23261218070983886, + "step": 99770 + }, + { + "epoch": 0.4283763942196234, + "grad_norm": 0.07830287516117096, + "learning_rate": 5.740925985012461e-05, + "loss": 0.10009453296661378, + "step": 99780 + }, + { + "epoch": 0.4284193263096434, + "grad_norm": 4.014584064483643, + "learning_rate": 5.740494813000699e-05, + "loss": 0.1561530590057373, + "step": 99790 + }, + { + "epoch": 0.4284622583996634, + "grad_norm": 1.0443463325500488, + "learning_rate": 5.7400636409889366e-05, + "loss": 0.28882346153259275, + "step": 99800 + }, + { + "epoch": 0.42850519048968344, + "grad_norm": 0.11487990617752075, + "learning_rate": 5.739632468977174e-05, + "loss": 0.3919788122177124, + "step": 99810 + }, + { + "epoch": 0.4285481225797034, + "grad_norm": 0.5756773352622986, + "learning_rate": 5.739201296965412e-05, + "loss": 0.19305258989334106, + "step": 99820 + }, + { + "epoch": 0.4285910546697234, + "grad_norm": 2.1908233165740967, + "learning_rate": 5.73877012495365e-05, + "loss": 0.1436487317085266, + "step": 99830 + }, + { + "epoch": 0.42863398675974346, + "grad_norm": 0.23491205275058746, + "learning_rate": 5.7383389529418875e-05, + "loss": 0.356546688079834, + "step": 99840 + }, + { + "epoch": 0.42867691884976344, + "grad_norm": 0.009992998093366623, + "learning_rate": 5.737907780930124e-05, + "loss": 0.08113847374916076, + "step": 99850 + }, + { + "epoch": 0.42871985093978343, + "grad_norm": 0.22079306840896606, + "learning_rate": 5.7374766089183616e-05, + "loss": 0.12962546348571777, + "step": 99860 + }, + { + "epoch": 0.4287627830298035, + "grad_norm": 0.006492846179753542, + "learning_rate": 5.7370454369065994e-05, + "loss": 0.4033665180206299, + "step": 99870 + }, + { + "epoch": 0.42880571511982346, + "grad_norm": 0.015185476280748844, + "learning_rate": 5.736614264894837e-05, + "loss": 0.1210568904876709, + "step": 99880 + }, + { + "epoch": 0.42884864720984345, + "grad_norm": 0.23486827313899994, + "learning_rate": 5.736183092883075e-05, + "loss": 0.2891173601150513, + "step": 99890 + }, + { + "epoch": 0.4288915792998635, + "grad_norm": 1.5405279397964478, + "learning_rate": 5.7357519208713126e-05, + "loss": 0.017226718366146088, + "step": 99900 + }, + { + "epoch": 0.4289345113898835, + "grad_norm": 0.23540741205215454, + "learning_rate": 5.73532074885955e-05, + "loss": 0.37728137969970704, + "step": 99910 + }, + { + "epoch": 0.42897744347990346, + "grad_norm": 1.2578777074813843, + "learning_rate": 5.734889576847788e-05, + "loss": 0.3932936191558838, + "step": 99920 + }, + { + "epoch": 0.4290203755699235, + "grad_norm": 0.02093084156513214, + "learning_rate": 5.734458404836025e-05, + "loss": 0.23207945823669435, + "step": 99930 + }, + { + "epoch": 0.4290633076599435, + "grad_norm": 1.5797219276428223, + "learning_rate": 5.734027232824263e-05, + "loss": 0.19565119743347167, + "step": 99940 + }, + { + "epoch": 0.4291062397499635, + "grad_norm": 0.023647375404834747, + "learning_rate": 5.7335960608125006e-05, + "loss": 0.2564146280288696, + "step": 99950 + }, + { + "epoch": 0.4291491718399835, + "grad_norm": 0.0429295189678669, + "learning_rate": 5.733164888800738e-05, + "loss": 0.27921204566955565, + "step": 99960 + }, + { + "epoch": 0.4291921039300035, + "grad_norm": 0.6207567453384399, + "learning_rate": 5.732733716788976e-05, + "loss": 0.15734999179840087, + "step": 99970 + }, + { + "epoch": 0.42923503602002355, + "grad_norm": 0.5371975898742676, + "learning_rate": 5.732302544777214e-05, + "loss": 0.21604642868041993, + "step": 99980 + }, + { + "epoch": 0.42927796811004354, + "grad_norm": 0.22142738103866577, + "learning_rate": 5.7318713727654515e-05, + "loss": 0.08812406063079833, + "step": 99990 + }, + { + "epoch": 0.4293209002000635, + "grad_norm": 3.723952054977417, + "learning_rate": 5.731440200753689e-05, + "loss": 0.40993666648864746, + "step": 100000 + }, + { + "epoch": 0.4293209002000635, + "eval_loss": 0.41385185718536377, + "eval_runtime": 27.272, + "eval_samples_per_second": 3.667, + "eval_steps_per_second": 3.667, + "step": 100000 + }, + { + "epoch": 0.42936383229008357, + "grad_norm": 1.978222131729126, + "learning_rate": 5.731009028741927e-05, + "loss": 0.41112370491027833, + "step": 100010 + }, + { + "epoch": 0.42940676438010356, + "grad_norm": 0.35666602849960327, + "learning_rate": 5.730577856730164e-05, + "loss": 0.19882118701934814, + "step": 100020 + }, + { + "epoch": 0.42944969647012354, + "grad_norm": 0.1557963639497757, + "learning_rate": 5.730146684718402e-05, + "loss": 0.15280017852783204, + "step": 100030 + }, + { + "epoch": 0.4294926285601436, + "grad_norm": 1.474320650100708, + "learning_rate": 5.7297155127066395e-05, + "loss": 0.32379262447357177, + "step": 100040 + }, + { + "epoch": 0.4295355606501636, + "grad_norm": 0.0014996053650975227, + "learning_rate": 5.729284340694877e-05, + "loss": 0.18067800998687744, + "step": 100050 + }, + { + "epoch": 0.42957849274018356, + "grad_norm": 1.359062671661377, + "learning_rate": 5.728853168683115e-05, + "loss": 0.2790334939956665, + "step": 100060 + }, + { + "epoch": 0.4296214248302036, + "grad_norm": 0.6800263524055481, + "learning_rate": 5.728421996671353e-05, + "loss": 0.28148727416992186, + "step": 100070 + }, + { + "epoch": 0.4296643569202236, + "grad_norm": 1.2884678840637207, + "learning_rate": 5.7279908246595904e-05, + "loss": 0.435483455657959, + "step": 100080 + }, + { + "epoch": 0.4297072890102436, + "grad_norm": 4.299257755279541, + "learning_rate": 5.727559652647828e-05, + "loss": 0.2659862995147705, + "step": 100090 + }, + { + "epoch": 0.4297502211002636, + "grad_norm": 0.6796963214874268, + "learning_rate": 5.7271284806360646e-05, + "loss": 0.017660659551620484, + "step": 100100 + }, + { + "epoch": 0.4297931531902836, + "grad_norm": 0.042083896696567535, + "learning_rate": 5.726697308624302e-05, + "loss": 0.24837009906768798, + "step": 100110 + }, + { + "epoch": 0.4298360852803036, + "grad_norm": 0.06338540464639664, + "learning_rate": 5.72626613661254e-05, + "loss": 0.20939581394195556, + "step": 100120 + }, + { + "epoch": 0.42987901737032364, + "grad_norm": 1.4580801725387573, + "learning_rate": 5.725834964600778e-05, + "loss": 0.295143723487854, + "step": 100130 + }, + { + "epoch": 0.4299219494603436, + "grad_norm": 2.420668601989746, + "learning_rate": 5.7254037925890155e-05, + "loss": 0.35013642311096194, + "step": 100140 + }, + { + "epoch": 0.4299648815503636, + "grad_norm": 0.0071824113838374615, + "learning_rate": 5.724972620577254e-05, + "loss": 0.18279858827590942, + "step": 100150 + }, + { + "epoch": 0.43000781364038365, + "grad_norm": 1.749664306640625, + "learning_rate": 5.7245414485654916e-05, + "loss": 0.27730727195739746, + "step": 100160 + }, + { + "epoch": 0.43005074573040364, + "grad_norm": 0.1663660705089569, + "learning_rate": 5.7241102765537294e-05, + "loss": 0.16716526746749877, + "step": 100170 + }, + { + "epoch": 0.4300936778204237, + "grad_norm": 0.03588825836777687, + "learning_rate": 5.723679104541966e-05, + "loss": 0.1703078031539917, + "step": 100180 + }, + { + "epoch": 0.43013660991044367, + "grad_norm": 0.03353295102715492, + "learning_rate": 5.7232479325302035e-05, + "loss": 0.10920073986053466, + "step": 100190 + }, + { + "epoch": 0.43017954200046365, + "grad_norm": 1.1376320123672485, + "learning_rate": 5.722816760518441e-05, + "loss": 0.15631842613220215, + "step": 100200 + }, + { + "epoch": 0.4302224740904837, + "grad_norm": 0.003353550098836422, + "learning_rate": 5.722385588506679e-05, + "loss": 0.3201023578643799, + "step": 100210 + }, + { + "epoch": 0.4302654061805037, + "grad_norm": 0.004149756394326687, + "learning_rate": 5.721954416494917e-05, + "loss": 0.21888132095336915, + "step": 100220 + }, + { + "epoch": 0.43030833827052367, + "grad_norm": 0.052119333297014236, + "learning_rate": 5.7215232444831544e-05, + "loss": 0.09456470608711243, + "step": 100230 + }, + { + "epoch": 0.4303512703605437, + "grad_norm": 26.883987426757812, + "learning_rate": 5.721092072471392e-05, + "loss": 0.3765913963317871, + "step": 100240 + }, + { + "epoch": 0.4303942024505637, + "grad_norm": 0.2676823139190674, + "learning_rate": 5.72066090045963e-05, + "loss": 0.2983670473098755, + "step": 100250 + }, + { + "epoch": 0.4304371345405837, + "grad_norm": 0.09823568910360336, + "learning_rate": 5.720229728447867e-05, + "loss": 0.2189718246459961, + "step": 100260 + }, + { + "epoch": 0.43048006663060373, + "grad_norm": 0.07216641306877136, + "learning_rate": 5.719798556436105e-05, + "loss": 0.22153894901275634, + "step": 100270 + }, + { + "epoch": 0.4305229987206237, + "grad_norm": 0.49933910369873047, + "learning_rate": 5.7193673844243424e-05, + "loss": 0.26543402671813965, + "step": 100280 + }, + { + "epoch": 0.4305659308106437, + "grad_norm": 2.6107425689697266, + "learning_rate": 5.71893621241258e-05, + "loss": 0.2502300500869751, + "step": 100290 + }, + { + "epoch": 0.43060886290066375, + "grad_norm": 0.032215360552072525, + "learning_rate": 5.718505040400818e-05, + "loss": 0.2651889085769653, + "step": 100300 + }, + { + "epoch": 0.43065179499068373, + "grad_norm": 0.2990207374095917, + "learning_rate": 5.7180738683890556e-05, + "loss": 0.1141858458518982, + "step": 100310 + }, + { + "epoch": 0.4306947270807037, + "grad_norm": 1.3636616468429565, + "learning_rate": 5.7176426963772934e-05, + "loss": 0.12704546451568605, + "step": 100320 + }, + { + "epoch": 0.43073765917072376, + "grad_norm": 0.05208112671971321, + "learning_rate": 5.717211524365531e-05, + "loss": 0.2751093626022339, + "step": 100330 + }, + { + "epoch": 0.43078059126074375, + "grad_norm": 0.0030079390853643417, + "learning_rate": 5.7167803523537675e-05, + "loss": 0.24836654663085939, + "step": 100340 + }, + { + "epoch": 0.43082352335076374, + "grad_norm": 0.13114942610263824, + "learning_rate": 5.716349180342005e-05, + "loss": 0.12937086820602417, + "step": 100350 + }, + { + "epoch": 0.4308664554407838, + "grad_norm": 0.06914433091878891, + "learning_rate": 5.715918008330243e-05, + "loss": 0.33217825889587405, + "step": 100360 + }, + { + "epoch": 0.43090938753080377, + "grad_norm": 0.8148689866065979, + "learning_rate": 5.7154868363184814e-05, + "loss": 0.13250348567962647, + "step": 100370 + }, + { + "epoch": 0.43095231962082375, + "grad_norm": 8.22535514831543, + "learning_rate": 5.715055664306719e-05, + "loss": 0.48774261474609376, + "step": 100380 + }, + { + "epoch": 0.4309952517108438, + "grad_norm": 0.14958524703979492, + "learning_rate": 5.714624492294957e-05, + "loss": 0.39161689281463624, + "step": 100390 + }, + { + "epoch": 0.4310381838008638, + "grad_norm": 0.1226431354880333, + "learning_rate": 5.7141933202831946e-05, + "loss": 0.25230252742767334, + "step": 100400 + }, + { + "epoch": 0.4310811158908838, + "grad_norm": 0.5269021987915039, + "learning_rate": 5.713762148271432e-05, + "loss": 0.19572103023529053, + "step": 100410 + }, + { + "epoch": 0.4311240479809038, + "grad_norm": 0.0011158520355820656, + "learning_rate": 5.71333097625967e-05, + "loss": 0.10176982879638671, + "step": 100420 + }, + { + "epoch": 0.4311669800709238, + "grad_norm": 0.028794605284929276, + "learning_rate": 5.7128998042479064e-05, + "loss": 0.30993683338165284, + "step": 100430 + }, + { + "epoch": 0.43120991216094384, + "grad_norm": 0.06879492849111557, + "learning_rate": 5.712468632236144e-05, + "loss": 0.2143555164337158, + "step": 100440 + }, + { + "epoch": 0.43125284425096383, + "grad_norm": 0.020163610577583313, + "learning_rate": 5.712037460224382e-05, + "loss": 0.26235687732696533, + "step": 100450 + }, + { + "epoch": 0.4312957763409838, + "grad_norm": 0.6938256025314331, + "learning_rate": 5.7116062882126196e-05, + "loss": 0.24439234733581544, + "step": 100460 + }, + { + "epoch": 0.43133870843100386, + "grad_norm": 0.029514238238334656, + "learning_rate": 5.7111751162008573e-05, + "loss": 0.14914549589157106, + "step": 100470 + }, + { + "epoch": 0.43138164052102385, + "grad_norm": 0.04761035367846489, + "learning_rate": 5.710743944189095e-05, + "loss": 0.30039823055267334, + "step": 100480 + }, + { + "epoch": 0.43142457261104383, + "grad_norm": 10.50465202331543, + "learning_rate": 5.710312772177333e-05, + "loss": 0.1130056619644165, + "step": 100490 + }, + { + "epoch": 0.4314675047010639, + "grad_norm": 0.05910739675164223, + "learning_rate": 5.7098816001655705e-05, + "loss": 0.13838412761688232, + "step": 100500 + }, + { + "epoch": 0.43151043679108386, + "grad_norm": 0.022242436185479164, + "learning_rate": 5.7094504281538076e-05, + "loss": 0.20934643745422363, + "step": 100510 + }, + { + "epoch": 0.43155336888110385, + "grad_norm": 1.522462248802185, + "learning_rate": 5.709019256142045e-05, + "loss": 0.30031299591064453, + "step": 100520 + }, + { + "epoch": 0.4315963009711239, + "grad_norm": 1.5211167335510254, + "learning_rate": 5.708588084130283e-05, + "loss": 0.22337019443511963, + "step": 100530 + }, + { + "epoch": 0.4316392330611439, + "grad_norm": 2.0634267330169678, + "learning_rate": 5.708156912118521e-05, + "loss": 0.19905364513397217, + "step": 100540 + }, + { + "epoch": 0.43168216515116387, + "grad_norm": 0.9744377136230469, + "learning_rate": 5.7077257401067585e-05, + "loss": 0.2310699224472046, + "step": 100550 + }, + { + "epoch": 0.4317250972411839, + "grad_norm": 0.00894715916365385, + "learning_rate": 5.707294568094996e-05, + "loss": 0.3550968408584595, + "step": 100560 + }, + { + "epoch": 0.4317680293312039, + "grad_norm": 0.10767655074596405, + "learning_rate": 5.706863396083234e-05, + "loss": 0.18549597263336182, + "step": 100570 + }, + { + "epoch": 0.4318109614212239, + "grad_norm": 0.009473497979342937, + "learning_rate": 5.706432224071472e-05, + "loss": 0.18380953073501588, + "step": 100580 + }, + { + "epoch": 0.4318538935112439, + "grad_norm": 0.0027962077874690294, + "learning_rate": 5.706001052059709e-05, + "loss": 0.2939144134521484, + "step": 100590 + }, + { + "epoch": 0.4318968256012639, + "grad_norm": 1.0059584379196167, + "learning_rate": 5.7055698800479465e-05, + "loss": 0.22364864349365235, + "step": 100600 + }, + { + "epoch": 0.43193975769128395, + "grad_norm": 0.13138823211193085, + "learning_rate": 5.705138708036184e-05, + "loss": 0.4219820022583008, + "step": 100610 + }, + { + "epoch": 0.43198268978130394, + "grad_norm": 3.08107590675354, + "learning_rate": 5.704707536024422e-05, + "loss": 0.24338581562042236, + "step": 100620 + }, + { + "epoch": 0.43202562187132393, + "grad_norm": 0.029116906225681305, + "learning_rate": 5.70427636401266e-05, + "loss": 0.12408316135406494, + "step": 100630 + }, + { + "epoch": 0.43206855396134397, + "grad_norm": 0.01473816204816103, + "learning_rate": 5.7038451920008975e-05, + "loss": 0.23553252220153809, + "step": 100640 + }, + { + "epoch": 0.43211148605136396, + "grad_norm": 0.46792203187942505, + "learning_rate": 5.703414019989135e-05, + "loss": 0.07469576001167297, + "step": 100650 + }, + { + "epoch": 0.43215441814138394, + "grad_norm": 0.1712280511856079, + "learning_rate": 5.702982847977373e-05, + "loss": 0.21774008274078369, + "step": 100660 + }, + { + "epoch": 0.432197350231404, + "grad_norm": 0.0017177191330119967, + "learning_rate": 5.702551675965609e-05, + "loss": 0.18529870510101318, + "step": 100670 + }, + { + "epoch": 0.432240282321424, + "grad_norm": 0.0058692400343716145, + "learning_rate": 5.702120503953847e-05, + "loss": 0.10345749855041504, + "step": 100680 + }, + { + "epoch": 0.43228321441144396, + "grad_norm": 1.4120231866836548, + "learning_rate": 5.701689331942085e-05, + "loss": 0.13298569917678832, + "step": 100690 + }, + { + "epoch": 0.432326146501464, + "grad_norm": 0.7551203966140747, + "learning_rate": 5.7012581599303225e-05, + "loss": 0.09198079109191895, + "step": 100700 + }, + { + "epoch": 0.432369078591484, + "grad_norm": 0.03402279317378998, + "learning_rate": 5.70082698791856e-05, + "loss": 0.49449887275695803, + "step": 100710 + }, + { + "epoch": 0.432412010681504, + "grad_norm": 0.002699479693546891, + "learning_rate": 5.700395815906798e-05, + "loss": 0.28117620944976807, + "step": 100720 + }, + { + "epoch": 0.432454942771524, + "grad_norm": 0.04458874464035034, + "learning_rate": 5.699964643895036e-05, + "loss": 0.20188190937042236, + "step": 100730 + }, + { + "epoch": 0.432497874861544, + "grad_norm": 0.013446901924908161, + "learning_rate": 5.699533471883274e-05, + "loss": 0.07479428052902222, + "step": 100740 + }, + { + "epoch": 0.432540806951564, + "grad_norm": 3.244983434677124, + "learning_rate": 5.699102299871512e-05, + "loss": 0.1925884008407593, + "step": 100750 + }, + { + "epoch": 0.43258373904158404, + "grad_norm": 0.005489411298185587, + "learning_rate": 5.698671127859748e-05, + "loss": 0.20378761291503905, + "step": 100760 + }, + { + "epoch": 0.432626671131604, + "grad_norm": 0.06427086144685745, + "learning_rate": 5.698239955847986e-05, + "loss": 0.10766814947128296, + "step": 100770 + }, + { + "epoch": 0.432669603221624, + "grad_norm": 0.009699106216430664, + "learning_rate": 5.697808783836224e-05, + "loss": 0.06433448791503907, + "step": 100780 + }, + { + "epoch": 0.43271253531164405, + "grad_norm": 0.016040675342082977, + "learning_rate": 5.6973776118244615e-05, + "loss": 0.09252756834030151, + "step": 100790 + }, + { + "epoch": 0.43275546740166404, + "grad_norm": 6.488517761230469, + "learning_rate": 5.696946439812699e-05, + "loss": 0.3273911952972412, + "step": 100800 + }, + { + "epoch": 0.432798399491684, + "grad_norm": 32.52063751220703, + "learning_rate": 5.696515267800937e-05, + "loss": 0.15922000408172607, + "step": 100810 + }, + { + "epoch": 0.43284133158170407, + "grad_norm": 1.7135497331619263, + "learning_rate": 5.6960840957891747e-05, + "loss": 0.2881903171539307, + "step": 100820 + }, + { + "epoch": 0.43288426367172406, + "grad_norm": 8.541507720947266, + "learning_rate": 5.6956529237774124e-05, + "loss": 0.2778735399246216, + "step": 100830 + }, + { + "epoch": 0.4329271957617441, + "grad_norm": 0.0020988276228308678, + "learning_rate": 5.6952217517656494e-05, + "loss": 0.15711830854415892, + "step": 100840 + }, + { + "epoch": 0.4329701278517641, + "grad_norm": 0.117733895778656, + "learning_rate": 5.694790579753887e-05, + "loss": 0.23034195899963378, + "step": 100850 + }, + { + "epoch": 0.4330130599417841, + "grad_norm": 1.125941276550293, + "learning_rate": 5.694359407742125e-05, + "loss": 0.31992788314819337, + "step": 100860 + }, + { + "epoch": 0.4330559920318041, + "grad_norm": 0.4024018943309784, + "learning_rate": 5.6939282357303626e-05, + "loss": 0.11440498828887939, + "step": 100870 + }, + { + "epoch": 0.4330989241218241, + "grad_norm": 0.6626192927360535, + "learning_rate": 5.6934970637186004e-05, + "loss": 0.009410639107227326, + "step": 100880 + }, + { + "epoch": 0.4331418562118441, + "grad_norm": 0.8921322822570801, + "learning_rate": 5.693065891706838e-05, + "loss": 0.18852705955505372, + "step": 100890 + }, + { + "epoch": 0.43318478830186413, + "grad_norm": 2.921628952026367, + "learning_rate": 5.692634719695076e-05, + "loss": 0.1942846179008484, + "step": 100900 + }, + { + "epoch": 0.4332277203918841, + "grad_norm": 0.0008382099331356585, + "learning_rate": 5.6922035476833136e-05, + "loss": 0.26661832332611085, + "step": 100910 + }, + { + "epoch": 0.4332706524819041, + "grad_norm": 1.34660005569458, + "learning_rate": 5.69177237567155e-05, + "loss": 0.3429716110229492, + "step": 100920 + }, + { + "epoch": 0.43331358457192415, + "grad_norm": 0.41357937455177307, + "learning_rate": 5.691341203659788e-05, + "loss": 0.16713021993637084, + "step": 100930 + }, + { + "epoch": 0.43335651666194414, + "grad_norm": 0.03519873693585396, + "learning_rate": 5.6909100316480254e-05, + "loss": 0.10253534317016602, + "step": 100940 + }, + { + "epoch": 0.4333994487519641, + "grad_norm": 0.01499355398118496, + "learning_rate": 5.690478859636263e-05, + "loss": 0.21979115009307862, + "step": 100950 + }, + { + "epoch": 0.43344238084198417, + "grad_norm": 0.0017051781760528684, + "learning_rate": 5.6900476876245016e-05, + "loss": 0.1390715718269348, + "step": 100960 + }, + { + "epoch": 0.43348531293200415, + "grad_norm": 0.07887522876262665, + "learning_rate": 5.689616515612739e-05, + "loss": 0.36191625595092775, + "step": 100970 + }, + { + "epoch": 0.43352824502202414, + "grad_norm": 2.325713634490967, + "learning_rate": 5.689185343600977e-05, + "loss": 0.06256782412528991, + "step": 100980 + }, + { + "epoch": 0.4335711771120442, + "grad_norm": 0.08764491975307465, + "learning_rate": 5.688754171589215e-05, + "loss": 0.2967262029647827, + "step": 100990 + }, + { + "epoch": 0.43361410920206417, + "grad_norm": 1.0276341438293457, + "learning_rate": 5.688322999577451e-05, + "loss": 0.5041379928588867, + "step": 101000 + }, + { + "epoch": 0.43361410920206417, + "eval_loss": 0.4284164309501648, + "eval_runtime": 27.1219, + "eval_samples_per_second": 3.687, + "eval_steps_per_second": 3.687, + "step": 101000 + }, + { + "epoch": 0.43365704129208416, + "grad_norm": 0.161605104804039, + "learning_rate": 5.687891827565689e-05, + "loss": 0.20394842624664306, + "step": 101010 + }, + { + "epoch": 0.4336999733821042, + "grad_norm": 23.645925521850586, + "learning_rate": 5.6874606555539266e-05, + "loss": 0.28007776737213136, + "step": 101020 + }, + { + "epoch": 0.4337429054721242, + "grad_norm": 0.01618562825024128, + "learning_rate": 5.6870294835421644e-05, + "loss": 0.3052077293395996, + "step": 101030 + }, + { + "epoch": 0.4337858375621442, + "grad_norm": 0.5638077259063721, + "learning_rate": 5.686598311530402e-05, + "loss": 0.3267589807510376, + "step": 101040 + }, + { + "epoch": 0.4338287696521642, + "grad_norm": 3.6884796619415283, + "learning_rate": 5.68616713951864e-05, + "loss": 0.3442475080490112, + "step": 101050 + }, + { + "epoch": 0.4338717017421842, + "grad_norm": 0.0034317022655159235, + "learning_rate": 5.6857359675068776e-05, + "loss": 0.2257145643234253, + "step": 101060 + }, + { + "epoch": 0.43391463383220424, + "grad_norm": 0.02168785035610199, + "learning_rate": 5.685304795495115e-05, + "loss": 0.10080244541168212, + "step": 101070 + }, + { + "epoch": 0.43395756592222423, + "grad_norm": 0.024649212136864662, + "learning_rate": 5.6848736234833524e-05, + "loss": 0.15533299446105958, + "step": 101080 + }, + { + "epoch": 0.4340004980122442, + "grad_norm": 0.5593084096908569, + "learning_rate": 5.68444245147159e-05, + "loss": 0.20375723838806153, + "step": 101090 + }, + { + "epoch": 0.43404343010226426, + "grad_norm": 0.07193995267152786, + "learning_rate": 5.684011279459828e-05, + "loss": 0.17142632007598876, + "step": 101100 + }, + { + "epoch": 0.43408636219228425, + "grad_norm": 0.004242885857820511, + "learning_rate": 5.6835801074480656e-05, + "loss": 0.3353287935256958, + "step": 101110 + }, + { + "epoch": 0.43412929428230423, + "grad_norm": 0.9329225420951843, + "learning_rate": 5.683148935436303e-05, + "loss": 0.20502972602844238, + "step": 101120 + }, + { + "epoch": 0.4341722263723243, + "grad_norm": 2.0356602668762207, + "learning_rate": 5.682717763424541e-05, + "loss": 0.16802514791488649, + "step": 101130 + }, + { + "epoch": 0.43421515846234426, + "grad_norm": 4.4602837562561035, + "learning_rate": 5.682286591412779e-05, + "loss": 0.35306363105773925, + "step": 101140 + }, + { + "epoch": 0.43425809055236425, + "grad_norm": 0.0073759243823587894, + "learning_rate": 5.6818554194010165e-05, + "loss": 0.11986923217773438, + "step": 101150 + }, + { + "epoch": 0.4343010226423843, + "grad_norm": 0.014532789587974548, + "learning_rate": 5.681424247389254e-05, + "loss": 0.21006014347076415, + "step": 101160 + }, + { + "epoch": 0.4343439547324043, + "grad_norm": 1.4600069522857666, + "learning_rate": 5.6809930753774906e-05, + "loss": 0.3080946445465088, + "step": 101170 + }, + { + "epoch": 0.43438688682242427, + "grad_norm": 3.517202854156494, + "learning_rate": 5.680561903365729e-05, + "loss": 0.23347063064575196, + "step": 101180 + }, + { + "epoch": 0.4344298189124443, + "grad_norm": 0.09192164242267609, + "learning_rate": 5.680130731353967e-05, + "loss": 0.08297334313392639, + "step": 101190 + }, + { + "epoch": 0.4344727510024643, + "grad_norm": 14.890531539916992, + "learning_rate": 5.6796995593422045e-05, + "loss": 0.3029682397842407, + "step": 101200 + }, + { + "epoch": 0.4345156830924843, + "grad_norm": 0.08431587368249893, + "learning_rate": 5.679268387330442e-05, + "loss": 0.05657966732978821, + "step": 101210 + }, + { + "epoch": 0.4345586151825043, + "grad_norm": 0.004789439029991627, + "learning_rate": 5.67883721531868e-05, + "loss": 0.06281794905662537, + "step": 101220 + }, + { + "epoch": 0.4346015472725243, + "grad_norm": 0.007313930429518223, + "learning_rate": 5.678406043306918e-05, + "loss": 0.23483545780181886, + "step": 101230 + }, + { + "epoch": 0.4346444793625443, + "grad_norm": 0.006183262914419174, + "learning_rate": 5.6779748712951554e-05, + "loss": 0.320169997215271, + "step": 101240 + }, + { + "epoch": 0.43468741145256434, + "grad_norm": 2.053785800933838, + "learning_rate": 5.677543699283392e-05, + "loss": 0.4159492015838623, + "step": 101250 + }, + { + "epoch": 0.43473034354258433, + "grad_norm": 0.33240818977355957, + "learning_rate": 5.6771125272716295e-05, + "loss": 0.10303497314453125, + "step": 101260 + }, + { + "epoch": 0.4347732756326044, + "grad_norm": 3.343055486679077, + "learning_rate": 5.676681355259867e-05, + "loss": 0.28133907318115237, + "step": 101270 + }, + { + "epoch": 0.43481620772262436, + "grad_norm": 0.10711721330881119, + "learning_rate": 5.676250183248105e-05, + "loss": 0.2311713457107544, + "step": 101280 + }, + { + "epoch": 0.43485913981264435, + "grad_norm": 0.08864771574735641, + "learning_rate": 5.675819011236343e-05, + "loss": 0.3245322465896606, + "step": 101290 + }, + { + "epoch": 0.4349020719026644, + "grad_norm": 0.005172067321836948, + "learning_rate": 5.6753878392245805e-05, + "loss": 0.1757732152938843, + "step": 101300 + }, + { + "epoch": 0.4349450039926844, + "grad_norm": 0.08573029935359955, + "learning_rate": 5.674956667212818e-05, + "loss": 0.07301101684570313, + "step": 101310 + }, + { + "epoch": 0.43498793608270436, + "grad_norm": 0.5399850606918335, + "learning_rate": 5.674525495201056e-05, + "loss": 0.1713407278060913, + "step": 101320 + }, + { + "epoch": 0.4350308681727244, + "grad_norm": 0.3291313052177429, + "learning_rate": 5.674094323189293e-05, + "loss": 0.09000024199485779, + "step": 101330 + }, + { + "epoch": 0.4350738002627444, + "grad_norm": 0.020964227616786957, + "learning_rate": 5.673663151177531e-05, + "loss": 0.07782799601554871, + "step": 101340 + }, + { + "epoch": 0.4351167323527644, + "grad_norm": 0.020042628049850464, + "learning_rate": 5.6732319791657685e-05, + "loss": 0.48969502449035646, + "step": 101350 + }, + { + "epoch": 0.4351596644427844, + "grad_norm": 0.0018864471931010485, + "learning_rate": 5.672800807154006e-05, + "loss": 0.11262784004211426, + "step": 101360 + }, + { + "epoch": 0.4352025965328044, + "grad_norm": 0.0246067363768816, + "learning_rate": 5.672369635142244e-05, + "loss": 0.2091688871383667, + "step": 101370 + }, + { + "epoch": 0.4352455286228244, + "grad_norm": 4.4579548835754395, + "learning_rate": 5.671938463130482e-05, + "loss": 0.06489126682281494, + "step": 101380 + }, + { + "epoch": 0.43528846071284444, + "grad_norm": 1.8331623077392578, + "learning_rate": 5.6715072911187194e-05, + "loss": 0.22519593238830565, + "step": 101390 + }, + { + "epoch": 0.4353313928028644, + "grad_norm": 0.13178208470344543, + "learning_rate": 5.671076119106957e-05, + "loss": 0.08844862580299377, + "step": 101400 + }, + { + "epoch": 0.4353743248928844, + "grad_norm": 5.8470306396484375, + "learning_rate": 5.670644947095194e-05, + "loss": 0.283965539932251, + "step": 101410 + }, + { + "epoch": 0.43541725698290445, + "grad_norm": 1.977359414100647, + "learning_rate": 5.670213775083432e-05, + "loss": 0.31743721961975097, + "step": 101420 + }, + { + "epoch": 0.43546018907292444, + "grad_norm": 0.006298946216702461, + "learning_rate": 5.66978260307167e-05, + "loss": 0.2732826232910156, + "step": 101430 + }, + { + "epoch": 0.43550312116294443, + "grad_norm": 0.015092175453901291, + "learning_rate": 5.6693514310599074e-05, + "loss": 0.1754346489906311, + "step": 101440 + }, + { + "epoch": 0.43554605325296447, + "grad_norm": 0.006818375550210476, + "learning_rate": 5.668920259048145e-05, + "loss": 0.06602421402931213, + "step": 101450 + }, + { + "epoch": 0.43558898534298446, + "grad_norm": 0.10856548696756363, + "learning_rate": 5.668489087036383e-05, + "loss": 0.41567420959472656, + "step": 101460 + }, + { + "epoch": 0.4356319174330045, + "grad_norm": 0.0782633051276207, + "learning_rate": 5.6680579150246206e-05, + "loss": 0.1458239197731018, + "step": 101470 + }, + { + "epoch": 0.4356748495230245, + "grad_norm": 0.009810393676161766, + "learning_rate": 5.6676267430128583e-05, + "loss": 0.30247931480407714, + "step": 101480 + }, + { + "epoch": 0.4357177816130445, + "grad_norm": 1.1154206991195679, + "learning_rate": 5.667195571001096e-05, + "loss": 0.1663208246231079, + "step": 101490 + }, + { + "epoch": 0.4357607137030645, + "grad_norm": 1.5516237020492554, + "learning_rate": 5.6667643989893325e-05, + "loss": 0.41340136528015137, + "step": 101500 + }, + { + "epoch": 0.4358036457930845, + "grad_norm": 0.004893908742815256, + "learning_rate": 5.66633322697757e-05, + "loss": 0.30065782070159913, + "step": 101510 + }, + { + "epoch": 0.4358465778831045, + "grad_norm": 0.7853567004203796, + "learning_rate": 5.665902054965808e-05, + "loss": 0.24362916946411134, + "step": 101520 + }, + { + "epoch": 0.43588950997312453, + "grad_norm": 1.3496156930923462, + "learning_rate": 5.6654708829540457e-05, + "loss": 0.17322909832000732, + "step": 101530 + }, + { + "epoch": 0.4359324420631445, + "grad_norm": 0.8741046786308289, + "learning_rate": 5.6650397109422834e-05, + "loss": 0.08911564946174622, + "step": 101540 + }, + { + "epoch": 0.4359753741531645, + "grad_norm": 0.005866996478289366, + "learning_rate": 5.664608538930521e-05, + "loss": 0.11345422267913818, + "step": 101550 + }, + { + "epoch": 0.43601830624318455, + "grad_norm": 0.0030817079823464155, + "learning_rate": 5.6641773669187595e-05, + "loss": 0.08362066745758057, + "step": 101560 + }, + { + "epoch": 0.43606123833320454, + "grad_norm": 0.022512266412377357, + "learning_rate": 5.663746194906997e-05, + "loss": 0.2745735883712769, + "step": 101570 + }, + { + "epoch": 0.4361041704232245, + "grad_norm": 1.2275443077087402, + "learning_rate": 5.6633150228952337e-05, + "loss": 0.4065723419189453, + "step": 101580 + }, + { + "epoch": 0.43614710251324457, + "grad_norm": 0.006209479179233313, + "learning_rate": 5.6628838508834714e-05, + "loss": 0.4551173210144043, + "step": 101590 + }, + { + "epoch": 0.43619003460326455, + "grad_norm": 1.0085053443908691, + "learning_rate": 5.662452678871709e-05, + "loss": 0.17605886459350586, + "step": 101600 + }, + { + "epoch": 0.43623296669328454, + "grad_norm": 3.473280906677246, + "learning_rate": 5.662021506859947e-05, + "loss": 0.21241440773010253, + "step": 101610 + }, + { + "epoch": 0.4362758987833046, + "grad_norm": 1.7464669942855835, + "learning_rate": 5.6615903348481846e-05, + "loss": 0.1767161011695862, + "step": 101620 + }, + { + "epoch": 0.43631883087332457, + "grad_norm": 0.19424454867839813, + "learning_rate": 5.661159162836422e-05, + "loss": 0.2732317686080933, + "step": 101630 + }, + { + "epoch": 0.43636176296334456, + "grad_norm": 0.05577385425567627, + "learning_rate": 5.66072799082466e-05, + "loss": 0.2823940753936768, + "step": 101640 + }, + { + "epoch": 0.4364046950533646, + "grad_norm": 3.1323673725128174, + "learning_rate": 5.660296818812898e-05, + "loss": 0.18861196041107178, + "step": 101650 + }, + { + "epoch": 0.4364476271433846, + "grad_norm": 0.3599117398262024, + "learning_rate": 5.659865646801135e-05, + "loss": 0.1460519552230835, + "step": 101660 + }, + { + "epoch": 0.4364905592334046, + "grad_norm": 1.0804738998413086, + "learning_rate": 5.6594344747893726e-05, + "loss": 0.33149147033691406, + "step": 101670 + }, + { + "epoch": 0.4365334913234246, + "grad_norm": 0.6159604787826538, + "learning_rate": 5.65900330277761e-05, + "loss": 0.15043935775756836, + "step": 101680 + }, + { + "epoch": 0.4365764234134446, + "grad_norm": 0.06314518302679062, + "learning_rate": 5.658572130765848e-05, + "loss": 0.1825084686279297, + "step": 101690 + }, + { + "epoch": 0.43661935550346465, + "grad_norm": 0.017460836097598076, + "learning_rate": 5.658140958754086e-05, + "loss": 0.23410093784332275, + "step": 101700 + }, + { + "epoch": 0.43666228759348463, + "grad_norm": 0.8213013410568237, + "learning_rate": 5.6577097867423235e-05, + "loss": 0.10457472801208496, + "step": 101710 + }, + { + "epoch": 0.4367052196835046, + "grad_norm": 3.3704280853271484, + "learning_rate": 5.657278614730561e-05, + "loss": 0.2720768690109253, + "step": 101720 + }, + { + "epoch": 0.43674815177352466, + "grad_norm": 1.2922886610031128, + "learning_rate": 5.656847442718799e-05, + "loss": 0.27418065071105957, + "step": 101730 + }, + { + "epoch": 0.43679108386354465, + "grad_norm": 0.06490170955657959, + "learning_rate": 5.6564162707070354e-05, + "loss": 0.17207794189453124, + "step": 101740 + }, + { + "epoch": 0.43683401595356464, + "grad_norm": 0.8208549618721008, + "learning_rate": 5.655985098695273e-05, + "loss": 0.2595613718032837, + "step": 101750 + }, + { + "epoch": 0.4368769480435847, + "grad_norm": 0.004950092639774084, + "learning_rate": 5.655553926683511e-05, + "loss": 0.1528960108757019, + "step": 101760 + }, + { + "epoch": 0.43691988013360467, + "grad_norm": 0.660977840423584, + "learning_rate": 5.6551227546717486e-05, + "loss": 0.44907441139221194, + "step": 101770 + }, + { + "epoch": 0.43696281222362465, + "grad_norm": 6.029102802276611, + "learning_rate": 5.654691582659987e-05, + "loss": 0.12078993320465088, + "step": 101780 + }, + { + "epoch": 0.4370057443136447, + "grad_norm": 0.2080804705619812, + "learning_rate": 5.654260410648225e-05, + "loss": 0.15819777250289918, + "step": 101790 + }, + { + "epoch": 0.4370486764036647, + "grad_norm": 0.1170460432767868, + "learning_rate": 5.6538292386364625e-05, + "loss": 0.21379849910736085, + "step": 101800 + }, + { + "epoch": 0.43709160849368467, + "grad_norm": 0.16317272186279297, + "learning_rate": 5.6533980666247e-05, + "loss": 0.1071736216545105, + "step": 101810 + }, + { + "epoch": 0.4371345405837047, + "grad_norm": 0.06004194915294647, + "learning_rate": 5.6529668946129366e-05, + "loss": 0.32128081321716306, + "step": 101820 + }, + { + "epoch": 0.4371774726737247, + "grad_norm": 0.3741120398044586, + "learning_rate": 5.652535722601174e-05, + "loss": 0.25509469509124755, + "step": 101830 + }, + { + "epoch": 0.4372204047637447, + "grad_norm": 0.04972869157791138, + "learning_rate": 5.652104550589412e-05, + "loss": 0.24276161193847656, + "step": 101840 + }, + { + "epoch": 0.43726333685376473, + "grad_norm": 0.10509629547595978, + "learning_rate": 5.65167337857765e-05, + "loss": 0.27646722793579104, + "step": 101850 + }, + { + "epoch": 0.4373062689437847, + "grad_norm": 0.03949575871229172, + "learning_rate": 5.6512422065658875e-05, + "loss": 0.2635036945343018, + "step": 101860 + }, + { + "epoch": 0.4373492010338047, + "grad_norm": 0.13845627009868622, + "learning_rate": 5.650811034554125e-05, + "loss": 0.25788857936859133, + "step": 101870 + }, + { + "epoch": 0.43739213312382474, + "grad_norm": 0.3628508448600769, + "learning_rate": 5.650379862542363e-05, + "loss": 0.2939404726028442, + "step": 101880 + }, + { + "epoch": 0.43743506521384473, + "grad_norm": 3.7908737659454346, + "learning_rate": 5.649948690530601e-05, + "loss": 0.3650730848312378, + "step": 101890 + }, + { + "epoch": 0.4374779973038648, + "grad_norm": 0.15260189771652222, + "learning_rate": 5.6495175185188384e-05, + "loss": 0.2733223676681519, + "step": 101900 + }, + { + "epoch": 0.43752092939388476, + "grad_norm": 0.03667456656694412, + "learning_rate": 5.6490863465070755e-05, + "loss": 0.23879833221435548, + "step": 101910 + }, + { + "epoch": 0.43756386148390475, + "grad_norm": 1.1296567916870117, + "learning_rate": 5.648655174495313e-05, + "loss": 0.3958956241607666, + "step": 101920 + }, + { + "epoch": 0.4376067935739248, + "grad_norm": 0.9460829496383667, + "learning_rate": 5.648224002483551e-05, + "loss": 0.17357568740844725, + "step": 101930 + }, + { + "epoch": 0.4376497256639448, + "grad_norm": 0.013191147707402706, + "learning_rate": 5.647792830471789e-05, + "loss": 0.2423572540283203, + "step": 101940 + }, + { + "epoch": 0.43769265775396476, + "grad_norm": 0.08220737427473068, + "learning_rate": 5.6473616584600264e-05, + "loss": 0.10584760904312134, + "step": 101950 + }, + { + "epoch": 0.4377355898439848, + "grad_norm": 0.17858093976974487, + "learning_rate": 5.646930486448264e-05, + "loss": 0.31906838417053224, + "step": 101960 + }, + { + "epoch": 0.4377785219340048, + "grad_norm": 1.531294822692871, + "learning_rate": 5.646499314436502e-05, + "loss": 0.2012697458267212, + "step": 101970 + }, + { + "epoch": 0.4378214540240248, + "grad_norm": 0.1400694102048874, + "learning_rate": 5.6460681424247396e-05, + "loss": 0.09353853464126587, + "step": 101980 + }, + { + "epoch": 0.4378643861140448, + "grad_norm": 0.09306161850690842, + "learning_rate": 5.645636970412976e-05, + "loss": 0.4895349025726318, + "step": 101990 + }, + { + "epoch": 0.4379073182040648, + "grad_norm": 1.4866420030593872, + "learning_rate": 5.6452057984012144e-05, + "loss": 0.2991748094558716, + "step": 102000 + }, + { + "epoch": 0.4379073182040648, + "eval_loss": 0.4135998785495758, + "eval_runtime": 27.1015, + "eval_samples_per_second": 3.69, + "eval_steps_per_second": 3.69, + "step": 102000 + }, + { + "epoch": 0.4379502502940848, + "grad_norm": 0.11813008040189743, + "learning_rate": 5.644774626389452e-05, + "loss": 0.25430753231048586, + "step": 102010 + }, + { + "epoch": 0.43799318238410484, + "grad_norm": 0.5968402028083801, + "learning_rate": 5.64434345437769e-05, + "loss": 0.4001627445220947, + "step": 102020 + }, + { + "epoch": 0.4380361144741248, + "grad_norm": 2.1716525554656982, + "learning_rate": 5.6439122823659276e-05, + "loss": 0.30237655639648436, + "step": 102030 + }, + { + "epoch": 0.4380790465641448, + "grad_norm": 0.002318573882803321, + "learning_rate": 5.6434811103541654e-05, + "loss": 0.2868093967437744, + "step": 102040 + }, + { + "epoch": 0.43812197865416486, + "grad_norm": 1.7742433547973633, + "learning_rate": 5.643049938342403e-05, + "loss": 0.056132668256759645, + "step": 102050 + }, + { + "epoch": 0.43816491074418484, + "grad_norm": 4.7987470626831055, + "learning_rate": 5.642618766330641e-05, + "loss": 0.29152579307556153, + "step": 102060 + }, + { + "epoch": 0.43820784283420483, + "grad_norm": 0.07227819412946701, + "learning_rate": 5.642187594318877e-05, + "loss": 0.2815880537033081, + "step": 102070 + }, + { + "epoch": 0.4382507749242249, + "grad_norm": 0.06919845938682556, + "learning_rate": 5.641756422307115e-05, + "loss": 0.13695106506347657, + "step": 102080 + }, + { + "epoch": 0.43829370701424486, + "grad_norm": 4.543615341186523, + "learning_rate": 5.641325250295353e-05, + "loss": 0.18619425296783448, + "step": 102090 + }, + { + "epoch": 0.43833663910426485, + "grad_norm": 0.25457215309143066, + "learning_rate": 5.6408940782835904e-05, + "loss": 0.14458426237106323, + "step": 102100 + }, + { + "epoch": 0.4383795711942849, + "grad_norm": 0.011515560559928417, + "learning_rate": 5.640462906271828e-05, + "loss": 0.1838878870010376, + "step": 102110 + }, + { + "epoch": 0.4384225032843049, + "grad_norm": 0.0019815945997834206, + "learning_rate": 5.640031734260066e-05, + "loss": 0.20781657695770264, + "step": 102120 + }, + { + "epoch": 0.4384654353743249, + "grad_norm": 1.029922366142273, + "learning_rate": 5.6396005622483036e-05, + "loss": 0.20602426528930665, + "step": 102130 + }, + { + "epoch": 0.4385083674643449, + "grad_norm": 0.0010533623863011599, + "learning_rate": 5.6391693902365414e-05, + "loss": 0.03147282600402832, + "step": 102140 + }, + { + "epoch": 0.4385512995543649, + "grad_norm": 0.26850226521492004, + "learning_rate": 5.6387382182247784e-05, + "loss": 0.28639023303985595, + "step": 102150 + }, + { + "epoch": 0.43859423164438494, + "grad_norm": 3.8238465785980225, + "learning_rate": 5.638307046213016e-05, + "loss": 0.1597683310508728, + "step": 102160 + }, + { + "epoch": 0.4386371637344049, + "grad_norm": 1.4224358797073364, + "learning_rate": 5.637875874201254e-05, + "loss": 0.36101374626159666, + "step": 102170 + }, + { + "epoch": 0.4386800958244249, + "grad_norm": 5.2266364097595215, + "learning_rate": 5.6374447021894916e-05, + "loss": 0.24237642288208008, + "step": 102180 + }, + { + "epoch": 0.43872302791444495, + "grad_norm": 0.024161502718925476, + "learning_rate": 5.6370135301777293e-05, + "loss": 0.37608938217163085, + "step": 102190 + }, + { + "epoch": 0.43876596000446494, + "grad_norm": 0.06186579167842865, + "learning_rate": 5.636582358165967e-05, + "loss": 0.332311224937439, + "step": 102200 + }, + { + "epoch": 0.4388088920944849, + "grad_norm": 0.03216930106282234, + "learning_rate": 5.636151186154205e-05, + "loss": 0.30623137950897217, + "step": 102210 + }, + { + "epoch": 0.43885182418450497, + "grad_norm": 0.9261085391044617, + "learning_rate": 5.6357200141424425e-05, + "loss": 0.12676268815994263, + "step": 102220 + }, + { + "epoch": 0.43889475627452496, + "grad_norm": 0.28767600655555725, + "learning_rate": 5.63528884213068e-05, + "loss": 0.06856579780578613, + "step": 102230 + }, + { + "epoch": 0.43893768836454494, + "grad_norm": 1.7868995666503906, + "learning_rate": 5.6348576701189173e-05, + "loss": 0.03691851794719696, + "step": 102240 + }, + { + "epoch": 0.438980620454565, + "grad_norm": 0.3403833210468292, + "learning_rate": 5.634426498107155e-05, + "loss": 0.11112273931503296, + "step": 102250 + }, + { + "epoch": 0.43902355254458497, + "grad_norm": 0.07535845786333084, + "learning_rate": 5.633995326095393e-05, + "loss": 0.1796416759490967, + "step": 102260 + }, + { + "epoch": 0.43906648463460496, + "grad_norm": 0.021636424586176872, + "learning_rate": 5.6335641540836305e-05, + "loss": 0.07033033967018128, + "step": 102270 + }, + { + "epoch": 0.439109416724625, + "grad_norm": 2.452381134033203, + "learning_rate": 5.633132982071868e-05, + "loss": 0.2959134578704834, + "step": 102280 + }, + { + "epoch": 0.439152348814645, + "grad_norm": 2.967747926712036, + "learning_rate": 5.632701810060106e-05, + "loss": 0.20051445960998535, + "step": 102290 + }, + { + "epoch": 0.439195280904665, + "grad_norm": 1.3664977550506592, + "learning_rate": 5.632270638048344e-05, + "loss": 0.12291600704193115, + "step": 102300 + }, + { + "epoch": 0.439238212994685, + "grad_norm": 0.00837288424372673, + "learning_rate": 5.6318394660365815e-05, + "loss": 0.26719226837158205, + "step": 102310 + }, + { + "epoch": 0.439281145084705, + "grad_norm": 0.0758778303861618, + "learning_rate": 5.631408294024818e-05, + "loss": 0.21944777965545653, + "step": 102320 + }, + { + "epoch": 0.43932407717472505, + "grad_norm": 4.773903846740723, + "learning_rate": 5.6309771220130556e-05, + "loss": 0.30995585918426516, + "step": 102330 + }, + { + "epoch": 0.43936700926474503, + "grad_norm": 0.9918110370635986, + "learning_rate": 5.630545950001293e-05, + "loss": 0.24170682430267335, + "step": 102340 + }, + { + "epoch": 0.439409941354765, + "grad_norm": 0.03276235982775688, + "learning_rate": 5.630114777989531e-05, + "loss": 0.11611915826797485, + "step": 102350 + }, + { + "epoch": 0.43945287344478506, + "grad_norm": 0.03927241638302803, + "learning_rate": 5.629683605977769e-05, + "loss": 0.24892120361328124, + "step": 102360 + }, + { + "epoch": 0.43949580553480505, + "grad_norm": 0.04267513006925583, + "learning_rate": 5.629252433966007e-05, + "loss": 0.10058658123016358, + "step": 102370 + }, + { + "epoch": 0.43953873762482504, + "grad_norm": 0.0018779346719384193, + "learning_rate": 5.628821261954245e-05, + "loss": 0.29012117385864256, + "step": 102380 + }, + { + "epoch": 0.4395816697148451, + "grad_norm": 0.9698354005813599, + "learning_rate": 5.628390089942483e-05, + "loss": 0.3414642572402954, + "step": 102390 + }, + { + "epoch": 0.43962460180486507, + "grad_norm": 2.9953103065490723, + "learning_rate": 5.627958917930719e-05, + "loss": 0.0756386935710907, + "step": 102400 + }, + { + "epoch": 0.43966753389488505, + "grad_norm": 0.4900497496128082, + "learning_rate": 5.627527745918957e-05, + "loss": 0.34758667945861815, + "step": 102410 + }, + { + "epoch": 0.4397104659849051, + "grad_norm": 0.02272651344537735, + "learning_rate": 5.6270965739071945e-05, + "loss": 0.20230214595794677, + "step": 102420 + }, + { + "epoch": 0.4397533980749251, + "grad_norm": 0.03989941254258156, + "learning_rate": 5.626665401895432e-05, + "loss": 0.2181776762008667, + "step": 102430 + }, + { + "epoch": 0.43979633016494507, + "grad_norm": 0.026858249679207802, + "learning_rate": 5.62623422988367e-05, + "loss": 0.18684104681015015, + "step": 102440 + }, + { + "epoch": 0.4398392622549651, + "grad_norm": 0.23959459364414215, + "learning_rate": 5.625803057871908e-05, + "loss": 0.1459541440010071, + "step": 102450 + }, + { + "epoch": 0.4398821943449851, + "grad_norm": 0.004670017398893833, + "learning_rate": 5.6253718858601455e-05, + "loss": 0.11860576868057252, + "step": 102460 + }, + { + "epoch": 0.4399251264350051, + "grad_norm": 0.011067052371799946, + "learning_rate": 5.624940713848383e-05, + "loss": 0.39524543285369873, + "step": 102470 + }, + { + "epoch": 0.43996805852502513, + "grad_norm": 0.03430342674255371, + "learning_rate": 5.62450954183662e-05, + "loss": 0.3463152885437012, + "step": 102480 + }, + { + "epoch": 0.4400109906150451, + "grad_norm": 0.03823309764266014, + "learning_rate": 5.624078369824858e-05, + "loss": 0.2293764352798462, + "step": 102490 + }, + { + "epoch": 0.4400539227050651, + "grad_norm": 0.017376506701111794, + "learning_rate": 5.623647197813096e-05, + "loss": 0.2071463108062744, + "step": 102500 + }, + { + "epoch": 0.44009685479508515, + "grad_norm": 1.2558318376541138, + "learning_rate": 5.6232160258013335e-05, + "loss": 0.32691106796264646, + "step": 102510 + }, + { + "epoch": 0.44013978688510513, + "grad_norm": 2.95967698097229, + "learning_rate": 5.622784853789571e-05, + "loss": 0.17968182563781737, + "step": 102520 + }, + { + "epoch": 0.4401827189751251, + "grad_norm": 1.033097743988037, + "learning_rate": 5.622353681777809e-05, + "loss": 0.19898836612701415, + "step": 102530 + }, + { + "epoch": 0.44022565106514516, + "grad_norm": 0.016062144190073013, + "learning_rate": 5.6219225097660467e-05, + "loss": 0.09094843864440919, + "step": 102540 + }, + { + "epoch": 0.44026858315516515, + "grad_norm": 0.4532967805862427, + "learning_rate": 5.6214913377542844e-05, + "loss": 0.18726831674575806, + "step": 102550 + }, + { + "epoch": 0.4403115152451852, + "grad_norm": 0.014149283058941364, + "learning_rate": 5.621060165742522e-05, + "loss": 0.05682987570762634, + "step": 102560 + }, + { + "epoch": 0.4403544473352052, + "grad_norm": 2.137563467025757, + "learning_rate": 5.6206289937307585e-05, + "loss": 0.2961072206497192, + "step": 102570 + }, + { + "epoch": 0.44039737942522517, + "grad_norm": 0.04554494842886925, + "learning_rate": 5.620197821718996e-05, + "loss": 0.03142690062522888, + "step": 102580 + }, + { + "epoch": 0.4404403115152452, + "grad_norm": 0.005631288979202509, + "learning_rate": 5.6197666497072347e-05, + "loss": 0.12604271173477172, + "step": 102590 + }, + { + "epoch": 0.4404832436052652, + "grad_norm": 0.003667443059384823, + "learning_rate": 5.6193354776954724e-05, + "loss": 0.02955203056335449, + "step": 102600 + }, + { + "epoch": 0.4405261756952852, + "grad_norm": 0.05866995453834534, + "learning_rate": 5.61890430568371e-05, + "loss": 0.08454251289367676, + "step": 102610 + }, + { + "epoch": 0.4405691077853052, + "grad_norm": 0.913500964641571, + "learning_rate": 5.618473133671948e-05, + "loss": 0.262734055519104, + "step": 102620 + }, + { + "epoch": 0.4406120398753252, + "grad_norm": 4.365779399871826, + "learning_rate": 5.6180419616601856e-05, + "loss": 0.23714911937713623, + "step": 102630 + }, + { + "epoch": 0.4406549719653452, + "grad_norm": 1.4142851829528809, + "learning_rate": 5.617610789648423e-05, + "loss": 0.30322554111480715, + "step": 102640 + }, + { + "epoch": 0.44069790405536524, + "grad_norm": 5.523765563964844, + "learning_rate": 5.61717961763666e-05, + "loss": 0.2837998867034912, + "step": 102650 + }, + { + "epoch": 0.44074083614538523, + "grad_norm": 7.742196083068848, + "learning_rate": 5.6167484456248974e-05, + "loss": 0.2016591787338257, + "step": 102660 + }, + { + "epoch": 0.4407837682354052, + "grad_norm": 0.39852452278137207, + "learning_rate": 5.616317273613135e-05, + "loss": 0.16970374584197997, + "step": 102670 + }, + { + "epoch": 0.44082670032542526, + "grad_norm": 0.02069421112537384, + "learning_rate": 5.615886101601373e-05, + "loss": 0.323093843460083, + "step": 102680 + }, + { + "epoch": 0.44086963241544525, + "grad_norm": 0.009212727658450603, + "learning_rate": 5.6154549295896106e-05, + "loss": 0.22446053028106688, + "step": 102690 + }, + { + "epoch": 0.44091256450546523, + "grad_norm": 0.0052307466976344585, + "learning_rate": 5.6150237575778484e-05, + "loss": 0.16936511993408204, + "step": 102700 + }, + { + "epoch": 0.4409554965954853, + "grad_norm": 0.8425884246826172, + "learning_rate": 5.614592585566086e-05, + "loss": 0.41118998527526857, + "step": 102710 + }, + { + "epoch": 0.44099842868550526, + "grad_norm": 0.08288142830133438, + "learning_rate": 5.614161413554324e-05, + "loss": 0.08106495141983032, + "step": 102720 + }, + { + "epoch": 0.44104136077552525, + "grad_norm": 0.9253395199775696, + "learning_rate": 5.613730241542561e-05, + "loss": 0.2048487663269043, + "step": 102730 + }, + { + "epoch": 0.4410842928655453, + "grad_norm": 3.6987619400024414, + "learning_rate": 5.6132990695307986e-05, + "loss": 0.49891486167907717, + "step": 102740 + }, + { + "epoch": 0.4411272249555653, + "grad_norm": 0.009961171075701714, + "learning_rate": 5.6128678975190364e-05, + "loss": 0.3197080373764038, + "step": 102750 + }, + { + "epoch": 0.4411701570455853, + "grad_norm": 0.00707285525277257, + "learning_rate": 5.612436725507274e-05, + "loss": 0.4116492748260498, + "step": 102760 + }, + { + "epoch": 0.4412130891356053, + "grad_norm": 0.00316762481816113, + "learning_rate": 5.612005553495512e-05, + "loss": 0.10139093399047852, + "step": 102770 + }, + { + "epoch": 0.4412560212256253, + "grad_norm": 0.002529738238081336, + "learning_rate": 5.6115743814837496e-05, + "loss": 0.10213443040847778, + "step": 102780 + }, + { + "epoch": 0.44129895331564534, + "grad_norm": 0.6762450933456421, + "learning_rate": 5.611143209471987e-05, + "loss": 0.2196059226989746, + "step": 102790 + }, + { + "epoch": 0.4413418854056653, + "grad_norm": 15.398003578186035, + "learning_rate": 5.610712037460225e-05, + "loss": 0.18061717748641967, + "step": 102800 + }, + { + "epoch": 0.4413848174956853, + "grad_norm": 0.22043964266777039, + "learning_rate": 5.610280865448462e-05, + "loss": 0.2987262487411499, + "step": 102810 + }, + { + "epoch": 0.44142774958570535, + "grad_norm": 1.8148123025894165, + "learning_rate": 5.6098496934367e-05, + "loss": 0.2733000755310059, + "step": 102820 + }, + { + "epoch": 0.44147068167572534, + "grad_norm": 0.22438614070415497, + "learning_rate": 5.6094185214249376e-05, + "loss": 0.23932878971099852, + "step": 102830 + }, + { + "epoch": 0.4415136137657453, + "grad_norm": 2.330376148223877, + "learning_rate": 5.608987349413175e-05, + "loss": 0.2667943000793457, + "step": 102840 + }, + { + "epoch": 0.44155654585576537, + "grad_norm": 12.250018119812012, + "learning_rate": 5.608556177401413e-05, + "loss": 0.0842181921005249, + "step": 102850 + }, + { + "epoch": 0.44159947794578536, + "grad_norm": 1.1223269701004028, + "learning_rate": 5.608125005389651e-05, + "loss": 0.40932598114013674, + "step": 102860 + }, + { + "epoch": 0.44164241003580534, + "grad_norm": 4.876951694488525, + "learning_rate": 5.6076938333778885e-05, + "loss": 0.30170905590057373, + "step": 102870 + }, + { + "epoch": 0.4416853421258254, + "grad_norm": 2.0066170692443848, + "learning_rate": 5.607262661366126e-05, + "loss": 0.2661459445953369, + "step": 102880 + }, + { + "epoch": 0.4417282742158454, + "grad_norm": 0.12877433001995087, + "learning_rate": 5.6068314893543626e-05, + "loss": 0.17716137170791627, + "step": 102890 + }, + { + "epoch": 0.44177120630586536, + "grad_norm": 0.1708947718143463, + "learning_rate": 5.6064003173426003e-05, + "loss": 0.29579811096191405, + "step": 102900 + }, + { + "epoch": 0.4418141383958854, + "grad_norm": 0.8348886370658875, + "learning_rate": 5.605969145330838e-05, + "loss": 0.23079733848571776, + "step": 102910 + }, + { + "epoch": 0.4418570704859054, + "grad_norm": 1.4094451665878296, + "learning_rate": 5.605537973319076e-05, + "loss": 0.3656616687774658, + "step": 102920 + }, + { + "epoch": 0.4419000025759254, + "grad_norm": 0.21388374269008636, + "learning_rate": 5.6051068013073136e-05, + "loss": 0.23285670280456544, + "step": 102930 + }, + { + "epoch": 0.4419429346659454, + "grad_norm": 0.02697441726922989, + "learning_rate": 5.604675629295551e-05, + "loss": 0.08836208581924439, + "step": 102940 + }, + { + "epoch": 0.4419858667559654, + "grad_norm": 0.2437342256307602, + "learning_rate": 5.604244457283789e-05, + "loss": 0.04906468987464905, + "step": 102950 + }, + { + "epoch": 0.4420287988459854, + "grad_norm": 0.09356532245874405, + "learning_rate": 5.603813285272027e-05, + "loss": 0.09088559746742249, + "step": 102960 + }, + { + "epoch": 0.44207173093600544, + "grad_norm": 0.03725534304976463, + "learning_rate": 5.603382113260265e-05, + "loss": 0.06402713060379028, + "step": 102970 + }, + { + "epoch": 0.4421146630260254, + "grad_norm": 0.011978763155639172, + "learning_rate": 5.6029509412485015e-05, + "loss": 0.3434773921966553, + "step": 102980 + }, + { + "epoch": 0.44215759511604547, + "grad_norm": 0.4410317540168762, + "learning_rate": 5.602519769236739e-05, + "loss": 0.17957134246826173, + "step": 102990 + }, + { + "epoch": 0.44220052720606545, + "grad_norm": 2.408623695373535, + "learning_rate": 5.602088597224977e-05, + "loss": 0.2665703296661377, + "step": 103000 + }, + { + "epoch": 0.44220052720606545, + "eval_loss": 0.41615238785743713, + "eval_runtime": 27.2641, + "eval_samples_per_second": 3.668, + "eval_steps_per_second": 3.668, + "step": 103000 + }, + { + "epoch": 0.44224345929608544, + "grad_norm": 2.4489293098449707, + "learning_rate": 5.601657425213215e-05, + "loss": 0.23038191795349122, + "step": 103010 + }, + { + "epoch": 0.4422863913861055, + "grad_norm": 2.3956902027130127, + "learning_rate": 5.6012262532014525e-05, + "loss": 0.24207894802093505, + "step": 103020 + }, + { + "epoch": 0.44232932347612547, + "grad_norm": 0.13477887213230133, + "learning_rate": 5.60079508118969e-05, + "loss": 0.11965099573135377, + "step": 103030 + }, + { + "epoch": 0.44237225556614546, + "grad_norm": 0.0049563124775886536, + "learning_rate": 5.600363909177928e-05, + "loss": 0.21348881721496582, + "step": 103040 + }, + { + "epoch": 0.4424151876561655, + "grad_norm": 0.3449452817440033, + "learning_rate": 5.599932737166166e-05, + "loss": 0.4346416473388672, + "step": 103050 + }, + { + "epoch": 0.4424581197461855, + "grad_norm": 0.8822908401489258, + "learning_rate": 5.599501565154403e-05, + "loss": 0.21301577091217042, + "step": 103060 + }, + { + "epoch": 0.4425010518362055, + "grad_norm": 1.8892277479171753, + "learning_rate": 5.5990703931426405e-05, + "loss": 0.33349945545196535, + "step": 103070 + }, + { + "epoch": 0.4425439839262255, + "grad_norm": 0.06545145809650421, + "learning_rate": 5.598639221130878e-05, + "loss": 0.30561869144439696, + "step": 103080 + }, + { + "epoch": 0.4425869160162455, + "grad_norm": 0.2756239175796509, + "learning_rate": 5.598208049119116e-05, + "loss": 0.2424759864807129, + "step": 103090 + }, + { + "epoch": 0.4426298481062655, + "grad_norm": 2.4025254249572754, + "learning_rate": 5.597776877107354e-05, + "loss": 0.2664868116378784, + "step": 103100 + }, + { + "epoch": 0.44267278019628553, + "grad_norm": 0.028143398463726044, + "learning_rate": 5.5973457050955914e-05, + "loss": 0.10620641708374023, + "step": 103110 + }, + { + "epoch": 0.4427157122863055, + "grad_norm": 2.8042232990264893, + "learning_rate": 5.596914533083829e-05, + "loss": 0.1460793733596802, + "step": 103120 + }, + { + "epoch": 0.4427586443763255, + "grad_norm": 0.010151490569114685, + "learning_rate": 5.596483361072067e-05, + "loss": 0.17683175802230836, + "step": 103130 + }, + { + "epoch": 0.44280157646634555, + "grad_norm": 2.218151569366455, + "learning_rate": 5.596052189060303e-05, + "loss": 0.3955509901046753, + "step": 103140 + }, + { + "epoch": 0.44284450855636553, + "grad_norm": 4.830830097198486, + "learning_rate": 5.595621017048541e-05, + "loss": 0.3088000059127808, + "step": 103150 + }, + { + "epoch": 0.4428874406463855, + "grad_norm": 0.014260631985962391, + "learning_rate": 5.595189845036779e-05, + "loss": 0.1861141800880432, + "step": 103160 + }, + { + "epoch": 0.44293037273640556, + "grad_norm": 0.09907328337430954, + "learning_rate": 5.5947586730250165e-05, + "loss": 0.14836844205856323, + "step": 103170 + }, + { + "epoch": 0.44297330482642555, + "grad_norm": 0.05011352524161339, + "learning_rate": 5.594327501013254e-05, + "loss": 0.21681520938873292, + "step": 103180 + }, + { + "epoch": 0.4430162369164456, + "grad_norm": 1.6237834692001343, + "learning_rate": 5.5938963290014926e-05, + "loss": 0.23171391487121581, + "step": 103190 + }, + { + "epoch": 0.4430591690064656, + "grad_norm": 0.8891550898551941, + "learning_rate": 5.5934651569897303e-05, + "loss": 0.27312302589416504, + "step": 103200 + }, + { + "epoch": 0.44310210109648557, + "grad_norm": 1.871790885925293, + "learning_rate": 5.593033984977968e-05, + "loss": 0.16714088916778563, + "step": 103210 + }, + { + "epoch": 0.4431450331865056, + "grad_norm": 1.3890925645828247, + "learning_rate": 5.5926028129662045e-05, + "loss": 0.19409667253494262, + "step": 103220 + }, + { + "epoch": 0.4431879652765256, + "grad_norm": 4.021583557128906, + "learning_rate": 5.592171640954442e-05, + "loss": 0.24269635677337648, + "step": 103230 + }, + { + "epoch": 0.4432308973665456, + "grad_norm": 0.05253671109676361, + "learning_rate": 5.59174046894268e-05, + "loss": 0.06833736896514893, + "step": 103240 + }, + { + "epoch": 0.4432738294565656, + "grad_norm": 0.5878363847732544, + "learning_rate": 5.5913092969309177e-05, + "loss": 0.2748382091522217, + "step": 103250 + }, + { + "epoch": 0.4433167615465856, + "grad_norm": 0.2838081419467926, + "learning_rate": 5.5908781249191554e-05, + "loss": 0.18605902194976806, + "step": 103260 + }, + { + "epoch": 0.4433596936366056, + "grad_norm": 0.3230079710483551, + "learning_rate": 5.590446952907393e-05, + "loss": 0.1064450740814209, + "step": 103270 + }, + { + "epoch": 0.44340262572662564, + "grad_norm": 0.11365670710802078, + "learning_rate": 5.590015780895631e-05, + "loss": 0.37896459102630614, + "step": 103280 + }, + { + "epoch": 0.44344555781664563, + "grad_norm": 0.025720424950122833, + "learning_rate": 5.5895846088838686e-05, + "loss": 0.14678401947021485, + "step": 103290 + }, + { + "epoch": 0.4434884899066656, + "grad_norm": 1.161470890045166, + "learning_rate": 5.589153436872106e-05, + "loss": 0.3876520872116089, + "step": 103300 + }, + { + "epoch": 0.44353142199668566, + "grad_norm": 25.90959358215332, + "learning_rate": 5.5887222648603434e-05, + "loss": 0.17435024976730346, + "step": 103310 + }, + { + "epoch": 0.44357435408670565, + "grad_norm": 6.646755695343018, + "learning_rate": 5.588291092848581e-05, + "loss": 0.17923879623413086, + "step": 103320 + }, + { + "epoch": 0.44361728617672563, + "grad_norm": 1.0259877443313599, + "learning_rate": 5.587859920836819e-05, + "loss": 0.325650691986084, + "step": 103330 + }, + { + "epoch": 0.4436602182667457, + "grad_norm": 0.1227472648024559, + "learning_rate": 5.5874287488250566e-05, + "loss": 0.3091316223144531, + "step": 103340 + }, + { + "epoch": 0.44370315035676566, + "grad_norm": 0.048743315041065216, + "learning_rate": 5.586997576813294e-05, + "loss": 0.20743951797485352, + "step": 103350 + }, + { + "epoch": 0.44374608244678565, + "grad_norm": 0.8254276514053345, + "learning_rate": 5.586566404801532e-05, + "loss": 0.3583249807357788, + "step": 103360 + }, + { + "epoch": 0.4437890145368057, + "grad_norm": 0.02611999399960041, + "learning_rate": 5.58613523278977e-05, + "loss": 0.1163441300392151, + "step": 103370 + }, + { + "epoch": 0.4438319466268257, + "grad_norm": 0.006742026656866074, + "learning_rate": 5.5857040607780075e-05, + "loss": 0.2178345203399658, + "step": 103380 + }, + { + "epoch": 0.44387487871684567, + "grad_norm": 0.008877522312104702, + "learning_rate": 5.585272888766244e-05, + "loss": 0.24348392486572265, + "step": 103390 + }, + { + "epoch": 0.4439178108068657, + "grad_norm": 0.008085265755653381, + "learning_rate": 5.5848417167544816e-05, + "loss": 0.20122940540313722, + "step": 103400 + }, + { + "epoch": 0.4439607428968857, + "grad_norm": 0.14661838114261627, + "learning_rate": 5.58441054474272e-05, + "loss": 0.2685752630233765, + "step": 103410 + }, + { + "epoch": 0.44400367498690574, + "grad_norm": 6.539491176605225, + "learning_rate": 5.583979372730958e-05, + "loss": 0.2309859037399292, + "step": 103420 + }, + { + "epoch": 0.4440466070769257, + "grad_norm": 0.24323442578315735, + "learning_rate": 5.5835482007191955e-05, + "loss": 0.19980394840240479, + "step": 103430 + }, + { + "epoch": 0.4440895391669457, + "grad_norm": 0.01880057342350483, + "learning_rate": 5.583117028707433e-05, + "loss": 0.08100860714912414, + "step": 103440 + }, + { + "epoch": 0.44413247125696576, + "grad_norm": 1.389131784439087, + "learning_rate": 5.582685856695671e-05, + "loss": 0.40127172470092776, + "step": 103450 + }, + { + "epoch": 0.44417540334698574, + "grad_norm": 0.010281624272465706, + "learning_rate": 5.582254684683909e-05, + "loss": 0.15416443347930908, + "step": 103460 + }, + { + "epoch": 0.44421833543700573, + "grad_norm": 1.043463945388794, + "learning_rate": 5.581823512672145e-05, + "loss": 0.41317176818847656, + "step": 103470 + }, + { + "epoch": 0.44426126752702577, + "grad_norm": 0.0015052884118631482, + "learning_rate": 5.581392340660383e-05, + "loss": 0.1989218235015869, + "step": 103480 + }, + { + "epoch": 0.44430419961704576, + "grad_norm": 0.22993981838226318, + "learning_rate": 5.5809611686486206e-05, + "loss": 0.1768964171409607, + "step": 103490 + }, + { + "epoch": 0.44434713170706575, + "grad_norm": 1.827546238899231, + "learning_rate": 5.580529996636858e-05, + "loss": 0.10534555912017822, + "step": 103500 + }, + { + "epoch": 0.4443900637970858, + "grad_norm": 0.15109845995903015, + "learning_rate": 5.580098824625096e-05, + "loss": 0.3064572811126709, + "step": 103510 + }, + { + "epoch": 0.4444329958871058, + "grad_norm": 0.8713210821151733, + "learning_rate": 5.579667652613334e-05, + "loss": 0.1682778477668762, + "step": 103520 + }, + { + "epoch": 0.44447592797712576, + "grad_norm": 1.4505832195281982, + "learning_rate": 5.5792364806015715e-05, + "loss": 0.2827667236328125, + "step": 103530 + }, + { + "epoch": 0.4445188600671458, + "grad_norm": 0.14652542769908905, + "learning_rate": 5.578805308589809e-05, + "loss": 0.08168087601661682, + "step": 103540 + }, + { + "epoch": 0.4445617921571658, + "grad_norm": 0.006383778993040323, + "learning_rate": 5.578374136578046e-05, + "loss": 0.09196740984916688, + "step": 103550 + }, + { + "epoch": 0.4446047242471858, + "grad_norm": 0.003078729147091508, + "learning_rate": 5.577942964566284e-05, + "loss": 0.20092494487762452, + "step": 103560 + }, + { + "epoch": 0.4446476563372058, + "grad_norm": 18.05199432373047, + "learning_rate": 5.577511792554522e-05, + "loss": 0.28601033687591554, + "step": 103570 + }, + { + "epoch": 0.4446905884272258, + "grad_norm": 1.4970247745513916, + "learning_rate": 5.5770806205427595e-05, + "loss": 0.19839348793029785, + "step": 103580 + }, + { + "epoch": 0.4447335205172458, + "grad_norm": 4.3401665687561035, + "learning_rate": 5.576649448530997e-05, + "loss": 0.32708165645599363, + "step": 103590 + }, + { + "epoch": 0.44477645260726584, + "grad_norm": 0.07379349321126938, + "learning_rate": 5.576218276519235e-05, + "loss": 0.1674059510231018, + "step": 103600 + }, + { + "epoch": 0.4448193846972858, + "grad_norm": 0.015309474430978298, + "learning_rate": 5.575787104507473e-05, + "loss": 0.37714362144470215, + "step": 103610 + }, + { + "epoch": 0.44486231678730587, + "grad_norm": 2.3161532878875732, + "learning_rate": 5.5753559324957104e-05, + "loss": 0.3296446561813354, + "step": 103620 + }, + { + "epoch": 0.44490524887732585, + "grad_norm": 0.1761620193719864, + "learning_rate": 5.5749247604839475e-05, + "loss": 0.14058544635772705, + "step": 103630 + }, + { + "epoch": 0.44494818096734584, + "grad_norm": 0.02455052174627781, + "learning_rate": 5.574493588472185e-05, + "loss": 0.1107941746711731, + "step": 103640 + }, + { + "epoch": 0.4449911130573659, + "grad_norm": 2.150546073913574, + "learning_rate": 5.574062416460423e-05, + "loss": 0.39648966789245604, + "step": 103650 + }, + { + "epoch": 0.44503404514738587, + "grad_norm": 0.16114000976085663, + "learning_rate": 5.573631244448661e-05, + "loss": 0.22998547554016113, + "step": 103660 + }, + { + "epoch": 0.44507697723740586, + "grad_norm": 1.723541498184204, + "learning_rate": 5.5732000724368984e-05, + "loss": 0.23313498497009277, + "step": 103670 + }, + { + "epoch": 0.4451199093274259, + "grad_norm": 1.5442637205123901, + "learning_rate": 5.572768900425136e-05, + "loss": 0.4073235511779785, + "step": 103680 + }, + { + "epoch": 0.4451628414174459, + "grad_norm": 0.019211404025554657, + "learning_rate": 5.572337728413374e-05, + "loss": 0.2632411003112793, + "step": 103690 + }, + { + "epoch": 0.4452057735074659, + "grad_norm": 0.1311320811510086, + "learning_rate": 5.5719065564016116e-05, + "loss": 0.23861315250396728, + "step": 103700 + }, + { + "epoch": 0.4452487055974859, + "grad_norm": 0.015002823434770107, + "learning_rate": 5.5714753843898494e-05, + "loss": 0.2565034866333008, + "step": 103710 + }, + { + "epoch": 0.4452916376875059, + "grad_norm": 1.0120104551315308, + "learning_rate": 5.571044212378086e-05, + "loss": 0.11774532794952393, + "step": 103720 + }, + { + "epoch": 0.4453345697775259, + "grad_norm": 1.3540277481079102, + "learning_rate": 5.5706130403663235e-05, + "loss": 0.19017940759658813, + "step": 103730 + }, + { + "epoch": 0.44537750186754593, + "grad_norm": 0.017274409532546997, + "learning_rate": 5.570181868354561e-05, + "loss": 0.16923511028289795, + "step": 103740 + }, + { + "epoch": 0.4454204339575659, + "grad_norm": 0.006057052407413721, + "learning_rate": 5.569750696342799e-05, + "loss": 0.27709307670593264, + "step": 103750 + }, + { + "epoch": 0.4454633660475859, + "grad_norm": 0.012291841208934784, + "learning_rate": 5.569319524331037e-05, + "loss": 0.19354288578033446, + "step": 103760 + }, + { + "epoch": 0.44550629813760595, + "grad_norm": 0.002555753802880645, + "learning_rate": 5.5688883523192744e-05, + "loss": 0.23224294185638428, + "step": 103770 + }, + { + "epoch": 0.44554923022762594, + "grad_norm": 2.2619755268096924, + "learning_rate": 5.568457180307513e-05, + "loss": 0.06295624971389771, + "step": 103780 + }, + { + "epoch": 0.4455921623176459, + "grad_norm": 0.001481866231188178, + "learning_rate": 5.5680260082957506e-05, + "loss": 0.17975410223007202, + "step": 103790 + }, + { + "epoch": 0.44563509440766597, + "grad_norm": 0.09197922050952911, + "learning_rate": 5.567594836283987e-05, + "loss": 0.03728804886341095, + "step": 103800 + }, + { + "epoch": 0.44567802649768595, + "grad_norm": 1.473630666732788, + "learning_rate": 5.567163664272225e-05, + "loss": 0.18659095764160155, + "step": 103810 + }, + { + "epoch": 0.44572095858770594, + "grad_norm": 0.0031739831902086735, + "learning_rate": 5.5667324922604624e-05, + "loss": 0.026174116134643554, + "step": 103820 + }, + { + "epoch": 0.445763890677726, + "grad_norm": 0.011447089724242687, + "learning_rate": 5.5663013202487e-05, + "loss": 0.4874469757080078, + "step": 103830 + }, + { + "epoch": 0.44580682276774597, + "grad_norm": 1.396328091621399, + "learning_rate": 5.565870148236938e-05, + "loss": 0.13257690668106079, + "step": 103840 + }, + { + "epoch": 0.445849754857766, + "grad_norm": 0.08562108129262924, + "learning_rate": 5.5654389762251756e-05, + "loss": 0.11450157165527344, + "step": 103850 + }, + { + "epoch": 0.445892686947786, + "grad_norm": 3.2500569820404053, + "learning_rate": 5.5650078042134134e-05, + "loss": 0.176918363571167, + "step": 103860 + }, + { + "epoch": 0.445935619037806, + "grad_norm": 2.0813026428222656, + "learning_rate": 5.564576632201651e-05, + "loss": 0.19415004253387452, + "step": 103870 + }, + { + "epoch": 0.44597855112782603, + "grad_norm": 0.8829070329666138, + "learning_rate": 5.564145460189888e-05, + "loss": 0.17182838916778564, + "step": 103880 + }, + { + "epoch": 0.446021483217846, + "grad_norm": 0.007004234939813614, + "learning_rate": 5.563714288178126e-05, + "loss": 0.36884973049163816, + "step": 103890 + }, + { + "epoch": 0.446064415307866, + "grad_norm": 0.8837750554084778, + "learning_rate": 5.5632831161663636e-05, + "loss": 0.11918337345123291, + "step": 103900 + }, + { + "epoch": 0.44610734739788604, + "grad_norm": 0.0010567853460088372, + "learning_rate": 5.5628519441546013e-05, + "loss": 0.17197400331497192, + "step": 103910 + }, + { + "epoch": 0.44615027948790603, + "grad_norm": 0.09773020446300507, + "learning_rate": 5.562420772142839e-05, + "loss": 0.2357560873031616, + "step": 103920 + }, + { + "epoch": 0.446193211577926, + "grad_norm": 0.0009565603686496615, + "learning_rate": 5.561989600131077e-05, + "loss": 0.14261248111724853, + "step": 103930 + }, + { + "epoch": 0.44623614366794606, + "grad_norm": 4.308777809143066, + "learning_rate": 5.5615584281193146e-05, + "loss": 0.30736939907073973, + "step": 103940 + }, + { + "epoch": 0.44627907575796605, + "grad_norm": 7.586215496063232, + "learning_rate": 5.561127256107552e-05, + "loss": 0.44284768104553224, + "step": 103950 + }, + { + "epoch": 0.44632200784798604, + "grad_norm": 1.932542324066162, + "learning_rate": 5.560696084095789e-05, + "loss": 0.11989340782165528, + "step": 103960 + }, + { + "epoch": 0.4463649399380061, + "grad_norm": 0.006522905547171831, + "learning_rate": 5.5602649120840264e-05, + "loss": 0.005932082608342171, + "step": 103970 + }, + { + "epoch": 0.44640787202802606, + "grad_norm": 0.022067038342356682, + "learning_rate": 5.559833740072264e-05, + "loss": 0.15941121578216552, + "step": 103980 + }, + { + "epoch": 0.44645080411804605, + "grad_norm": 2.1581637859344482, + "learning_rate": 5.559402568060502e-05, + "loss": 0.11742219924926758, + "step": 103990 + }, + { + "epoch": 0.4464937362080661, + "grad_norm": 0.3017823100090027, + "learning_rate": 5.55897139604874e-05, + "loss": 0.14415110349655152, + "step": 104000 + }, + { + "epoch": 0.4464937362080661, + "eval_loss": 0.412598580121994, + "eval_runtime": 27.092, + "eval_samples_per_second": 3.691, + "eval_steps_per_second": 3.691, + "step": 104000 + }, + { + "epoch": 0.4465366682980861, + "grad_norm": 0.09379760175943375, + "learning_rate": 5.558540224036978e-05, + "loss": 0.11643334627151489, + "step": 104010 + }, + { + "epoch": 0.44657960038810607, + "grad_norm": 2.794205904006958, + "learning_rate": 5.558109052025216e-05, + "loss": 0.21521177291870117, + "step": 104020 + }, + { + "epoch": 0.4466225324781261, + "grad_norm": 2.2005207538604736, + "learning_rate": 5.5576778800134535e-05, + "loss": 0.050330376625061034, + "step": 104030 + }, + { + "epoch": 0.4466654645681461, + "grad_norm": 0.002720118500292301, + "learning_rate": 5.557246708001691e-05, + "loss": 0.43977041244506837, + "step": 104040 + }, + { + "epoch": 0.44670839665816614, + "grad_norm": 0.008064290508627892, + "learning_rate": 5.5568155359899276e-05, + "loss": 0.2373410940170288, + "step": 104050 + }, + { + "epoch": 0.4467513287481861, + "grad_norm": 1.018776297569275, + "learning_rate": 5.556384363978165e-05, + "loss": 0.23385534286499024, + "step": 104060 + }, + { + "epoch": 0.4467942608382061, + "grad_norm": 0.9403790831565857, + "learning_rate": 5.555953191966403e-05, + "loss": 0.08395354151725769, + "step": 104070 + }, + { + "epoch": 0.44683719292822616, + "grad_norm": 10.480605125427246, + "learning_rate": 5.555522019954641e-05, + "loss": 0.26746931076049807, + "step": 104080 + }, + { + "epoch": 0.44688012501824614, + "grad_norm": 0.001435067504644394, + "learning_rate": 5.5550908479428785e-05, + "loss": 0.22385041713714598, + "step": 104090 + }, + { + "epoch": 0.44692305710826613, + "grad_norm": 1.9128391742706299, + "learning_rate": 5.554659675931116e-05, + "loss": 0.2415689468383789, + "step": 104100 + }, + { + "epoch": 0.4469659891982862, + "grad_norm": 0.10824067145586014, + "learning_rate": 5.554228503919354e-05, + "loss": 0.25131275653839114, + "step": 104110 + }, + { + "epoch": 0.44700892128830616, + "grad_norm": 6.406500816345215, + "learning_rate": 5.553797331907592e-05, + "loss": 0.24533276557922362, + "step": 104120 + }, + { + "epoch": 0.44705185337832615, + "grad_norm": 0.07218021154403687, + "learning_rate": 5.553366159895829e-05, + "loss": 0.16491100788116456, + "step": 104130 + }, + { + "epoch": 0.4470947854683462, + "grad_norm": 1.6558536291122437, + "learning_rate": 5.5529349878840665e-05, + "loss": 0.3282376527786255, + "step": 104140 + }, + { + "epoch": 0.4471377175583662, + "grad_norm": 0.9896855354309082, + "learning_rate": 5.552503815872304e-05, + "loss": 0.31535341739654543, + "step": 104150 + }, + { + "epoch": 0.44718064964838616, + "grad_norm": 0.11811896413564682, + "learning_rate": 5.552072643860542e-05, + "loss": 0.3985059976577759, + "step": 104160 + }, + { + "epoch": 0.4472235817384062, + "grad_norm": 1.187322974205017, + "learning_rate": 5.55164147184878e-05, + "loss": 0.10966448783874512, + "step": 104170 + }, + { + "epoch": 0.4472665138284262, + "grad_norm": 1.861840009689331, + "learning_rate": 5.5512102998370175e-05, + "loss": 0.3425853729248047, + "step": 104180 + }, + { + "epoch": 0.4473094459184462, + "grad_norm": 0.001304112607613206, + "learning_rate": 5.550779127825255e-05, + "loss": 0.47119789123535155, + "step": 104190 + }, + { + "epoch": 0.4473523780084662, + "grad_norm": 1.7016571760177612, + "learning_rate": 5.550347955813493e-05, + "loss": 0.34137959480285646, + "step": 104200 + }, + { + "epoch": 0.4473953100984862, + "grad_norm": 0.11588651686906815, + "learning_rate": 5.549916783801729e-05, + "loss": 0.18648021221160888, + "step": 104210 + }, + { + "epoch": 0.4474382421885062, + "grad_norm": 0.008340016007423401, + "learning_rate": 5.549485611789968e-05, + "loss": 0.10814428329467773, + "step": 104220 + }, + { + "epoch": 0.44748117427852624, + "grad_norm": 0.051742419600486755, + "learning_rate": 5.5490544397782055e-05, + "loss": 0.06334395408630371, + "step": 104230 + }, + { + "epoch": 0.4475241063685462, + "grad_norm": 3.1890809535980225, + "learning_rate": 5.548623267766443e-05, + "loss": 0.21675529479980468, + "step": 104240 + }, + { + "epoch": 0.4475670384585662, + "grad_norm": 0.020858777686953545, + "learning_rate": 5.548192095754681e-05, + "loss": 0.15983496904373168, + "step": 104250 + }, + { + "epoch": 0.44760997054858626, + "grad_norm": 0.0014625436160713434, + "learning_rate": 5.5477609237429187e-05, + "loss": 0.1737871289253235, + "step": 104260 + }, + { + "epoch": 0.44765290263860624, + "grad_norm": 0.08136744052171707, + "learning_rate": 5.5473297517311564e-05, + "loss": 0.25660922527313235, + "step": 104270 + }, + { + "epoch": 0.4476958347286263, + "grad_norm": 1.1136901378631592, + "learning_rate": 5.546898579719394e-05, + "loss": 0.09946085810661316, + "step": 104280 + }, + { + "epoch": 0.44773876681864627, + "grad_norm": 0.002379036508500576, + "learning_rate": 5.5464674077076305e-05, + "loss": 0.355484676361084, + "step": 104290 + }, + { + "epoch": 0.44778169890866626, + "grad_norm": 0.09864296764135361, + "learning_rate": 5.546036235695868e-05, + "loss": 0.3599360942840576, + "step": 104300 + }, + { + "epoch": 0.4478246309986863, + "grad_norm": 3.0941085815429688, + "learning_rate": 5.545605063684106e-05, + "loss": 0.27424373626708987, + "step": 104310 + }, + { + "epoch": 0.4478675630887063, + "grad_norm": 1.5314997434616089, + "learning_rate": 5.545173891672344e-05, + "loss": 0.26385200023651123, + "step": 104320 + }, + { + "epoch": 0.4479104951787263, + "grad_norm": 0.6546000242233276, + "learning_rate": 5.5447427196605814e-05, + "loss": 0.1692166805267334, + "step": 104330 + }, + { + "epoch": 0.4479534272687463, + "grad_norm": 0.0019183208933100104, + "learning_rate": 5.544311547648819e-05, + "loss": 0.2954787969589233, + "step": 104340 + }, + { + "epoch": 0.4479963593587663, + "grad_norm": 0.8082722425460815, + "learning_rate": 5.543880375637057e-05, + "loss": 0.1485131025314331, + "step": 104350 + }, + { + "epoch": 0.4480392914487863, + "grad_norm": 0.015951262786984444, + "learning_rate": 5.5434492036252946e-05, + "loss": 0.20640509128570556, + "step": 104360 + }, + { + "epoch": 0.44808222353880633, + "grad_norm": 35.86313247680664, + "learning_rate": 5.543018031613532e-05, + "loss": 0.30402724742889403, + "step": 104370 + }, + { + "epoch": 0.4481251556288263, + "grad_norm": 0.08005943894386292, + "learning_rate": 5.5425868596017694e-05, + "loss": 0.09017500877380372, + "step": 104380 + }, + { + "epoch": 0.4481680877188463, + "grad_norm": 0.06653320044279099, + "learning_rate": 5.542155687590007e-05, + "loss": 0.1858425498008728, + "step": 104390 + }, + { + "epoch": 0.44821101980886635, + "grad_norm": 1.3172448873519897, + "learning_rate": 5.541724515578245e-05, + "loss": 0.3179537534713745, + "step": 104400 + }, + { + "epoch": 0.44825395189888634, + "grad_norm": 1.2068978548049927, + "learning_rate": 5.5412933435664826e-05, + "loss": 0.21991288661956787, + "step": 104410 + }, + { + "epoch": 0.4482968839889063, + "grad_norm": 0.0010691984789445996, + "learning_rate": 5.5408621715547204e-05, + "loss": 0.322251296043396, + "step": 104420 + }, + { + "epoch": 0.44833981607892637, + "grad_norm": 1.4022953510284424, + "learning_rate": 5.540430999542958e-05, + "loss": 0.3753403663635254, + "step": 104430 + }, + { + "epoch": 0.44838274816894635, + "grad_norm": 1.5526142120361328, + "learning_rate": 5.539999827531196e-05, + "loss": 0.28666656017303466, + "step": 104440 + }, + { + "epoch": 0.44842568025896634, + "grad_norm": 0.03407391533255577, + "learning_rate": 5.5395686555194336e-05, + "loss": 0.028074699640274047, + "step": 104450 + }, + { + "epoch": 0.4484686123489864, + "grad_norm": 1.6679810285568237, + "learning_rate": 5.5391374835076706e-05, + "loss": 0.28589587211608886, + "step": 104460 + }, + { + "epoch": 0.44851154443900637, + "grad_norm": 0.2455066740512848, + "learning_rate": 5.5387063114959084e-05, + "loss": 0.10405961275100709, + "step": 104470 + }, + { + "epoch": 0.4485544765290264, + "grad_norm": 1.4242631196975708, + "learning_rate": 5.538275139484146e-05, + "loss": 0.3303467035293579, + "step": 104480 + }, + { + "epoch": 0.4485974086190464, + "grad_norm": 1.238398790359497, + "learning_rate": 5.537843967472384e-05, + "loss": 0.14865484237670898, + "step": 104490 + }, + { + "epoch": 0.4486403407090664, + "grad_norm": 0.03500519320368767, + "learning_rate": 5.5374127954606216e-05, + "loss": 0.19715001583099365, + "step": 104500 + }, + { + "epoch": 0.44868327279908643, + "grad_norm": 0.02038196101784706, + "learning_rate": 5.536981623448859e-05, + "loss": 0.15497336387634278, + "step": 104510 + }, + { + "epoch": 0.4487262048891064, + "grad_norm": 0.027887515723705292, + "learning_rate": 5.536550451437097e-05, + "loss": 0.0818180799484253, + "step": 104520 + }, + { + "epoch": 0.4487691369791264, + "grad_norm": 7.665521144866943, + "learning_rate": 5.536119279425335e-05, + "loss": 0.3511594533920288, + "step": 104530 + }, + { + "epoch": 0.44881206906914645, + "grad_norm": 0.0011700972681865096, + "learning_rate": 5.535688107413571e-05, + "loss": 0.1457660675048828, + "step": 104540 + }, + { + "epoch": 0.44885500115916643, + "grad_norm": 0.3345299959182739, + "learning_rate": 5.535256935401809e-05, + "loss": 0.2602620363235474, + "step": 104550 + }, + { + "epoch": 0.4488979332491864, + "grad_norm": 7.855331897735596, + "learning_rate": 5.5348257633900466e-05, + "loss": 0.22493176460266112, + "step": 104560 + }, + { + "epoch": 0.44894086533920646, + "grad_norm": 3.3931214809417725, + "learning_rate": 5.5343945913782844e-05, + "loss": 0.09702563285827637, + "step": 104570 + }, + { + "epoch": 0.44898379742922645, + "grad_norm": 4.353238582611084, + "learning_rate": 5.533963419366522e-05, + "loss": 0.23266072273254396, + "step": 104580 + }, + { + "epoch": 0.44902672951924644, + "grad_norm": 0.5245272517204285, + "learning_rate": 5.5335322473547605e-05, + "loss": 0.3167107105255127, + "step": 104590 + }, + { + "epoch": 0.4490696616092665, + "grad_norm": 0.013726268894970417, + "learning_rate": 5.533101075342998e-05, + "loss": 0.22341177463531495, + "step": 104600 + }, + { + "epoch": 0.44911259369928647, + "grad_norm": 0.0004888771800324321, + "learning_rate": 5.532669903331236e-05, + "loss": 0.24303507804870605, + "step": 104610 + }, + { + "epoch": 0.44915552578930645, + "grad_norm": 3.4889206886291504, + "learning_rate": 5.5322387313194724e-05, + "loss": 0.2107637643814087, + "step": 104620 + }, + { + "epoch": 0.4491984578793265, + "grad_norm": 0.34107357263565063, + "learning_rate": 5.53180755930771e-05, + "loss": 0.12795854806900026, + "step": 104630 + }, + { + "epoch": 0.4492413899693465, + "grad_norm": 2.047565221786499, + "learning_rate": 5.531376387295948e-05, + "loss": 0.14512872695922852, + "step": 104640 + }, + { + "epoch": 0.44928432205936647, + "grad_norm": 1.2649197578430176, + "learning_rate": 5.5309452152841856e-05, + "loss": 0.20695888996124268, + "step": 104650 + }, + { + "epoch": 0.4493272541493865, + "grad_norm": 1.462600827217102, + "learning_rate": 5.530514043272423e-05, + "loss": 0.3829000234603882, + "step": 104660 + }, + { + "epoch": 0.4493701862394065, + "grad_norm": 0.29148611426353455, + "learning_rate": 5.530082871260661e-05, + "loss": 0.1434500813484192, + "step": 104670 + }, + { + "epoch": 0.4494131183294265, + "grad_norm": 0.692503809928894, + "learning_rate": 5.529651699248899e-05, + "loss": 0.16208131313323976, + "step": 104680 + }, + { + "epoch": 0.44945605041944653, + "grad_norm": 0.05662372708320618, + "learning_rate": 5.5292205272371365e-05, + "loss": 0.08052989840507507, + "step": 104690 + }, + { + "epoch": 0.4494989825094665, + "grad_norm": 4.697552680969238, + "learning_rate": 5.5287893552253735e-05, + "loss": 0.24397377967834472, + "step": 104700 + }, + { + "epoch": 0.44954191459948656, + "grad_norm": 0.10242107510566711, + "learning_rate": 5.528358183213611e-05, + "loss": 0.22177133560180665, + "step": 104710 + }, + { + "epoch": 0.44958484668950655, + "grad_norm": 0.17377902567386627, + "learning_rate": 5.527927011201849e-05, + "loss": 0.1945713758468628, + "step": 104720 + }, + { + "epoch": 0.44962777877952653, + "grad_norm": 3.188666582107544, + "learning_rate": 5.527495839190087e-05, + "loss": 0.2882867336273193, + "step": 104730 + }, + { + "epoch": 0.4496707108695466, + "grad_norm": 0.009148088283836842, + "learning_rate": 5.5270646671783245e-05, + "loss": 0.06424065828323364, + "step": 104740 + }, + { + "epoch": 0.44971364295956656, + "grad_norm": 0.07122382521629333, + "learning_rate": 5.526633495166562e-05, + "loss": 0.1966516852378845, + "step": 104750 + }, + { + "epoch": 0.44975657504958655, + "grad_norm": 0.015124008990824223, + "learning_rate": 5.5262023231548e-05, + "loss": 0.1615827798843384, + "step": 104760 + }, + { + "epoch": 0.4497995071396066, + "grad_norm": 1.248611330986023, + "learning_rate": 5.525771151143038e-05, + "loss": 0.16856156587600707, + "step": 104770 + }, + { + "epoch": 0.4498424392296266, + "grad_norm": 1.596437931060791, + "learning_rate": 5.5253399791312754e-05, + "loss": 0.12455008029937745, + "step": 104780 + }, + { + "epoch": 0.44988537131964657, + "grad_norm": 0.0020168491173535585, + "learning_rate": 5.524908807119512e-05, + "loss": 0.27606155872344973, + "step": 104790 + }, + { + "epoch": 0.4499283034096666, + "grad_norm": 1.187111735343933, + "learning_rate": 5.5244776351077495e-05, + "loss": 0.5580471992492676, + "step": 104800 + }, + { + "epoch": 0.4499712354996866, + "grad_norm": 1.0547586679458618, + "learning_rate": 5.524046463095988e-05, + "loss": 0.3080447196960449, + "step": 104810 + }, + { + "epoch": 0.4500141675897066, + "grad_norm": 0.06692945212125778, + "learning_rate": 5.523615291084226e-05, + "loss": 0.17507950067520142, + "step": 104820 + }, + { + "epoch": 0.4500570996797266, + "grad_norm": 0.009819112718105316, + "learning_rate": 5.5231841190724634e-05, + "loss": 0.24413628578186036, + "step": 104830 + }, + { + "epoch": 0.4501000317697466, + "grad_norm": 0.011352095752954483, + "learning_rate": 5.522752947060701e-05, + "loss": 0.332852840423584, + "step": 104840 + }, + { + "epoch": 0.4501429638597666, + "grad_norm": 0.01721765473484993, + "learning_rate": 5.522321775048939e-05, + "loss": 0.246399450302124, + "step": 104850 + }, + { + "epoch": 0.45018589594978664, + "grad_norm": 0.46497970819473267, + "learning_rate": 5.5218906030371766e-05, + "loss": 0.12881628274917603, + "step": 104860 + }, + { + "epoch": 0.45022882803980663, + "grad_norm": 2.9362080097198486, + "learning_rate": 5.521459431025413e-05, + "loss": 0.25533831119537354, + "step": 104870 + }, + { + "epoch": 0.4502717601298266, + "grad_norm": 0.056996967643499374, + "learning_rate": 5.521028259013651e-05, + "loss": 0.1687253475189209, + "step": 104880 + }, + { + "epoch": 0.45031469221984666, + "grad_norm": 0.15789049863815308, + "learning_rate": 5.5205970870018885e-05, + "loss": 0.11047836542129516, + "step": 104890 + }, + { + "epoch": 0.45035762430986664, + "grad_norm": 0.4039181172847748, + "learning_rate": 5.520165914990126e-05, + "loss": 0.2870154619216919, + "step": 104900 + }, + { + "epoch": 0.4504005563998867, + "grad_norm": 0.2132159173488617, + "learning_rate": 5.519734742978364e-05, + "loss": 0.08048205375671387, + "step": 104910 + }, + { + "epoch": 0.4504434884899067, + "grad_norm": 1.834272861480713, + "learning_rate": 5.519303570966602e-05, + "loss": 0.1264325976371765, + "step": 104920 + }, + { + "epoch": 0.45048642057992666, + "grad_norm": 0.7455428242683411, + "learning_rate": 5.5188723989548394e-05, + "loss": 0.3383171081542969, + "step": 104930 + }, + { + "epoch": 0.4505293526699467, + "grad_norm": 0.11336026340723038, + "learning_rate": 5.518441226943077e-05, + "loss": 0.09172531962394714, + "step": 104940 + }, + { + "epoch": 0.4505722847599667, + "grad_norm": 0.37443527579307556, + "learning_rate": 5.518010054931314e-05, + "loss": 0.17477719783782958, + "step": 104950 + }, + { + "epoch": 0.4506152168499867, + "grad_norm": 0.007864640094339848, + "learning_rate": 5.517578882919552e-05, + "loss": 0.11320825815200805, + "step": 104960 + }, + { + "epoch": 0.4506581489400067, + "grad_norm": 2.6541857719421387, + "learning_rate": 5.51714771090779e-05, + "loss": 0.2526413440704346, + "step": 104970 + }, + { + "epoch": 0.4507010810300267, + "grad_norm": 0.2613985538482666, + "learning_rate": 5.5167165388960274e-05, + "loss": 0.22820723056793213, + "step": 104980 + }, + { + "epoch": 0.4507440131200467, + "grad_norm": 0.20190514624118805, + "learning_rate": 5.516285366884265e-05, + "loss": 0.2348994493484497, + "step": 104990 + }, + { + "epoch": 0.45078694521006674, + "grad_norm": 0.798771858215332, + "learning_rate": 5.515854194872503e-05, + "loss": 0.1506957530975342, + "step": 105000 + }, + { + "epoch": 0.45078694521006674, + "eval_loss": 0.4291705787181854, + "eval_runtime": 27.1182, + "eval_samples_per_second": 3.688, + "eval_steps_per_second": 3.688, + "step": 105000 + }, + { + "epoch": 0.4508298773000867, + "grad_norm": 0.0026851454749703407, + "learning_rate": 5.5154230228607406e-05, + "loss": 0.11021552085876465, + "step": 105010 + }, + { + "epoch": 0.4508728093901067, + "grad_norm": 5.40506649017334, + "learning_rate": 5.514991850848978e-05, + "loss": 0.22361650466918945, + "step": 105020 + }, + { + "epoch": 0.45091574148012675, + "grad_norm": 0.20230934023857117, + "learning_rate": 5.5145606788372154e-05, + "loss": 0.16922760009765625, + "step": 105030 + }, + { + "epoch": 0.45095867357014674, + "grad_norm": 0.03748038411140442, + "learning_rate": 5.514129506825453e-05, + "loss": 0.1610949993133545, + "step": 105040 + }, + { + "epoch": 0.4510016056601667, + "grad_norm": 0.15671096742153168, + "learning_rate": 5.513698334813691e-05, + "loss": 0.20407464504241943, + "step": 105050 + }, + { + "epoch": 0.45104453775018677, + "grad_norm": 0.0348237119615078, + "learning_rate": 5.5132671628019286e-05, + "loss": 0.2760026454925537, + "step": 105060 + }, + { + "epoch": 0.45108746984020676, + "grad_norm": 0.10463076829910278, + "learning_rate": 5.512835990790166e-05, + "loss": 0.24972622394561766, + "step": 105070 + }, + { + "epoch": 0.45113040193022674, + "grad_norm": 2.421722412109375, + "learning_rate": 5.512404818778404e-05, + "loss": 0.030455023050308228, + "step": 105080 + }, + { + "epoch": 0.4511733340202468, + "grad_norm": 0.026346173137426376, + "learning_rate": 5.511973646766642e-05, + "loss": 0.2886451959609985, + "step": 105090 + }, + { + "epoch": 0.4512162661102668, + "grad_norm": 0.0042284042574465275, + "learning_rate": 5.5115424747548795e-05, + "loss": 0.10194617509841919, + "step": 105100 + }, + { + "epoch": 0.45125919820028676, + "grad_norm": 1.1205313205718994, + "learning_rate": 5.511111302743117e-05, + "loss": 0.23844561576843262, + "step": 105110 + }, + { + "epoch": 0.4513021302903068, + "grad_norm": 0.6483297944068909, + "learning_rate": 5.5106801307313536e-05, + "loss": 0.3492732524871826, + "step": 105120 + }, + { + "epoch": 0.4513450623803268, + "grad_norm": 0.04426403343677521, + "learning_rate": 5.5102489587195914e-05, + "loss": 0.2877930164337158, + "step": 105130 + }, + { + "epoch": 0.45138799447034683, + "grad_norm": 2.4738221168518066, + "learning_rate": 5.509817786707829e-05, + "loss": 0.27635483741760253, + "step": 105140 + }, + { + "epoch": 0.4514309265603668, + "grad_norm": 0.09570404887199402, + "learning_rate": 5.509386614696067e-05, + "loss": 0.16080989837646484, + "step": 105150 + }, + { + "epoch": 0.4514738586503868, + "grad_norm": 0.7927491664886475, + "learning_rate": 5.5089554426843046e-05, + "loss": 0.34032866954803465, + "step": 105160 + }, + { + "epoch": 0.45151679074040685, + "grad_norm": 0.3021717369556427, + "learning_rate": 5.508524270672542e-05, + "loss": 0.10694130659103393, + "step": 105170 + }, + { + "epoch": 0.45155972283042684, + "grad_norm": 0.022390982136130333, + "learning_rate": 5.50809309866078e-05, + "loss": 0.23462820053100586, + "step": 105180 + }, + { + "epoch": 0.4516026549204468, + "grad_norm": 0.3534424304962158, + "learning_rate": 5.5076619266490185e-05, + "loss": 0.3277368783950806, + "step": 105190 + }, + { + "epoch": 0.45164558701046686, + "grad_norm": 0.13010990619659424, + "learning_rate": 5.507230754637255e-05, + "loss": 0.3046452760696411, + "step": 105200 + }, + { + "epoch": 0.45168851910048685, + "grad_norm": 0.014195505529642105, + "learning_rate": 5.5067995826254926e-05, + "loss": 0.14671356678009034, + "step": 105210 + }, + { + "epoch": 0.45173145119050684, + "grad_norm": 1.4591064453125, + "learning_rate": 5.50636841061373e-05, + "loss": 0.251104474067688, + "step": 105220 + }, + { + "epoch": 0.4517743832805269, + "grad_norm": 0.0026446059346199036, + "learning_rate": 5.505937238601968e-05, + "loss": 0.23639440536499023, + "step": 105230 + }, + { + "epoch": 0.45181731537054687, + "grad_norm": 0.11398119479417801, + "learning_rate": 5.505506066590206e-05, + "loss": 0.1752384662628174, + "step": 105240 + }, + { + "epoch": 0.45186024746056686, + "grad_norm": 1.354035496711731, + "learning_rate": 5.5050748945784435e-05, + "loss": 0.2977996587753296, + "step": 105250 + }, + { + "epoch": 0.4519031795505869, + "grad_norm": 3.3216819763183594, + "learning_rate": 5.504643722566681e-05, + "loss": 0.33972654342651365, + "step": 105260 + }, + { + "epoch": 0.4519461116406069, + "grad_norm": 0.008711867034435272, + "learning_rate": 5.504212550554919e-05, + "loss": 0.13020092248916626, + "step": 105270 + }, + { + "epoch": 0.45198904373062687, + "grad_norm": 0.24850133061408997, + "learning_rate": 5.503781378543156e-05, + "loss": 0.2301051139831543, + "step": 105280 + }, + { + "epoch": 0.4520319758206469, + "grad_norm": 0.0009402823052369058, + "learning_rate": 5.503350206531394e-05, + "loss": 0.24847860336303712, + "step": 105290 + }, + { + "epoch": 0.4520749079106669, + "grad_norm": 0.003734735306352377, + "learning_rate": 5.5029190345196315e-05, + "loss": 0.21702148914337158, + "step": 105300 + }, + { + "epoch": 0.4521178400006869, + "grad_norm": 0.0419401153922081, + "learning_rate": 5.502487862507869e-05, + "loss": 0.4774662494659424, + "step": 105310 + }, + { + "epoch": 0.45216077209070693, + "grad_norm": 0.2750690281391144, + "learning_rate": 5.502056690496107e-05, + "loss": 0.0887843906879425, + "step": 105320 + }, + { + "epoch": 0.4522037041807269, + "grad_norm": 0.48757249116897583, + "learning_rate": 5.501625518484345e-05, + "loss": 0.36505000591278075, + "step": 105330 + }, + { + "epoch": 0.45224663627074696, + "grad_norm": 2.4312708377838135, + "learning_rate": 5.5011943464725824e-05, + "loss": 0.36078386306762694, + "step": 105340 + }, + { + "epoch": 0.45228956836076695, + "grad_norm": 0.9031078219413757, + "learning_rate": 5.50076317446082e-05, + "loss": 0.19097875356674193, + "step": 105350 + }, + { + "epoch": 0.45233250045078693, + "grad_norm": 0.5606090426445007, + "learning_rate": 5.5003320024490566e-05, + "loss": 0.08003722429275513, + "step": 105360 + }, + { + "epoch": 0.452375432540807, + "grad_norm": 0.04021068662405014, + "learning_rate": 5.499900830437294e-05, + "loss": 0.07359545230865479, + "step": 105370 + }, + { + "epoch": 0.45241836463082696, + "grad_norm": 0.09893706440925598, + "learning_rate": 5.499469658425532e-05, + "loss": 0.35075531005859373, + "step": 105380 + }, + { + "epoch": 0.45246129672084695, + "grad_norm": 0.01163639035075903, + "learning_rate": 5.49903848641377e-05, + "loss": 0.059896016120910646, + "step": 105390 + }, + { + "epoch": 0.452504228810867, + "grad_norm": 0.027939310297369957, + "learning_rate": 5.4986073144020075e-05, + "loss": 0.15823696851730346, + "step": 105400 + }, + { + "epoch": 0.452547160900887, + "grad_norm": 0.862416684627533, + "learning_rate": 5.498176142390246e-05, + "loss": 0.264521050453186, + "step": 105410 + }, + { + "epoch": 0.45259009299090697, + "grad_norm": 2.1110284328460693, + "learning_rate": 5.4977449703784836e-05, + "loss": 0.29168670177459716, + "step": 105420 + }, + { + "epoch": 0.452633025080927, + "grad_norm": 0.08526672422885895, + "learning_rate": 5.4973137983667214e-05, + "loss": 0.12457125186920166, + "step": 105430 + }, + { + "epoch": 0.452675957170947, + "grad_norm": 0.030613580718636513, + "learning_rate": 5.496882626354958e-05, + "loss": 0.2875051021575928, + "step": 105440 + }, + { + "epoch": 0.452718889260967, + "grad_norm": 0.18582996726036072, + "learning_rate": 5.4964514543431955e-05, + "loss": 0.0911365807056427, + "step": 105450 + }, + { + "epoch": 0.452761821350987, + "grad_norm": 0.17034812271595, + "learning_rate": 5.496020282331433e-05, + "loss": 0.22752113342285157, + "step": 105460 + }, + { + "epoch": 0.452804753441007, + "grad_norm": 0.5781890749931335, + "learning_rate": 5.495589110319671e-05, + "loss": 0.3640397071838379, + "step": 105470 + }, + { + "epoch": 0.452847685531027, + "grad_norm": 4.468594074249268, + "learning_rate": 5.495157938307909e-05, + "loss": 0.20832431316375732, + "step": 105480 + }, + { + "epoch": 0.45289061762104704, + "grad_norm": 1.2272095680236816, + "learning_rate": 5.4947267662961464e-05, + "loss": 0.03705971837043762, + "step": 105490 + }, + { + "epoch": 0.45293354971106703, + "grad_norm": 0.00048346107359975576, + "learning_rate": 5.494295594284384e-05, + "loss": 0.3444437265396118, + "step": 105500 + }, + { + "epoch": 0.452976481801087, + "grad_norm": 0.0061003537848591805, + "learning_rate": 5.493864422272622e-05, + "loss": 0.2898735046386719, + "step": 105510 + }, + { + "epoch": 0.45301941389110706, + "grad_norm": 0.12422608584165573, + "learning_rate": 5.4934332502608596e-05, + "loss": 0.2567976713180542, + "step": 105520 + }, + { + "epoch": 0.45306234598112705, + "grad_norm": 0.3058219850063324, + "learning_rate": 5.493002078249097e-05, + "loss": 0.27314488887786864, + "step": 105530 + }, + { + "epoch": 0.45310527807114703, + "grad_norm": 0.1873115599155426, + "learning_rate": 5.4925709062373344e-05, + "loss": 0.14641257524490356, + "step": 105540 + }, + { + "epoch": 0.4531482101611671, + "grad_norm": 10.684101104736328, + "learning_rate": 5.492139734225572e-05, + "loss": 0.17531346082687377, + "step": 105550 + }, + { + "epoch": 0.45319114225118706, + "grad_norm": 0.10758557915687561, + "learning_rate": 5.49170856221381e-05, + "loss": 0.10541104078292847, + "step": 105560 + }, + { + "epoch": 0.4532340743412071, + "grad_norm": 0.1083778589963913, + "learning_rate": 5.4912773902020476e-05, + "loss": 0.16405218839645386, + "step": 105570 + }, + { + "epoch": 0.4532770064312271, + "grad_norm": 0.05349402502179146, + "learning_rate": 5.4908462181902854e-05, + "loss": 0.41372342109680177, + "step": 105580 + }, + { + "epoch": 0.4533199385212471, + "grad_norm": 0.02954784967005253, + "learning_rate": 5.490415046178523e-05, + "loss": 0.12565935850143434, + "step": 105590 + }, + { + "epoch": 0.4533628706112671, + "grad_norm": 0.10741877555847168, + "learning_rate": 5.489983874166761e-05, + "loss": 0.06284834146499634, + "step": 105600 + }, + { + "epoch": 0.4534058027012871, + "grad_norm": 0.0008346183458343148, + "learning_rate": 5.489552702154997e-05, + "loss": 0.09273313879966735, + "step": 105610 + }, + { + "epoch": 0.4534487347913071, + "grad_norm": 0.06353622674942017, + "learning_rate": 5.489121530143235e-05, + "loss": 0.08961839079856873, + "step": 105620 + }, + { + "epoch": 0.45349166688132714, + "grad_norm": 3.129249334335327, + "learning_rate": 5.4886903581314734e-05, + "loss": 0.26601376533508303, + "step": 105630 + }, + { + "epoch": 0.4535345989713471, + "grad_norm": 0.028794599696993828, + "learning_rate": 5.488259186119711e-05, + "loss": 0.07939417958259583, + "step": 105640 + }, + { + "epoch": 0.4535775310613671, + "grad_norm": 29.774160385131836, + "learning_rate": 5.487828014107949e-05, + "loss": 0.17747390270233154, + "step": 105650 + }, + { + "epoch": 0.45362046315138715, + "grad_norm": 0.9835271239280701, + "learning_rate": 5.4873968420961866e-05, + "loss": 0.25790364742279054, + "step": 105660 + }, + { + "epoch": 0.45366339524140714, + "grad_norm": 0.1610698699951172, + "learning_rate": 5.486965670084424e-05, + "loss": 0.23586997985839844, + "step": 105670 + }, + { + "epoch": 0.45370632733142713, + "grad_norm": 2.178144693374634, + "learning_rate": 5.486534498072662e-05, + "loss": 0.42916340827941896, + "step": 105680 + }, + { + "epoch": 0.45374925942144717, + "grad_norm": 0.03143854811787605, + "learning_rate": 5.4861033260608984e-05, + "loss": 0.2098684072494507, + "step": 105690 + }, + { + "epoch": 0.45379219151146716, + "grad_norm": 0.031577371060848236, + "learning_rate": 5.485672154049136e-05, + "loss": 0.23385446071624755, + "step": 105700 + }, + { + "epoch": 0.45383512360148714, + "grad_norm": 0.0011513188946992159, + "learning_rate": 5.485240982037374e-05, + "loss": 0.1985771417617798, + "step": 105710 + }, + { + "epoch": 0.4538780556915072, + "grad_norm": 1.0164504051208496, + "learning_rate": 5.4848098100256116e-05, + "loss": 0.17143828868865968, + "step": 105720 + }, + { + "epoch": 0.4539209877815272, + "grad_norm": 0.13020484149456024, + "learning_rate": 5.484378638013849e-05, + "loss": 0.12143751382827758, + "step": 105730 + }, + { + "epoch": 0.45396391987154716, + "grad_norm": 0.025720888748764992, + "learning_rate": 5.483947466002087e-05, + "loss": 0.20034878253936766, + "step": 105740 + }, + { + "epoch": 0.4540068519615672, + "grad_norm": 1.9531680345535278, + "learning_rate": 5.483516293990325e-05, + "loss": 0.4207042694091797, + "step": 105750 + }, + { + "epoch": 0.4540497840515872, + "grad_norm": 8.067658424377441, + "learning_rate": 5.4830851219785625e-05, + "loss": 0.18401105403900148, + "step": 105760 + }, + { + "epoch": 0.45409271614160723, + "grad_norm": 0.0019036878366023302, + "learning_rate": 5.4826539499667996e-05, + "loss": 0.3329533815383911, + "step": 105770 + }, + { + "epoch": 0.4541356482316272, + "grad_norm": 1.9565998315811157, + "learning_rate": 5.482222777955037e-05, + "loss": 0.3015397071838379, + "step": 105780 + }, + { + "epoch": 0.4541785803216472, + "grad_norm": 0.07071906328201294, + "learning_rate": 5.481791605943275e-05, + "loss": 0.09857931733131409, + "step": 105790 + }, + { + "epoch": 0.45422151241166725, + "grad_norm": 3.0820274353027344, + "learning_rate": 5.481360433931513e-05, + "loss": 0.15218595266342164, + "step": 105800 + }, + { + "epoch": 0.45426444450168724, + "grad_norm": 0.03130373731255531, + "learning_rate": 5.4809292619197505e-05, + "loss": 0.23463423252105714, + "step": 105810 + }, + { + "epoch": 0.4543073765917072, + "grad_norm": 1.1514767408370972, + "learning_rate": 5.480498089907988e-05, + "loss": 0.2280792474746704, + "step": 105820 + }, + { + "epoch": 0.45435030868172727, + "grad_norm": 0.039772145450115204, + "learning_rate": 5.480066917896226e-05, + "loss": 0.2732717037200928, + "step": 105830 + }, + { + "epoch": 0.45439324077174725, + "grad_norm": 1.2524688243865967, + "learning_rate": 5.479635745884464e-05, + "loss": 0.12637274265289306, + "step": 105840 + }, + { + "epoch": 0.45443617286176724, + "grad_norm": 0.054476819932460785, + "learning_rate": 5.4792045738727015e-05, + "loss": 0.20405046939849852, + "step": 105850 + }, + { + "epoch": 0.4544791049517873, + "grad_norm": 0.6834324598312378, + "learning_rate": 5.4787734018609385e-05, + "loss": 0.2571122646331787, + "step": 105860 + }, + { + "epoch": 0.45452203704180727, + "grad_norm": 0.043317679315805435, + "learning_rate": 5.478342229849176e-05, + "loss": 0.1396381139755249, + "step": 105870 + }, + { + "epoch": 0.45456496913182726, + "grad_norm": 0.023231295868754387, + "learning_rate": 5.477911057837414e-05, + "loss": 0.3477238416671753, + "step": 105880 + }, + { + "epoch": 0.4546079012218473, + "grad_norm": 4.941470146179199, + "learning_rate": 5.477479885825652e-05, + "loss": 0.2937323093414307, + "step": 105890 + }, + { + "epoch": 0.4546508333118673, + "grad_norm": 0.0022386584896594286, + "learning_rate": 5.4770487138138895e-05, + "loss": 0.1340307593345642, + "step": 105900 + }, + { + "epoch": 0.4546937654018873, + "grad_norm": 24.781116485595703, + "learning_rate": 5.476617541802127e-05, + "loss": 0.1788191556930542, + "step": 105910 + }, + { + "epoch": 0.4547366974919073, + "grad_norm": 0.0002703432401176542, + "learning_rate": 5.476186369790365e-05, + "loss": 0.08865943551063538, + "step": 105920 + }, + { + "epoch": 0.4547796295819273, + "grad_norm": 0.36172252893447876, + "learning_rate": 5.475755197778603e-05, + "loss": 0.1634334444999695, + "step": 105930 + }, + { + "epoch": 0.4548225616719473, + "grad_norm": 0.014767971821129322, + "learning_rate": 5.475324025766839e-05, + "loss": 0.19812369346618652, + "step": 105940 + }, + { + "epoch": 0.45486549376196733, + "grad_norm": 0.052083760499954224, + "learning_rate": 5.474892853755077e-05, + "loss": 0.1582965612411499, + "step": 105950 + }, + { + "epoch": 0.4549084258519873, + "grad_norm": 0.03402931988239288, + "learning_rate": 5.4744616817433145e-05, + "loss": 0.09308210015296936, + "step": 105960 + }, + { + "epoch": 0.4549513579420073, + "grad_norm": 0.001018917071633041, + "learning_rate": 5.474030509731552e-05, + "loss": 0.04058152437210083, + "step": 105970 + }, + { + "epoch": 0.45499429003202735, + "grad_norm": 0.00025437428848817945, + "learning_rate": 5.47359933771979e-05, + "loss": 0.21476302146911622, + "step": 105980 + }, + { + "epoch": 0.45503722212204734, + "grad_norm": 0.7949357032775879, + "learning_rate": 5.473168165708028e-05, + "loss": 0.22767140865325927, + "step": 105990 + }, + { + "epoch": 0.4550801542120674, + "grad_norm": 0.001639080117456615, + "learning_rate": 5.472736993696266e-05, + "loss": 0.25164577960968015, + "step": 106000 + }, + { + "epoch": 0.4550801542120674, + "eval_loss": 0.41482996940612793, + "eval_runtime": 27.1731, + "eval_samples_per_second": 3.68, + "eval_steps_per_second": 3.68, + "step": 106000 + }, + { + "epoch": 0.45512308630208737, + "grad_norm": 6.3070597648620605, + "learning_rate": 5.472305821684504e-05, + "loss": 0.29818522930145264, + "step": 106010 + }, + { + "epoch": 0.45516601839210735, + "grad_norm": 0.07904084026813507, + "learning_rate": 5.47187464967274e-05, + "loss": 0.22123868465423585, + "step": 106020 + }, + { + "epoch": 0.4552089504821274, + "grad_norm": 0.02109706960618496, + "learning_rate": 5.471443477660978e-05, + "loss": 0.2515869617462158, + "step": 106030 + }, + { + "epoch": 0.4552518825721474, + "grad_norm": 1.6628992557525635, + "learning_rate": 5.471012305649216e-05, + "loss": 0.17183525562286378, + "step": 106040 + }, + { + "epoch": 0.45529481466216737, + "grad_norm": 0.00023969076573848724, + "learning_rate": 5.4705811336374534e-05, + "loss": 0.1583714485168457, + "step": 106050 + }, + { + "epoch": 0.4553377467521874, + "grad_norm": 0.0442027822136879, + "learning_rate": 5.470149961625691e-05, + "loss": 0.06777175664901733, + "step": 106060 + }, + { + "epoch": 0.4553806788422074, + "grad_norm": 0.03461259976029396, + "learning_rate": 5.469718789613929e-05, + "loss": 0.3223992347717285, + "step": 106070 + }, + { + "epoch": 0.4554236109322274, + "grad_norm": 0.0009801493724808097, + "learning_rate": 5.4692876176021667e-05, + "loss": 0.07548410296440125, + "step": 106080 + }, + { + "epoch": 0.4554665430222474, + "grad_norm": 0.00773969478905201, + "learning_rate": 5.4688564455904044e-05, + "loss": 0.36380581855773925, + "step": 106090 + }, + { + "epoch": 0.4555094751122674, + "grad_norm": 0.9940565228462219, + "learning_rate": 5.4684252735786414e-05, + "loss": 0.20437259674072267, + "step": 106100 + }, + { + "epoch": 0.4555524072022874, + "grad_norm": 5.216660499572754, + "learning_rate": 5.467994101566879e-05, + "loss": 0.33365843296051023, + "step": 106110 + }, + { + "epoch": 0.45559533929230744, + "grad_norm": 0.005851436872035265, + "learning_rate": 5.467562929555117e-05, + "loss": 0.1647118330001831, + "step": 106120 + }, + { + "epoch": 0.45563827138232743, + "grad_norm": 0.002860630862414837, + "learning_rate": 5.4671317575433546e-05, + "loss": 0.12592718601226807, + "step": 106130 + }, + { + "epoch": 0.4556812034723474, + "grad_norm": 2.543055295944214, + "learning_rate": 5.4667005855315924e-05, + "loss": 0.06108694672584534, + "step": 106140 + }, + { + "epoch": 0.45572413556236746, + "grad_norm": 0.21334119141101837, + "learning_rate": 5.46626941351983e-05, + "loss": 0.1180303454399109, + "step": 106150 + }, + { + "epoch": 0.45576706765238745, + "grad_norm": 0.0378948450088501, + "learning_rate": 5.465838241508068e-05, + "loss": 0.2481198787689209, + "step": 106160 + }, + { + "epoch": 0.45580999974240743, + "grad_norm": 2.029277801513672, + "learning_rate": 5.4654070694963056e-05, + "loss": 0.2699997663497925, + "step": 106170 + }, + { + "epoch": 0.4558529318324275, + "grad_norm": 0.03349459171295166, + "learning_rate": 5.464975897484542e-05, + "loss": 0.20755038261413575, + "step": 106180 + }, + { + "epoch": 0.45589586392244746, + "grad_norm": 0.27105414867401123, + "learning_rate": 5.46454472547278e-05, + "loss": 0.17104097604751586, + "step": 106190 + }, + { + "epoch": 0.4559387960124675, + "grad_norm": 0.001149525516666472, + "learning_rate": 5.4641135534610174e-05, + "loss": 0.1373377561569214, + "step": 106200 + }, + { + "epoch": 0.4559817281024875, + "grad_norm": 0.08853041380643845, + "learning_rate": 5.463682381449255e-05, + "loss": 0.19982340335845947, + "step": 106210 + }, + { + "epoch": 0.4560246601925075, + "grad_norm": 0.7553684711456299, + "learning_rate": 5.4632512094374936e-05, + "loss": 0.24692888259887696, + "step": 106220 + }, + { + "epoch": 0.4560675922825275, + "grad_norm": 2.763723134994507, + "learning_rate": 5.462820037425731e-05, + "loss": 0.2985665321350098, + "step": 106230 + }, + { + "epoch": 0.4561105243725475, + "grad_norm": 0.9032507538795471, + "learning_rate": 5.462388865413969e-05, + "loss": 0.1298931360244751, + "step": 106240 + }, + { + "epoch": 0.4561534564625675, + "grad_norm": 0.11341998726129532, + "learning_rate": 5.461957693402207e-05, + "loss": 0.1300404191017151, + "step": 106250 + }, + { + "epoch": 0.45619638855258754, + "grad_norm": 0.3565244674682617, + "learning_rate": 5.4615265213904445e-05, + "loss": 0.26439990997314455, + "step": 106260 + }, + { + "epoch": 0.4562393206426075, + "grad_norm": 5.0023274421691895, + "learning_rate": 5.461095349378681e-05, + "loss": 0.39081289768218996, + "step": 106270 + }, + { + "epoch": 0.4562822527326275, + "grad_norm": 1.4395017623901367, + "learning_rate": 5.4606641773669186e-05, + "loss": 0.33194308280944823, + "step": 106280 + }, + { + "epoch": 0.45632518482264756, + "grad_norm": 1.879360318183899, + "learning_rate": 5.4602330053551564e-05, + "loss": 0.1768111228942871, + "step": 106290 + }, + { + "epoch": 0.45636811691266754, + "grad_norm": 0.0011820903746411204, + "learning_rate": 5.459801833343394e-05, + "loss": 0.2108830690383911, + "step": 106300 + }, + { + "epoch": 0.45641104900268753, + "grad_norm": 4.044092178344727, + "learning_rate": 5.459370661331632e-05, + "loss": 0.23956904411315919, + "step": 106310 + }, + { + "epoch": 0.4564539810927076, + "grad_norm": 0.018155014142394066, + "learning_rate": 5.4589394893198696e-05, + "loss": 0.14681684970855713, + "step": 106320 + }, + { + "epoch": 0.45649691318272756, + "grad_norm": 2.6040914058685303, + "learning_rate": 5.458508317308107e-05, + "loss": 0.43901724815368653, + "step": 106330 + }, + { + "epoch": 0.45653984527274755, + "grad_norm": 0.6595094799995422, + "learning_rate": 5.458077145296345e-05, + "loss": 0.3422011137008667, + "step": 106340 + }, + { + "epoch": 0.4565827773627676, + "grad_norm": 0.03562316671013832, + "learning_rate": 5.457645973284582e-05, + "loss": 0.273145866394043, + "step": 106350 + }, + { + "epoch": 0.4566257094527876, + "grad_norm": 0.10383918136358261, + "learning_rate": 5.45721480127282e-05, + "loss": 0.07594062089920044, + "step": 106360 + }, + { + "epoch": 0.45666864154280756, + "grad_norm": 5.044617176055908, + "learning_rate": 5.4567836292610576e-05, + "loss": 0.12228653430938721, + "step": 106370 + }, + { + "epoch": 0.4567115736328276, + "grad_norm": 1.1260477304458618, + "learning_rate": 5.456352457249295e-05, + "loss": 0.3477102518081665, + "step": 106380 + }, + { + "epoch": 0.4567545057228476, + "grad_norm": 1.9273653030395508, + "learning_rate": 5.455921285237533e-05, + "loss": 0.35800995826721194, + "step": 106390 + }, + { + "epoch": 0.4567974378128676, + "grad_norm": 0.11359955370426178, + "learning_rate": 5.455490113225771e-05, + "loss": 0.15946197509765625, + "step": 106400 + }, + { + "epoch": 0.4568403699028876, + "grad_norm": 0.5352416634559631, + "learning_rate": 5.4550589412140085e-05, + "loss": 0.11623773574829102, + "step": 106410 + }, + { + "epoch": 0.4568833019929076, + "grad_norm": 1.6490083932876587, + "learning_rate": 5.454627769202246e-05, + "loss": 0.3729348659515381, + "step": 106420 + }, + { + "epoch": 0.45692623408292765, + "grad_norm": 2.265371561050415, + "learning_rate": 5.4541965971904826e-05, + "loss": 0.21114826202392578, + "step": 106430 + }, + { + "epoch": 0.45696916617294764, + "grad_norm": 5.321807384490967, + "learning_rate": 5.453765425178721e-05, + "loss": 0.21526637077331542, + "step": 106440 + }, + { + "epoch": 0.4570120982629676, + "grad_norm": 3.7925004959106445, + "learning_rate": 5.453334253166959e-05, + "loss": 0.4127657890319824, + "step": 106450 + }, + { + "epoch": 0.45705503035298767, + "grad_norm": 0.0988013967871666, + "learning_rate": 5.4529030811551965e-05, + "loss": 0.13126038312911986, + "step": 106460 + }, + { + "epoch": 0.45709796244300765, + "grad_norm": 0.010926156304776669, + "learning_rate": 5.452471909143434e-05, + "loss": 0.278293776512146, + "step": 106470 + }, + { + "epoch": 0.45714089453302764, + "grad_norm": 0.004582703113555908, + "learning_rate": 5.452040737131672e-05, + "loss": 0.4080946922302246, + "step": 106480 + }, + { + "epoch": 0.4571838266230477, + "grad_norm": 0.10418860614299774, + "learning_rate": 5.45160956511991e-05, + "loss": 0.1536099433898926, + "step": 106490 + }, + { + "epoch": 0.45722675871306767, + "grad_norm": 0.017425889149308205, + "learning_rate": 5.4511783931081474e-05, + "loss": 0.09255735278129577, + "step": 106500 + }, + { + "epoch": 0.45726969080308766, + "grad_norm": 0.9808640480041504, + "learning_rate": 5.450747221096384e-05, + "loss": 0.45423202514648436, + "step": 106510 + }, + { + "epoch": 0.4573126228931077, + "grad_norm": 0.5739647746086121, + "learning_rate": 5.4503160490846215e-05, + "loss": 0.2469416856765747, + "step": 106520 + }, + { + "epoch": 0.4573555549831277, + "grad_norm": 14.410594940185547, + "learning_rate": 5.449884877072859e-05, + "loss": 0.24274146556854248, + "step": 106530 + }, + { + "epoch": 0.4573984870731477, + "grad_norm": 9.88992691040039, + "learning_rate": 5.449453705061097e-05, + "loss": 0.23896820545196534, + "step": 106540 + }, + { + "epoch": 0.4574414191631677, + "grad_norm": 0.022617029026150703, + "learning_rate": 5.449022533049335e-05, + "loss": 0.11045522689819336, + "step": 106550 + }, + { + "epoch": 0.4574843512531877, + "grad_norm": 1.0978119373321533, + "learning_rate": 5.4485913610375725e-05, + "loss": 0.25470197200775146, + "step": 106560 + }, + { + "epoch": 0.4575272833432077, + "grad_norm": 0.12575271725654602, + "learning_rate": 5.44816018902581e-05, + "loss": 0.17973484992980956, + "step": 106570 + }, + { + "epoch": 0.45757021543322773, + "grad_norm": 1.3486829996109009, + "learning_rate": 5.447729017014048e-05, + "loss": 0.2512737035751343, + "step": 106580 + }, + { + "epoch": 0.4576131475232477, + "grad_norm": 1.4863824844360352, + "learning_rate": 5.447297845002286e-05, + "loss": 0.2442696809768677, + "step": 106590 + }, + { + "epoch": 0.4576560796132677, + "grad_norm": 0.02740716002881527, + "learning_rate": 5.446866672990523e-05, + "loss": 0.12065447568893432, + "step": 106600 + }, + { + "epoch": 0.45769901170328775, + "grad_norm": 0.007653544191271067, + "learning_rate": 5.4464355009787605e-05, + "loss": 0.09701498150825501, + "step": 106610 + }, + { + "epoch": 0.45774194379330774, + "grad_norm": 0.006488314364105463, + "learning_rate": 5.446004328966998e-05, + "loss": 0.12032349109649658, + "step": 106620 + }, + { + "epoch": 0.4577848758833278, + "grad_norm": 2.8242900371551514, + "learning_rate": 5.445573156955236e-05, + "loss": 0.3414148330688477, + "step": 106630 + }, + { + "epoch": 0.45782780797334777, + "grad_norm": 1.6868880987167358, + "learning_rate": 5.445141984943474e-05, + "loss": 0.2605616331100464, + "step": 106640 + }, + { + "epoch": 0.45787074006336775, + "grad_norm": 0.0032382814679294825, + "learning_rate": 5.4447108129317114e-05, + "loss": 0.16553410291671752, + "step": 106650 + }, + { + "epoch": 0.4579136721533878, + "grad_norm": 6.745307922363281, + "learning_rate": 5.444279640919949e-05, + "loss": 0.24076588153839112, + "step": 106660 + }, + { + "epoch": 0.4579566042434078, + "grad_norm": 0.6660043597221375, + "learning_rate": 5.443848468908187e-05, + "loss": 0.22255656719207764, + "step": 106670 + }, + { + "epoch": 0.45799953633342777, + "grad_norm": 0.03691640496253967, + "learning_rate": 5.443417296896424e-05, + "loss": 0.21677632331848146, + "step": 106680 + }, + { + "epoch": 0.4580424684234478, + "grad_norm": 1.0660126209259033, + "learning_rate": 5.442986124884662e-05, + "loss": 0.4634716510772705, + "step": 106690 + }, + { + "epoch": 0.4580854005134678, + "grad_norm": 0.06456798315048218, + "learning_rate": 5.4425549528728994e-05, + "loss": 0.14308961629867553, + "step": 106700 + }, + { + "epoch": 0.4581283326034878, + "grad_norm": 5.615670204162598, + "learning_rate": 5.442123780861137e-05, + "loss": 0.28072869777679443, + "step": 106710 + }, + { + "epoch": 0.45817126469350783, + "grad_norm": 1.6977615356445312, + "learning_rate": 5.441692608849375e-05, + "loss": 0.22022032737731934, + "step": 106720 + }, + { + "epoch": 0.4582141967835278, + "grad_norm": 0.4748421907424927, + "learning_rate": 5.4412614368376126e-05, + "loss": 0.12203251123428345, + "step": 106730 + }, + { + "epoch": 0.4582571288735478, + "grad_norm": 0.09836887568235397, + "learning_rate": 5.44083026482585e-05, + "loss": 0.1722171664237976, + "step": 106740 + }, + { + "epoch": 0.45830006096356785, + "grad_norm": 0.15481862425804138, + "learning_rate": 5.440399092814088e-05, + "loss": 0.22251276969909667, + "step": 106750 + }, + { + "epoch": 0.45834299305358783, + "grad_norm": 0.6792683005332947, + "learning_rate": 5.4399679208023245e-05, + "loss": 0.15424517393112183, + "step": 106760 + }, + { + "epoch": 0.4583859251436078, + "grad_norm": 2.5286638736724854, + "learning_rate": 5.439536748790562e-05, + "loss": 0.23278870582580566, + "step": 106770 + }, + { + "epoch": 0.45842885723362786, + "grad_norm": 0.0009078033617697656, + "learning_rate": 5.4391055767788e-05, + "loss": 0.343894362449646, + "step": 106780 + }, + { + "epoch": 0.45847178932364785, + "grad_norm": 0.2713935077190399, + "learning_rate": 5.4386744047670377e-05, + "loss": 0.22668204307556153, + "step": 106790 + }, + { + "epoch": 0.45851472141366784, + "grad_norm": 5.523571491241455, + "learning_rate": 5.4382432327552754e-05, + "loss": 0.17648887634277344, + "step": 106800 + }, + { + "epoch": 0.4585576535036879, + "grad_norm": 0.008552341721951962, + "learning_rate": 5.437812060743513e-05, + "loss": 0.09792098999023438, + "step": 106810 + }, + { + "epoch": 0.45860058559370787, + "grad_norm": 2.73484468460083, + "learning_rate": 5.4373808887317515e-05, + "loss": 0.12865368127822877, + "step": 106820 + }, + { + "epoch": 0.45864351768372785, + "grad_norm": 0.06436431407928467, + "learning_rate": 5.436949716719989e-05, + "loss": 0.28464598655700685, + "step": 106830 + }, + { + "epoch": 0.4586864497737479, + "grad_norm": 0.030549952760338783, + "learning_rate": 5.4365185447082256e-05, + "loss": 0.01779765635728836, + "step": 106840 + }, + { + "epoch": 0.4587293818637679, + "grad_norm": 0.03825048729777336, + "learning_rate": 5.4360873726964634e-05, + "loss": 0.21697518825531006, + "step": 106850 + }, + { + "epoch": 0.4587723139537879, + "grad_norm": 0.00542847765609622, + "learning_rate": 5.435656200684701e-05, + "loss": 0.04956367015838623, + "step": 106860 + }, + { + "epoch": 0.4588152460438079, + "grad_norm": 0.3514080345630646, + "learning_rate": 5.435225028672939e-05, + "loss": 0.18809044361114502, + "step": 106870 + }, + { + "epoch": 0.4588581781338279, + "grad_norm": 0.012438662350177765, + "learning_rate": 5.4347938566611766e-05, + "loss": 0.08882975578308105, + "step": 106880 + }, + { + "epoch": 0.45890111022384794, + "grad_norm": 0.25532791018486023, + "learning_rate": 5.434362684649414e-05, + "loss": 0.24946775436401367, + "step": 106890 + }, + { + "epoch": 0.45894404231386793, + "grad_norm": 5.779187202453613, + "learning_rate": 5.433931512637652e-05, + "loss": 0.2366427183151245, + "step": 106900 + }, + { + "epoch": 0.4589869744038879, + "grad_norm": 0.01807689107954502, + "learning_rate": 5.43350034062589e-05, + "loss": 0.20517139434814452, + "step": 106910 + }, + { + "epoch": 0.45902990649390796, + "grad_norm": 0.014423569664359093, + "learning_rate": 5.433069168614127e-05, + "loss": 0.15260753631591797, + "step": 106920 + }, + { + "epoch": 0.45907283858392794, + "grad_norm": 0.008579591289162636, + "learning_rate": 5.4326379966023646e-05, + "loss": 0.23585269451141358, + "step": 106930 + }, + { + "epoch": 0.45911577067394793, + "grad_norm": 1.2639870643615723, + "learning_rate": 5.432206824590602e-05, + "loss": 0.21346158981323243, + "step": 106940 + }, + { + "epoch": 0.459158702763968, + "grad_norm": 0.5312317609786987, + "learning_rate": 5.43177565257884e-05, + "loss": 0.2956232070922852, + "step": 106950 + }, + { + "epoch": 0.45920163485398796, + "grad_norm": 0.003094247542321682, + "learning_rate": 5.431344480567078e-05, + "loss": 0.15500446557998657, + "step": 106960 + }, + { + "epoch": 0.45924456694400795, + "grad_norm": 0.17038699984550476, + "learning_rate": 5.4309133085553155e-05, + "loss": 0.04464645087718964, + "step": 106970 + }, + { + "epoch": 0.459287499034028, + "grad_norm": 0.003357459092512727, + "learning_rate": 5.430482136543553e-05, + "loss": 0.1722028970718384, + "step": 106980 + }, + { + "epoch": 0.459330431124048, + "grad_norm": 0.051209039986133575, + "learning_rate": 5.430050964531791e-05, + "loss": 0.30563194751739503, + "step": 106990 + }, + { + "epoch": 0.45937336321406796, + "grad_norm": 0.14684444665908813, + "learning_rate": 5.429619792520029e-05, + "loss": 0.11479036808013916, + "step": 107000 + }, + { + "epoch": 0.45937336321406796, + "eval_loss": 0.410770058631897, + "eval_runtime": 27.1924, + "eval_samples_per_second": 3.677, + "eval_steps_per_second": 3.677, + "step": 107000 + }, + { + "epoch": 0.459416295304088, + "grad_norm": 0.0816546231508255, + "learning_rate": 5.429188620508265e-05, + "loss": 0.11942390203475953, + "step": 107010 + }, + { + "epoch": 0.459459227394108, + "grad_norm": 1.5017467737197876, + "learning_rate": 5.428757448496503e-05, + "loss": 0.23939263820648193, + "step": 107020 + }, + { + "epoch": 0.459502159484128, + "grad_norm": 0.0422968752682209, + "learning_rate": 5.4283262764847406e-05, + "loss": 0.09964839220046998, + "step": 107030 + }, + { + "epoch": 0.459545091574148, + "grad_norm": 0.0042203571647405624, + "learning_rate": 5.427895104472979e-05, + "loss": 0.3719061613082886, + "step": 107040 + }, + { + "epoch": 0.459588023664168, + "grad_norm": 0.25797560811042786, + "learning_rate": 5.427463932461217e-05, + "loss": 0.0998129665851593, + "step": 107050 + }, + { + "epoch": 0.45963095575418805, + "grad_norm": 1.907709002494812, + "learning_rate": 5.4270327604494544e-05, + "loss": 0.31435647010803225, + "step": 107060 + }, + { + "epoch": 0.45967388784420804, + "grad_norm": 0.050441499799489975, + "learning_rate": 5.426601588437692e-05, + "loss": 0.007012385129928589, + "step": 107070 + }, + { + "epoch": 0.459716819934228, + "grad_norm": 0.015389510430395603, + "learning_rate": 5.42617041642593e-05, + "loss": 0.12824069261550902, + "step": 107080 + }, + { + "epoch": 0.45975975202424807, + "grad_norm": 1.227673888206482, + "learning_rate": 5.425739244414166e-05, + "loss": 0.40938620567321776, + "step": 107090 + }, + { + "epoch": 0.45980268411426806, + "grad_norm": 0.0380869060754776, + "learning_rate": 5.425308072402404e-05, + "loss": 0.3142383337020874, + "step": 107100 + }, + { + "epoch": 0.45984561620428804, + "grad_norm": 0.0016160336090251803, + "learning_rate": 5.424876900390642e-05, + "loss": 0.21405341625213622, + "step": 107110 + }, + { + "epoch": 0.4598885482943081, + "grad_norm": 3.592622995376587, + "learning_rate": 5.4244457283788795e-05, + "loss": 0.1815681576728821, + "step": 107120 + }, + { + "epoch": 0.4599314803843281, + "grad_norm": 0.01647309958934784, + "learning_rate": 5.424014556367117e-05, + "loss": 0.21015794277191163, + "step": 107130 + }, + { + "epoch": 0.45997441247434806, + "grad_norm": 1.1071962118148804, + "learning_rate": 5.423583384355355e-05, + "loss": 0.36042187213897703, + "step": 107140 + }, + { + "epoch": 0.4600173445643681, + "grad_norm": 0.0012331603793427348, + "learning_rate": 5.423152212343593e-05, + "loss": 0.11302400827407837, + "step": 107150 + }, + { + "epoch": 0.4600602766543881, + "grad_norm": 0.006316584534943104, + "learning_rate": 5.4227210403318304e-05, + "loss": 0.3362801313400269, + "step": 107160 + }, + { + "epoch": 0.4601032087444081, + "grad_norm": 0.020972879603505135, + "learning_rate": 5.4222898683200675e-05, + "loss": 0.10412032604217529, + "step": 107170 + }, + { + "epoch": 0.4601461408344281, + "grad_norm": 5.101585865020752, + "learning_rate": 5.421858696308305e-05, + "loss": 0.18556244373321534, + "step": 107180 + }, + { + "epoch": 0.4601890729244481, + "grad_norm": 0.0018986143404617906, + "learning_rate": 5.421427524296543e-05, + "loss": 0.15888286828994752, + "step": 107190 + }, + { + "epoch": 0.4602320050144681, + "grad_norm": 1.5135869979858398, + "learning_rate": 5.420996352284781e-05, + "loss": 0.37897820472717286, + "step": 107200 + }, + { + "epoch": 0.46027493710448814, + "grad_norm": 0.06372527033090591, + "learning_rate": 5.4205651802730184e-05, + "loss": 0.29553487300872805, + "step": 107210 + }, + { + "epoch": 0.4603178691945081, + "grad_norm": 1.9978270530700684, + "learning_rate": 5.420134008261256e-05, + "loss": 0.4286454677581787, + "step": 107220 + }, + { + "epoch": 0.4603608012845281, + "grad_norm": 0.21217115223407745, + "learning_rate": 5.419702836249494e-05, + "loss": 0.2591038703918457, + "step": 107230 + }, + { + "epoch": 0.46040373337454815, + "grad_norm": 0.09320700168609619, + "learning_rate": 5.4192716642377316e-05, + "loss": 0.26857402324676516, + "step": 107240 + }, + { + "epoch": 0.46044666546456814, + "grad_norm": 1.601436734199524, + "learning_rate": 5.418840492225968e-05, + "loss": 0.19982035160064698, + "step": 107250 + }, + { + "epoch": 0.4604895975545881, + "grad_norm": 0.6431751847267151, + "learning_rate": 5.4184093202142064e-05, + "loss": 0.31042304039001467, + "step": 107260 + }, + { + "epoch": 0.46053252964460817, + "grad_norm": 6.150503635406494, + "learning_rate": 5.417978148202444e-05, + "loss": 0.2688772439956665, + "step": 107270 + }, + { + "epoch": 0.46057546173462816, + "grad_norm": 1.0778108835220337, + "learning_rate": 5.417546976190682e-05, + "loss": 0.13753099441528321, + "step": 107280 + }, + { + "epoch": 0.4606183938246482, + "grad_norm": 1.964769721031189, + "learning_rate": 5.4171158041789196e-05, + "loss": 0.3952221393585205, + "step": 107290 + }, + { + "epoch": 0.4606613259146682, + "grad_norm": 0.42801520228385925, + "learning_rate": 5.4166846321671574e-05, + "loss": 0.38815133571624755, + "step": 107300 + }, + { + "epoch": 0.46070425800468817, + "grad_norm": 0.25493547320365906, + "learning_rate": 5.416253460155395e-05, + "loss": 0.03624766170978546, + "step": 107310 + }, + { + "epoch": 0.4607471900947082, + "grad_norm": 0.002436437178403139, + "learning_rate": 5.415822288143633e-05, + "loss": 0.19822332859039307, + "step": 107320 + }, + { + "epoch": 0.4607901221847282, + "grad_norm": 0.02002480812370777, + "learning_rate": 5.4153911161318706e-05, + "loss": 0.2200617551803589, + "step": 107330 + }, + { + "epoch": 0.4608330542747482, + "grad_norm": 0.013750090263783932, + "learning_rate": 5.414959944120107e-05, + "loss": 0.15083693265914916, + "step": 107340 + }, + { + "epoch": 0.46087598636476823, + "grad_norm": 7.042511463165283, + "learning_rate": 5.414528772108345e-05, + "loss": 0.03854614496231079, + "step": 107350 + }, + { + "epoch": 0.4609189184547882, + "grad_norm": 0.013194686733186245, + "learning_rate": 5.4140976000965824e-05, + "loss": 0.25104031562805174, + "step": 107360 + }, + { + "epoch": 0.4609618505448082, + "grad_norm": 0.3216243386268616, + "learning_rate": 5.41366642808482e-05, + "loss": 0.20811998844146729, + "step": 107370 + }, + { + "epoch": 0.46100478263482825, + "grad_norm": 0.005275218281894922, + "learning_rate": 5.413235256073058e-05, + "loss": 0.18954060077667237, + "step": 107380 + }, + { + "epoch": 0.46104771472484823, + "grad_norm": 1.6544564962387085, + "learning_rate": 5.4128040840612956e-05, + "loss": 0.37151012420654295, + "step": 107390 + }, + { + "epoch": 0.4610906468148682, + "grad_norm": 0.029340092092752457, + "learning_rate": 5.4123729120495333e-05, + "loss": 0.10839523077011108, + "step": 107400 + }, + { + "epoch": 0.46113357890488826, + "grad_norm": 0.0010873668361455202, + "learning_rate": 5.411941740037772e-05, + "loss": 0.22068979740142822, + "step": 107410 + }, + { + "epoch": 0.46117651099490825, + "grad_norm": 0.7604783177375793, + "learning_rate": 5.411510568026008e-05, + "loss": 0.29136641025543214, + "step": 107420 + }, + { + "epoch": 0.46121944308492824, + "grad_norm": 2.561589241027832, + "learning_rate": 5.411079396014246e-05, + "loss": 0.2376845359802246, + "step": 107430 + }, + { + "epoch": 0.4612623751749483, + "grad_norm": 0.03527192771434784, + "learning_rate": 5.4106482240024836e-05, + "loss": 0.3317641496658325, + "step": 107440 + }, + { + "epoch": 0.46130530726496827, + "grad_norm": 0.05285614728927612, + "learning_rate": 5.4102170519907213e-05, + "loss": 0.2993663549423218, + "step": 107450 + }, + { + "epoch": 0.46134823935498825, + "grad_norm": 0.08579917997121811, + "learning_rate": 5.409785879978959e-05, + "loss": 0.454376745223999, + "step": 107460 + }, + { + "epoch": 0.4613911714450083, + "grad_norm": 2.122178316116333, + "learning_rate": 5.409354707967197e-05, + "loss": 0.35166881084442136, + "step": 107470 + }, + { + "epoch": 0.4614341035350283, + "grad_norm": 1.0269190073013306, + "learning_rate": 5.4089235359554345e-05, + "loss": 0.2291714906692505, + "step": 107480 + }, + { + "epoch": 0.4614770356250483, + "grad_norm": 0.23851673305034637, + "learning_rate": 5.408492363943672e-05, + "loss": 0.12133759260177612, + "step": 107490 + }, + { + "epoch": 0.4615199677150683, + "grad_norm": 3.512455940246582, + "learning_rate": 5.408061191931909e-05, + "loss": 0.14880177974700928, + "step": 107500 + }, + { + "epoch": 0.4615628998050883, + "grad_norm": 0.008847239427268505, + "learning_rate": 5.407630019920147e-05, + "loss": 0.2780074834823608, + "step": 107510 + }, + { + "epoch": 0.46160583189510834, + "grad_norm": 0.14573852717876434, + "learning_rate": 5.407198847908385e-05, + "loss": 0.13713104724884034, + "step": 107520 + }, + { + "epoch": 0.46164876398512833, + "grad_norm": 0.02383585087954998, + "learning_rate": 5.4067676758966225e-05, + "loss": 0.4326714038848877, + "step": 107530 + }, + { + "epoch": 0.4616916960751483, + "grad_norm": 2.5402002334594727, + "learning_rate": 5.40633650388486e-05, + "loss": 0.39748458862304686, + "step": 107540 + }, + { + "epoch": 0.46173462816516836, + "grad_norm": 0.00459776958450675, + "learning_rate": 5.405905331873098e-05, + "loss": 0.11582623720169068, + "step": 107550 + }, + { + "epoch": 0.46177756025518835, + "grad_norm": 0.00286311749368906, + "learning_rate": 5.405474159861336e-05, + "loss": 0.28338334560394285, + "step": 107560 + }, + { + "epoch": 0.46182049234520833, + "grad_norm": 2.2359275817871094, + "learning_rate": 5.4050429878495735e-05, + "loss": 0.19136548042297363, + "step": 107570 + }, + { + "epoch": 0.4618634244352284, + "grad_norm": 1.865877389907837, + "learning_rate": 5.40461181583781e-05, + "loss": 0.21330225467681885, + "step": 107580 + }, + { + "epoch": 0.46190635652524836, + "grad_norm": 0.017468079924583435, + "learning_rate": 5.4041806438260476e-05, + "loss": 0.08362210988998413, + "step": 107590 + }, + { + "epoch": 0.46194928861526835, + "grad_norm": 0.003552723675966263, + "learning_rate": 5.403749471814285e-05, + "loss": 0.3091665983200073, + "step": 107600 + }, + { + "epoch": 0.4619922207052884, + "grad_norm": 0.2940342426300049, + "learning_rate": 5.403318299802523e-05, + "loss": 0.22590343952178954, + "step": 107610 + }, + { + "epoch": 0.4620351527953084, + "grad_norm": 0.058316994458436966, + "learning_rate": 5.402887127790761e-05, + "loss": 0.20781214237213136, + "step": 107620 + }, + { + "epoch": 0.46207808488532837, + "grad_norm": 0.006735431496053934, + "learning_rate": 5.402455955778999e-05, + "loss": 0.41092872619628906, + "step": 107630 + }, + { + "epoch": 0.4621210169753484, + "grad_norm": 0.0062255957163870335, + "learning_rate": 5.402024783767237e-05, + "loss": 0.23068614006042482, + "step": 107640 + }, + { + "epoch": 0.4621639490653684, + "grad_norm": 0.05530041456222534, + "learning_rate": 5.401593611755475e-05, + "loss": 0.15006642341613768, + "step": 107650 + }, + { + "epoch": 0.4622068811553884, + "grad_norm": 0.00756365992128849, + "learning_rate": 5.4011624397437124e-05, + "loss": 0.2585622310638428, + "step": 107660 + }, + { + "epoch": 0.4622498132454084, + "grad_norm": 0.8214569687843323, + "learning_rate": 5.400731267731949e-05, + "loss": 0.1466231107711792, + "step": 107670 + }, + { + "epoch": 0.4622927453354284, + "grad_norm": 5.284905910491943, + "learning_rate": 5.4003000957201865e-05, + "loss": 0.4204678535461426, + "step": 107680 + }, + { + "epoch": 0.4623356774254484, + "grad_norm": 3.108691692352295, + "learning_rate": 5.399868923708424e-05, + "loss": 0.2649915456771851, + "step": 107690 + }, + { + "epoch": 0.46237860951546844, + "grad_norm": 3.194406032562256, + "learning_rate": 5.399437751696662e-05, + "loss": 0.2339545726776123, + "step": 107700 + }, + { + "epoch": 0.46242154160548843, + "grad_norm": 1.4864603281021118, + "learning_rate": 5.3990065796849e-05, + "loss": 0.2706879138946533, + "step": 107710 + }, + { + "epoch": 0.46246447369550847, + "grad_norm": 1.7163327932357788, + "learning_rate": 5.3985754076731375e-05, + "loss": 0.26778957843780515, + "step": 107720 + }, + { + "epoch": 0.46250740578552846, + "grad_norm": 0.005073135253041983, + "learning_rate": 5.398144235661375e-05, + "loss": 0.18886086940765381, + "step": 107730 + }, + { + "epoch": 0.46255033787554845, + "grad_norm": 3.492783308029175, + "learning_rate": 5.397713063649613e-05, + "loss": 0.3263582468032837, + "step": 107740 + }, + { + "epoch": 0.4625932699655685, + "grad_norm": 1.7698097229003906, + "learning_rate": 5.39728189163785e-05, + "loss": 0.3023443460464478, + "step": 107750 + }, + { + "epoch": 0.4626362020555885, + "grad_norm": 0.1830163300037384, + "learning_rate": 5.396850719626088e-05, + "loss": 0.168390691280365, + "step": 107760 + }, + { + "epoch": 0.46267913414560846, + "grad_norm": 0.008062989450991154, + "learning_rate": 5.3964195476143255e-05, + "loss": 0.2124844789505005, + "step": 107770 + }, + { + "epoch": 0.4627220662356285, + "grad_norm": 0.07734813541173935, + "learning_rate": 5.395988375602563e-05, + "loss": 0.20680534839630127, + "step": 107780 + }, + { + "epoch": 0.4627649983256485, + "grad_norm": 0.07653836160898209, + "learning_rate": 5.395557203590801e-05, + "loss": 0.14306381940841675, + "step": 107790 + }, + { + "epoch": 0.4628079304156685, + "grad_norm": 0.08502493053674698, + "learning_rate": 5.3951260315790387e-05, + "loss": 0.07291386127471924, + "step": 107800 + }, + { + "epoch": 0.4628508625056885, + "grad_norm": 2.5778799057006836, + "learning_rate": 5.3946948595672764e-05, + "loss": 0.28781633377075194, + "step": 107810 + }, + { + "epoch": 0.4628937945957085, + "grad_norm": 0.006881984416395426, + "learning_rate": 5.394263687555514e-05, + "loss": 0.12385448217391967, + "step": 107820 + }, + { + "epoch": 0.4629367266857285, + "grad_norm": 1.7388111352920532, + "learning_rate": 5.3938325155437505e-05, + "loss": 0.41140213012695315, + "step": 107830 + }, + { + "epoch": 0.46297965877574854, + "grad_norm": 1.329716682434082, + "learning_rate": 5.393401343531988e-05, + "loss": 0.1415262460708618, + "step": 107840 + }, + { + "epoch": 0.4630225908657685, + "grad_norm": 3.928546905517578, + "learning_rate": 5.3929701715202266e-05, + "loss": 0.25690150260925293, + "step": 107850 + }, + { + "epoch": 0.4630655229557885, + "grad_norm": 0.09528861194849014, + "learning_rate": 5.3925389995084644e-05, + "loss": 0.1600080370903015, + "step": 107860 + }, + { + "epoch": 0.46310845504580855, + "grad_norm": 0.04780289903283119, + "learning_rate": 5.392107827496702e-05, + "loss": 0.0948940396308899, + "step": 107870 + }, + { + "epoch": 0.46315138713582854, + "grad_norm": 0.505102813243866, + "learning_rate": 5.39167665548494e-05, + "loss": 0.20008227825164795, + "step": 107880 + }, + { + "epoch": 0.4631943192258485, + "grad_norm": 1.3333100080490112, + "learning_rate": 5.3912454834731776e-05, + "loss": 0.38938562870025634, + "step": 107890 + }, + { + "epoch": 0.46323725131586857, + "grad_norm": 0.10062599927186966, + "learning_rate": 5.390814311461415e-05, + "loss": 0.1558384656906128, + "step": 107900 + }, + { + "epoch": 0.46328018340588856, + "grad_norm": 0.2624666690826416, + "learning_rate": 5.390383139449652e-05, + "loss": 0.34660289287567136, + "step": 107910 + }, + { + "epoch": 0.4633231154959086, + "grad_norm": 5.808780193328857, + "learning_rate": 5.3899519674378894e-05, + "loss": 0.2815376043319702, + "step": 107920 + }, + { + "epoch": 0.4633660475859286, + "grad_norm": 0.02222195826470852, + "learning_rate": 5.389520795426127e-05, + "loss": 0.1592707395553589, + "step": 107930 + }, + { + "epoch": 0.4634089796759486, + "grad_norm": 0.1832973062992096, + "learning_rate": 5.389089623414365e-05, + "loss": 0.051647549867630003, + "step": 107940 + }, + { + "epoch": 0.4634519117659686, + "grad_norm": 1.8670419454574585, + "learning_rate": 5.3886584514026026e-05, + "loss": 0.17604377269744872, + "step": 107950 + }, + { + "epoch": 0.4634948438559886, + "grad_norm": 2.1626927852630615, + "learning_rate": 5.3882272793908404e-05, + "loss": 0.2858207941055298, + "step": 107960 + }, + { + "epoch": 0.4635377759460086, + "grad_norm": 1.8692584037780762, + "learning_rate": 5.387796107379078e-05, + "loss": 0.28811016082763674, + "step": 107970 + }, + { + "epoch": 0.46358070803602863, + "grad_norm": 1.229792594909668, + "learning_rate": 5.387364935367316e-05, + "loss": 0.19653548002243043, + "step": 107980 + }, + { + "epoch": 0.4636236401260486, + "grad_norm": 1.7526077032089233, + "learning_rate": 5.386933763355553e-05, + "loss": 0.10595974922180176, + "step": 107990 + }, + { + "epoch": 0.4636665722160686, + "grad_norm": 0.05710328370332718, + "learning_rate": 5.3865025913437906e-05, + "loss": 0.21571879386901854, + "step": 108000 + }, + { + "epoch": 0.4636665722160686, + "eval_loss": 0.41005194187164307, + "eval_runtime": 27.2514, + "eval_samples_per_second": 3.67, + "eval_steps_per_second": 3.67, + "step": 108000 + }, + { + "epoch": 0.46370950430608865, + "grad_norm": 0.7972310185432434, + "learning_rate": 5.3860714193320284e-05, + "loss": 0.17171356678009034, + "step": 108010 + }, + { + "epoch": 0.46375243639610864, + "grad_norm": 2.4184482097625732, + "learning_rate": 5.385640247320266e-05, + "loss": 0.33305227756500244, + "step": 108020 + }, + { + "epoch": 0.4637953684861286, + "grad_norm": 1.5586384534835815, + "learning_rate": 5.385209075308504e-05, + "loss": 0.2196629285812378, + "step": 108030 + }, + { + "epoch": 0.46383830057614867, + "grad_norm": 3.1902902126312256, + "learning_rate": 5.3847779032967416e-05, + "loss": 0.2836449146270752, + "step": 108040 + }, + { + "epoch": 0.46388123266616865, + "grad_norm": 0.7188327312469482, + "learning_rate": 5.384346731284979e-05, + "loss": 0.2961325168609619, + "step": 108050 + }, + { + "epoch": 0.46392416475618864, + "grad_norm": 0.04909314587712288, + "learning_rate": 5.383915559273217e-05, + "loss": 0.22378733158111572, + "step": 108060 + }, + { + "epoch": 0.4639670968462087, + "grad_norm": 0.5020912885665894, + "learning_rate": 5.383484387261455e-05, + "loss": 0.21214513778686522, + "step": 108070 + }, + { + "epoch": 0.46401002893622867, + "grad_norm": 1.335729718208313, + "learning_rate": 5.383053215249692e-05, + "loss": 0.2735942602157593, + "step": 108080 + }, + { + "epoch": 0.46405296102624866, + "grad_norm": 0.004877585452049971, + "learning_rate": 5.3826220432379296e-05, + "loss": 0.10161428451538086, + "step": 108090 + }, + { + "epoch": 0.4640958931162687, + "grad_norm": 0.014859105460345745, + "learning_rate": 5.382190871226167e-05, + "loss": 0.19523313045501708, + "step": 108100 + }, + { + "epoch": 0.4641388252062887, + "grad_norm": 0.0427483394742012, + "learning_rate": 5.381759699214405e-05, + "loss": 0.3561259746551514, + "step": 108110 + }, + { + "epoch": 0.4641817572963087, + "grad_norm": 0.011663105338811874, + "learning_rate": 5.381328527202643e-05, + "loss": 0.11659363508224488, + "step": 108120 + }, + { + "epoch": 0.4642246893863287, + "grad_norm": 0.10213266313076019, + "learning_rate": 5.3808973551908805e-05, + "loss": 0.18022869825363158, + "step": 108130 + }, + { + "epoch": 0.4642676214763487, + "grad_norm": 0.03695356100797653, + "learning_rate": 5.380466183179118e-05, + "loss": 0.2642909288406372, + "step": 108140 + }, + { + "epoch": 0.46431055356636874, + "grad_norm": 1.6254346370697021, + "learning_rate": 5.380035011167356e-05, + "loss": 0.2565154552459717, + "step": 108150 + }, + { + "epoch": 0.46435348565638873, + "grad_norm": 1.142285943031311, + "learning_rate": 5.3796038391555923e-05, + "loss": 0.35707571506500246, + "step": 108160 + }, + { + "epoch": 0.4643964177464087, + "grad_norm": 0.008432451635599136, + "learning_rate": 5.37917266714383e-05, + "loss": 0.2093576431274414, + "step": 108170 + }, + { + "epoch": 0.46443934983642876, + "grad_norm": 1.543778896331787, + "learning_rate": 5.378741495132068e-05, + "loss": 0.19204574823379517, + "step": 108180 + }, + { + "epoch": 0.46448228192644875, + "grad_norm": 0.12307104468345642, + "learning_rate": 5.3783103231203055e-05, + "loss": 0.20635404586791992, + "step": 108190 + }, + { + "epoch": 0.46452521401646873, + "grad_norm": 9.1827974319458, + "learning_rate": 5.377879151108543e-05, + "loss": 0.3973682880401611, + "step": 108200 + }, + { + "epoch": 0.4645681461064888, + "grad_norm": 0.09982259571552277, + "learning_rate": 5.377447979096781e-05, + "loss": 0.08464877605438233, + "step": 108210 + }, + { + "epoch": 0.46461107819650876, + "grad_norm": 0.39693737030029297, + "learning_rate": 5.3770168070850194e-05, + "loss": 0.2325838804244995, + "step": 108220 + }, + { + "epoch": 0.46465401028652875, + "grad_norm": 0.02635379508137703, + "learning_rate": 5.376585635073257e-05, + "loss": 0.3548180103302002, + "step": 108230 + }, + { + "epoch": 0.4646969423765488, + "grad_norm": 0.046965520828962326, + "learning_rate": 5.3761544630614935e-05, + "loss": 0.22502679824829103, + "step": 108240 + }, + { + "epoch": 0.4647398744665688, + "grad_norm": 0.8190841674804688, + "learning_rate": 5.375723291049731e-05, + "loss": 0.19198547601699828, + "step": 108250 + }, + { + "epoch": 0.46478280655658877, + "grad_norm": 0.005247071385383606, + "learning_rate": 5.375292119037969e-05, + "loss": 0.1480405807495117, + "step": 108260 + }, + { + "epoch": 0.4648257386466088, + "grad_norm": 0.0025028225500136614, + "learning_rate": 5.374860947026207e-05, + "loss": 0.25399935245513916, + "step": 108270 + }, + { + "epoch": 0.4648686707366288, + "grad_norm": 0.04282236471772194, + "learning_rate": 5.3744297750144445e-05, + "loss": 0.17009189128875732, + "step": 108280 + }, + { + "epoch": 0.4649116028266488, + "grad_norm": 12.793172836303711, + "learning_rate": 5.373998603002682e-05, + "loss": 0.4070758819580078, + "step": 108290 + }, + { + "epoch": 0.4649545349166688, + "grad_norm": 0.03920111805200577, + "learning_rate": 5.37356743099092e-05, + "loss": 0.12617360353469848, + "step": 108300 + }, + { + "epoch": 0.4649974670066888, + "grad_norm": 5.277186393737793, + "learning_rate": 5.373136258979158e-05, + "loss": 0.3769539356231689, + "step": 108310 + }, + { + "epoch": 0.4650403990967088, + "grad_norm": 0.0842542052268982, + "learning_rate": 5.372705086967395e-05, + "loss": 0.227150559425354, + "step": 108320 + }, + { + "epoch": 0.46508333118672884, + "grad_norm": 3.8030989170074463, + "learning_rate": 5.3722739149556325e-05, + "loss": 0.34054558277130126, + "step": 108330 + }, + { + "epoch": 0.46512626327674883, + "grad_norm": 3.126784324645996, + "learning_rate": 5.37184274294387e-05, + "loss": 0.34538271427154543, + "step": 108340 + }, + { + "epoch": 0.4651691953667688, + "grad_norm": 4.654267311096191, + "learning_rate": 5.371411570932108e-05, + "loss": 0.3616941452026367, + "step": 108350 + }, + { + "epoch": 0.46521212745678886, + "grad_norm": 1.575369119644165, + "learning_rate": 5.370980398920346e-05, + "loss": 0.3219918727874756, + "step": 108360 + }, + { + "epoch": 0.46525505954680885, + "grad_norm": 0.09393744170665741, + "learning_rate": 5.3705492269085834e-05, + "loss": 0.38858466148376464, + "step": 108370 + }, + { + "epoch": 0.4652979916368289, + "grad_norm": 1.2401673793792725, + "learning_rate": 5.370118054896821e-05, + "loss": 0.342305064201355, + "step": 108380 + }, + { + "epoch": 0.4653409237268489, + "grad_norm": 0.6935057044029236, + "learning_rate": 5.369686882885059e-05, + "loss": 0.19834787845611573, + "step": 108390 + }, + { + "epoch": 0.46538385581686886, + "grad_norm": 0.6531186103820801, + "learning_rate": 5.3692557108732966e-05, + "loss": 0.20809459686279297, + "step": 108400 + }, + { + "epoch": 0.4654267879068889, + "grad_norm": 0.16829413175582886, + "learning_rate": 5.368824538861533e-05, + "loss": 0.20340075492858886, + "step": 108410 + }, + { + "epoch": 0.4654697199969089, + "grad_norm": 0.010563013143837452, + "learning_rate": 5.368393366849771e-05, + "loss": 0.3063130140304565, + "step": 108420 + }, + { + "epoch": 0.4655126520869289, + "grad_norm": 5.520454406738281, + "learning_rate": 5.3679621948380085e-05, + "loss": 0.17612496614456177, + "step": 108430 + }, + { + "epoch": 0.4655555841769489, + "grad_norm": 0.07296188920736313, + "learning_rate": 5.367531022826247e-05, + "loss": 0.18347402811050414, + "step": 108440 + }, + { + "epoch": 0.4655985162669689, + "grad_norm": 0.04715902730822563, + "learning_rate": 5.3670998508144846e-05, + "loss": 0.09383673667907715, + "step": 108450 + }, + { + "epoch": 0.4656414483569889, + "grad_norm": 1.912249207496643, + "learning_rate": 5.3666686788027223e-05, + "loss": 0.21449682712554932, + "step": 108460 + }, + { + "epoch": 0.46568438044700894, + "grad_norm": 0.007666005752980709, + "learning_rate": 5.36623750679096e-05, + "loss": 0.08209252953529358, + "step": 108470 + }, + { + "epoch": 0.4657273125370289, + "grad_norm": 1.7507456541061401, + "learning_rate": 5.365806334779198e-05, + "loss": 0.2532007932662964, + "step": 108480 + }, + { + "epoch": 0.4657702446270489, + "grad_norm": 0.05488888546824455, + "learning_rate": 5.365375162767434e-05, + "loss": 0.11761891841888428, + "step": 108490 + }, + { + "epoch": 0.46581317671706896, + "grad_norm": 7.336211681365967, + "learning_rate": 5.364943990755672e-05, + "loss": 0.3313975095748901, + "step": 108500 + }, + { + "epoch": 0.46585610880708894, + "grad_norm": 1.9621061086654663, + "learning_rate": 5.3645128187439097e-05, + "loss": 0.16002817153930665, + "step": 108510 + }, + { + "epoch": 0.46589904089710893, + "grad_norm": 0.01421435084193945, + "learning_rate": 5.3640816467321474e-05, + "loss": 0.18043923377990723, + "step": 108520 + }, + { + "epoch": 0.46594197298712897, + "grad_norm": 0.8421728014945984, + "learning_rate": 5.363650474720385e-05, + "loss": 0.1255861282348633, + "step": 108530 + }, + { + "epoch": 0.46598490507714896, + "grad_norm": 2.173349618911743, + "learning_rate": 5.363219302708623e-05, + "loss": 0.2318701982498169, + "step": 108540 + }, + { + "epoch": 0.46602783716716895, + "grad_norm": 0.006124487146735191, + "learning_rate": 5.3627881306968606e-05, + "loss": 0.1734129548072815, + "step": 108550 + }, + { + "epoch": 0.466070769257189, + "grad_norm": 0.003295590402558446, + "learning_rate": 5.362356958685098e-05, + "loss": 0.2409308671951294, + "step": 108560 + }, + { + "epoch": 0.466113701347209, + "grad_norm": 0.4986785650253296, + "learning_rate": 5.3619257866733354e-05, + "loss": 0.15486088991165162, + "step": 108570 + }, + { + "epoch": 0.466156633437229, + "grad_norm": 2.084712028503418, + "learning_rate": 5.361494614661573e-05, + "loss": 0.3027064085006714, + "step": 108580 + }, + { + "epoch": 0.466199565527249, + "grad_norm": 0.852931797504425, + "learning_rate": 5.361063442649811e-05, + "loss": 0.12534109354019166, + "step": 108590 + }, + { + "epoch": 0.466242497617269, + "grad_norm": 0.9069915413856506, + "learning_rate": 5.3606322706380486e-05, + "loss": 0.4733282089233398, + "step": 108600 + }, + { + "epoch": 0.46628542970728903, + "grad_norm": 1.9747471809387207, + "learning_rate": 5.360201098626286e-05, + "loss": 0.21740949153900146, + "step": 108610 + }, + { + "epoch": 0.466328361797309, + "grad_norm": 0.005773196928203106, + "learning_rate": 5.359769926614524e-05, + "loss": 0.23748469352722168, + "step": 108620 + }, + { + "epoch": 0.466371293887329, + "grad_norm": 1.2689443826675415, + "learning_rate": 5.359338754602762e-05, + "loss": 0.04138025641441345, + "step": 108630 + }, + { + "epoch": 0.46641422597734905, + "grad_norm": 2.238314151763916, + "learning_rate": 5.3589075825909995e-05, + "loss": 0.17560259103775025, + "step": 108640 + }, + { + "epoch": 0.46645715806736904, + "grad_norm": 0.3152078688144684, + "learning_rate": 5.358476410579236e-05, + "loss": 0.363291072845459, + "step": 108650 + }, + { + "epoch": 0.466500090157389, + "grad_norm": 1.2952321767807007, + "learning_rate": 5.358045238567474e-05, + "loss": 0.1913788437843323, + "step": 108660 + }, + { + "epoch": 0.46654302224740907, + "grad_norm": 5.483338832855225, + "learning_rate": 5.357614066555712e-05, + "loss": 0.13968578577041627, + "step": 108670 + }, + { + "epoch": 0.46658595433742905, + "grad_norm": 1.802327275276184, + "learning_rate": 5.35718289454395e-05, + "loss": 0.3405932903289795, + "step": 108680 + }, + { + "epoch": 0.46662888642744904, + "grad_norm": 0.006264100782573223, + "learning_rate": 5.3567517225321875e-05, + "loss": 0.005323518067598343, + "step": 108690 + }, + { + "epoch": 0.4666718185174691, + "grad_norm": 1.5411258935928345, + "learning_rate": 5.356320550520425e-05, + "loss": 0.37329974174499514, + "step": 108700 + }, + { + "epoch": 0.46671475060748907, + "grad_norm": 0.034903932362794876, + "learning_rate": 5.355889378508663e-05, + "loss": 0.20319321155548095, + "step": 108710 + }, + { + "epoch": 0.46675768269750906, + "grad_norm": 0.0059880223125219345, + "learning_rate": 5.355458206496901e-05, + "loss": 0.2106635332107544, + "step": 108720 + }, + { + "epoch": 0.4668006147875291, + "grad_norm": 0.1818341761827469, + "learning_rate": 5.355027034485137e-05, + "loss": 0.10562245845794678, + "step": 108730 + }, + { + "epoch": 0.4668435468775491, + "grad_norm": 3.9489457607269287, + "learning_rate": 5.354595862473375e-05, + "loss": 0.14832943677902222, + "step": 108740 + }, + { + "epoch": 0.4668864789675691, + "grad_norm": 0.0023902824614197016, + "learning_rate": 5.3541646904616126e-05, + "loss": 0.14622147083282472, + "step": 108750 + }, + { + "epoch": 0.4669294110575891, + "grad_norm": 0.004773691762238741, + "learning_rate": 5.35373351844985e-05, + "loss": 0.06058769226074219, + "step": 108760 + }, + { + "epoch": 0.4669723431476091, + "grad_norm": 2.265639305114746, + "learning_rate": 5.353302346438088e-05, + "loss": 0.18167110681533813, + "step": 108770 + }, + { + "epoch": 0.4670152752376291, + "grad_norm": 0.02852572314441204, + "learning_rate": 5.352871174426326e-05, + "loss": 0.08986608982086182, + "step": 108780 + }, + { + "epoch": 0.46705820732764913, + "grad_norm": 0.047613635659217834, + "learning_rate": 5.3524400024145635e-05, + "loss": 0.16906614303588868, + "step": 108790 + }, + { + "epoch": 0.4671011394176691, + "grad_norm": 0.41578468680381775, + "learning_rate": 5.352008830402801e-05, + "loss": 0.16157785654067994, + "step": 108800 + }, + { + "epoch": 0.46714407150768916, + "grad_norm": 1.8083186149597168, + "learning_rate": 5.351577658391039e-05, + "loss": 0.124634051322937, + "step": 108810 + }, + { + "epoch": 0.46718700359770915, + "grad_norm": 3.3372297286987305, + "learning_rate": 5.351146486379276e-05, + "loss": 0.23483569622039796, + "step": 108820 + }, + { + "epoch": 0.46722993568772914, + "grad_norm": 0.03171273320913315, + "learning_rate": 5.350715314367514e-05, + "loss": 0.40413923263549806, + "step": 108830 + }, + { + "epoch": 0.4672728677777492, + "grad_norm": 0.006995361298322678, + "learning_rate": 5.3502841423557515e-05, + "loss": 0.2489945650100708, + "step": 108840 + }, + { + "epoch": 0.46731579986776917, + "grad_norm": 1.9916613101959229, + "learning_rate": 5.349852970343989e-05, + "loss": 0.23514945507049562, + "step": 108850 + }, + { + "epoch": 0.46735873195778915, + "grad_norm": 2.323140859603882, + "learning_rate": 5.349421798332227e-05, + "loss": 0.27089509963989256, + "step": 108860 + }, + { + "epoch": 0.4674016640478092, + "grad_norm": 0.2249898463487625, + "learning_rate": 5.348990626320465e-05, + "loss": 0.37014145851135255, + "step": 108870 + }, + { + "epoch": 0.4674445961378292, + "grad_norm": 0.0015096982242539525, + "learning_rate": 5.3485594543087024e-05, + "loss": 0.2118845224380493, + "step": 108880 + }, + { + "epoch": 0.46748752822784917, + "grad_norm": 0.017406558617949486, + "learning_rate": 5.34812828229694e-05, + "loss": 0.22873473167419434, + "step": 108890 + }, + { + "epoch": 0.4675304603178692, + "grad_norm": 0.0044220988638699055, + "learning_rate": 5.347697110285177e-05, + "loss": 0.25328223705291747, + "step": 108900 + }, + { + "epoch": 0.4675733924078892, + "grad_norm": 0.4546348750591278, + "learning_rate": 5.347265938273415e-05, + "loss": 0.1413517713546753, + "step": 108910 + }, + { + "epoch": 0.4676163244979092, + "grad_norm": 0.05150995030999184, + "learning_rate": 5.346834766261653e-05, + "loss": 0.42641348838806153, + "step": 108920 + }, + { + "epoch": 0.46765925658792923, + "grad_norm": 0.05774686112999916, + "learning_rate": 5.3464035942498904e-05, + "loss": 0.15248560905456543, + "step": 108930 + }, + { + "epoch": 0.4677021886779492, + "grad_norm": 0.18553832173347473, + "learning_rate": 5.345972422238128e-05, + "loss": 0.3334579706192017, + "step": 108940 + }, + { + "epoch": 0.4677451207679692, + "grad_norm": 0.004935821518301964, + "learning_rate": 5.345541250226366e-05, + "loss": 0.4072174072265625, + "step": 108950 + }, + { + "epoch": 0.46778805285798924, + "grad_norm": 2.7452855110168457, + "learning_rate": 5.3451100782146036e-05, + "loss": 0.34284253120422364, + "step": 108960 + }, + { + "epoch": 0.46783098494800923, + "grad_norm": 0.3916510045528412, + "learning_rate": 5.3446789062028414e-05, + "loss": 0.1928103446960449, + "step": 108970 + }, + { + "epoch": 0.4678739170380292, + "grad_norm": 0.15699084103107452, + "learning_rate": 5.344247734191078e-05, + "loss": 0.14758397340774537, + "step": 108980 + }, + { + "epoch": 0.46791684912804926, + "grad_norm": 10.559981346130371, + "learning_rate": 5.3438165621793155e-05, + "loss": 0.16865952014923097, + "step": 108990 + }, + { + "epoch": 0.46795978121806925, + "grad_norm": 0.046823471784591675, + "learning_rate": 5.343385390167553e-05, + "loss": 0.2123495101928711, + "step": 109000 + }, + { + "epoch": 0.46795978121806925, + "eval_loss": 0.4111691415309906, + "eval_runtime": 27.1166, + "eval_samples_per_second": 3.688, + "eval_steps_per_second": 3.688, + "step": 109000 + }, + { + "epoch": 0.4680027133080893, + "grad_norm": 0.4841916561126709, + "learning_rate": 5.342954218155791e-05, + "loss": 0.05959020853042603, + "step": 109010 + }, + { + "epoch": 0.4680456453981093, + "grad_norm": 0.018183136358857155, + "learning_rate": 5.342523046144029e-05, + "loss": 0.2590435266494751, + "step": 109020 + }, + { + "epoch": 0.46808857748812926, + "grad_norm": 1.3049360513687134, + "learning_rate": 5.3420918741322664e-05, + "loss": 0.13173724412918092, + "step": 109030 + }, + { + "epoch": 0.4681315095781493, + "grad_norm": 0.007529239635914564, + "learning_rate": 5.341660702120505e-05, + "loss": 0.04893192052841187, + "step": 109040 + }, + { + "epoch": 0.4681744416681693, + "grad_norm": 0.007698412984609604, + "learning_rate": 5.3412295301087426e-05, + "loss": 0.17854329347610473, + "step": 109050 + }, + { + "epoch": 0.4682173737581893, + "grad_norm": 0.04524169862270355, + "learning_rate": 5.340798358096979e-05, + "loss": 0.10847384929656982, + "step": 109060 + }, + { + "epoch": 0.4682603058482093, + "grad_norm": 1.1935988664627075, + "learning_rate": 5.340367186085217e-05, + "loss": 0.40726146697998045, + "step": 109070 + }, + { + "epoch": 0.4683032379382293, + "grad_norm": 0.02711350843310356, + "learning_rate": 5.3399360140734544e-05, + "loss": 0.24927499294281005, + "step": 109080 + }, + { + "epoch": 0.4683461700282493, + "grad_norm": 1.875342607498169, + "learning_rate": 5.339504842061692e-05, + "loss": 0.24080886840820312, + "step": 109090 + }, + { + "epoch": 0.46838910211826934, + "grad_norm": 0.5155624747276306, + "learning_rate": 5.33907367004993e-05, + "loss": 0.2785279989242554, + "step": 109100 + }, + { + "epoch": 0.4684320342082893, + "grad_norm": 0.4726239740848541, + "learning_rate": 5.3386424980381676e-05, + "loss": 0.11514071226119996, + "step": 109110 + }, + { + "epoch": 0.4684749662983093, + "grad_norm": 1.5348830223083496, + "learning_rate": 5.3382113260264053e-05, + "loss": 0.29875593185424804, + "step": 109120 + }, + { + "epoch": 0.46851789838832936, + "grad_norm": 0.020795587450265884, + "learning_rate": 5.337780154014643e-05, + "loss": 0.26834123134613036, + "step": 109130 + }, + { + "epoch": 0.46856083047834934, + "grad_norm": 0.6724562644958496, + "learning_rate": 5.337348982002881e-05, + "loss": 0.32038588523864747, + "step": 109140 + }, + { + "epoch": 0.46860376256836933, + "grad_norm": 1.894964337348938, + "learning_rate": 5.336917809991118e-05, + "loss": 0.19419206380844117, + "step": 109150 + }, + { + "epoch": 0.4686466946583894, + "grad_norm": 0.018505332991480827, + "learning_rate": 5.3364866379793556e-05, + "loss": 0.09740328192710876, + "step": 109160 + }, + { + "epoch": 0.46868962674840936, + "grad_norm": 1.5556461811065674, + "learning_rate": 5.3360554659675933e-05, + "loss": 0.2878258228302002, + "step": 109170 + }, + { + "epoch": 0.46873255883842935, + "grad_norm": 0.02745775319635868, + "learning_rate": 5.335624293955831e-05, + "loss": 0.2433319091796875, + "step": 109180 + }, + { + "epoch": 0.4687754909284494, + "grad_norm": 0.02605215087532997, + "learning_rate": 5.335193121944069e-05, + "loss": 0.3428020477294922, + "step": 109190 + }, + { + "epoch": 0.4688184230184694, + "grad_norm": 1.8436968326568604, + "learning_rate": 5.3347619499323065e-05, + "loss": 0.3498523712158203, + "step": 109200 + }, + { + "epoch": 0.46886135510848936, + "grad_norm": 0.005615072324872017, + "learning_rate": 5.334330777920544e-05, + "loss": 0.30814356803894044, + "step": 109210 + }, + { + "epoch": 0.4689042871985094, + "grad_norm": 4.321868896484375, + "learning_rate": 5.333899605908782e-05, + "loss": 0.41483297348022463, + "step": 109220 + }, + { + "epoch": 0.4689472192885294, + "grad_norm": 0.4750598669052124, + "learning_rate": 5.3334684338970184e-05, + "loss": 0.12364082336425782, + "step": 109230 + }, + { + "epoch": 0.46899015137854944, + "grad_norm": 1.165134072303772, + "learning_rate": 5.333037261885256e-05, + "loss": 0.22800350189208984, + "step": 109240 + }, + { + "epoch": 0.4690330834685694, + "grad_norm": 0.007292190100997686, + "learning_rate": 5.332606089873494e-05, + "loss": 0.2522198915481567, + "step": 109250 + }, + { + "epoch": 0.4690760155585894, + "grad_norm": 6.852068901062012, + "learning_rate": 5.332174917861732e-05, + "loss": 0.3143084287643433, + "step": 109260 + }, + { + "epoch": 0.46911894764860945, + "grad_norm": 1.5385671854019165, + "learning_rate": 5.33174374584997e-05, + "loss": 0.12840102910995482, + "step": 109270 + }, + { + "epoch": 0.46916187973862944, + "grad_norm": 0.03560010343790054, + "learning_rate": 5.331312573838208e-05, + "loss": 0.10503036975860595, + "step": 109280 + }, + { + "epoch": 0.4692048118286494, + "grad_norm": 0.30751195549964905, + "learning_rate": 5.3308814018264455e-05, + "loss": 0.09718015193939208, + "step": 109290 + }, + { + "epoch": 0.46924774391866947, + "grad_norm": 0.061752088367938995, + "learning_rate": 5.330450229814683e-05, + "loss": 0.22438604831695558, + "step": 109300 + }, + { + "epoch": 0.46929067600868946, + "grad_norm": 0.007534640375524759, + "learning_rate": 5.3300190578029196e-05, + "loss": 0.22838716506958007, + "step": 109310 + }, + { + "epoch": 0.46933360809870944, + "grad_norm": 4.146587371826172, + "learning_rate": 5.329587885791157e-05, + "loss": 0.12649660110473632, + "step": 109320 + }, + { + "epoch": 0.4693765401887295, + "grad_norm": 4.130105018615723, + "learning_rate": 5.329156713779395e-05, + "loss": 0.36939468383789065, + "step": 109330 + }, + { + "epoch": 0.46941947227874947, + "grad_norm": 0.7898271083831787, + "learning_rate": 5.328725541767633e-05, + "loss": 0.08706992864608765, + "step": 109340 + }, + { + "epoch": 0.46946240436876946, + "grad_norm": 0.09479454159736633, + "learning_rate": 5.3282943697558705e-05, + "loss": 0.29662575721740725, + "step": 109350 + }, + { + "epoch": 0.4695053364587895, + "grad_norm": 0.06022655963897705, + "learning_rate": 5.327863197744108e-05, + "loss": 0.2366532564163208, + "step": 109360 + }, + { + "epoch": 0.4695482685488095, + "grad_norm": 0.016174262389540672, + "learning_rate": 5.327432025732346e-05, + "loss": 0.29160394668579104, + "step": 109370 + }, + { + "epoch": 0.4695912006388295, + "grad_norm": 1.7989237308502197, + "learning_rate": 5.327000853720584e-05, + "loss": 0.3305816650390625, + "step": 109380 + }, + { + "epoch": 0.4696341327288495, + "grad_norm": 2.016319751739502, + "learning_rate": 5.326569681708821e-05, + "loss": 0.250068736076355, + "step": 109390 + }, + { + "epoch": 0.4696770648188695, + "grad_norm": 0.8311567306518555, + "learning_rate": 5.3261385096970585e-05, + "loss": 0.3437382936477661, + "step": 109400 + }, + { + "epoch": 0.4697199969088895, + "grad_norm": 33.06391143798828, + "learning_rate": 5.325707337685296e-05, + "loss": 0.40858283042907717, + "step": 109410 + }, + { + "epoch": 0.46976292899890953, + "grad_norm": 1.9839880466461182, + "learning_rate": 5.325276165673534e-05, + "loss": 0.28404059410095217, + "step": 109420 + }, + { + "epoch": 0.4698058610889295, + "grad_norm": 0.5857807993888855, + "learning_rate": 5.324844993661772e-05, + "loss": 0.10082913637161255, + "step": 109430 + }, + { + "epoch": 0.46984879317894956, + "grad_norm": 0.2136264443397522, + "learning_rate": 5.3244138216500095e-05, + "loss": 0.35057988166809084, + "step": 109440 + }, + { + "epoch": 0.46989172526896955, + "grad_norm": 0.2856413722038269, + "learning_rate": 5.323982649638247e-05, + "loss": 0.1828877568244934, + "step": 109450 + }, + { + "epoch": 0.46993465735898954, + "grad_norm": 0.009458329528570175, + "learning_rate": 5.323551477626485e-05, + "loss": 0.113239586353302, + "step": 109460 + }, + { + "epoch": 0.4699775894490096, + "grad_norm": 0.04635424166917801, + "learning_rate": 5.323120305614723e-05, + "loss": 0.29793319702148435, + "step": 109470 + }, + { + "epoch": 0.47002052153902957, + "grad_norm": 1.4999786615371704, + "learning_rate": 5.32268913360296e-05, + "loss": 0.2726386547088623, + "step": 109480 + }, + { + "epoch": 0.47006345362904955, + "grad_norm": 4.102458953857422, + "learning_rate": 5.3222579615911975e-05, + "loss": 0.22052557468414308, + "step": 109490 + }, + { + "epoch": 0.4701063857190696, + "grad_norm": 1.3203516006469727, + "learning_rate": 5.321826789579435e-05, + "loss": 0.511915636062622, + "step": 109500 + }, + { + "epoch": 0.4701493178090896, + "grad_norm": 0.005893752444535494, + "learning_rate": 5.321395617567673e-05, + "loss": 0.0750274658203125, + "step": 109510 + }, + { + "epoch": 0.47019224989910957, + "grad_norm": 0.012768547981977463, + "learning_rate": 5.3209644455559107e-05, + "loss": 0.14960445165634156, + "step": 109520 + }, + { + "epoch": 0.4702351819891296, + "grad_norm": 0.05208965763449669, + "learning_rate": 5.3205332735441484e-05, + "loss": 0.08533784747123718, + "step": 109530 + }, + { + "epoch": 0.4702781140791496, + "grad_norm": 0.016734503209590912, + "learning_rate": 5.320102101532386e-05, + "loss": 0.18010318279266357, + "step": 109540 + }, + { + "epoch": 0.4703210461691696, + "grad_norm": 0.019492534920573235, + "learning_rate": 5.319670929520624e-05, + "loss": 0.2558018922805786, + "step": 109550 + }, + { + "epoch": 0.47036397825918963, + "grad_norm": 0.07820451259613037, + "learning_rate": 5.31923975750886e-05, + "loss": 0.1978413224220276, + "step": 109560 + }, + { + "epoch": 0.4704069103492096, + "grad_norm": 1.5199116468429565, + "learning_rate": 5.318808585497098e-05, + "loss": 0.14105242490768433, + "step": 109570 + }, + { + "epoch": 0.4704498424392296, + "grad_norm": 0.027323994785547256, + "learning_rate": 5.318377413485336e-05, + "loss": 0.17264108657836913, + "step": 109580 + }, + { + "epoch": 0.47049277452924965, + "grad_norm": 0.03172069415450096, + "learning_rate": 5.3179462414735734e-05, + "loss": 0.2262335777282715, + "step": 109590 + }, + { + "epoch": 0.47053570661926963, + "grad_norm": 2.656409978866577, + "learning_rate": 5.317515069461811e-05, + "loss": 0.22067790031433104, + "step": 109600 + }, + { + "epoch": 0.4705786387092896, + "grad_norm": 0.37605583667755127, + "learning_rate": 5.317083897450049e-05, + "loss": 0.09247267246246338, + "step": 109610 + }, + { + "epoch": 0.47062157079930966, + "grad_norm": 1.5773375034332275, + "learning_rate": 5.3166527254382866e-05, + "loss": 0.26245760917663574, + "step": 109620 + }, + { + "epoch": 0.47066450288932965, + "grad_norm": 0.8994236588478088, + "learning_rate": 5.316221553426525e-05, + "loss": 0.4384958744049072, + "step": 109630 + }, + { + "epoch": 0.47070743497934964, + "grad_norm": 0.13297097384929657, + "learning_rate": 5.3157903814147614e-05, + "loss": 0.23621623516082763, + "step": 109640 + }, + { + "epoch": 0.4707503670693697, + "grad_norm": 1.8098262548446655, + "learning_rate": 5.315359209402999e-05, + "loss": 0.3028961420059204, + "step": 109650 + }, + { + "epoch": 0.47079329915938967, + "grad_norm": 0.0058063119649887085, + "learning_rate": 5.314928037391237e-05, + "loss": 0.18072078227996827, + "step": 109660 + }, + { + "epoch": 0.4708362312494097, + "grad_norm": 0.9555726051330566, + "learning_rate": 5.3144968653794746e-05, + "loss": 0.3762235641479492, + "step": 109670 + }, + { + "epoch": 0.4708791633394297, + "grad_norm": 0.0875546932220459, + "learning_rate": 5.3140656933677124e-05, + "loss": 0.3112565755844116, + "step": 109680 + }, + { + "epoch": 0.4709220954294497, + "grad_norm": 1.1810975074768066, + "learning_rate": 5.31363452135595e-05, + "loss": 0.20165557861328126, + "step": 109690 + }, + { + "epoch": 0.4709650275194697, + "grad_norm": 3.7935070991516113, + "learning_rate": 5.313203349344188e-05, + "loss": 0.15693705081939696, + "step": 109700 + }, + { + "epoch": 0.4710079596094897, + "grad_norm": 0.036692049354314804, + "learning_rate": 5.3127721773324256e-05, + "loss": 0.1284176826477051, + "step": 109710 + }, + { + "epoch": 0.4710508916995097, + "grad_norm": 3.318089008331299, + "learning_rate": 5.3123410053206626e-05, + "loss": 0.3561805248260498, + "step": 109720 + }, + { + "epoch": 0.47109382378952974, + "grad_norm": 1.2822105884552002, + "learning_rate": 5.3119098333089004e-05, + "loss": 0.1795423984527588, + "step": 109730 + }, + { + "epoch": 0.47113675587954973, + "grad_norm": 0.8673492074012756, + "learning_rate": 5.311478661297138e-05, + "loss": 0.15667368173599244, + "step": 109740 + }, + { + "epoch": 0.4711796879695697, + "grad_norm": 3.5049326419830322, + "learning_rate": 5.311047489285376e-05, + "loss": 0.07165217995643616, + "step": 109750 + }, + { + "epoch": 0.47122262005958976, + "grad_norm": 0.13629527390003204, + "learning_rate": 5.3106163172736136e-05, + "loss": 0.2818335294723511, + "step": 109760 + }, + { + "epoch": 0.47126555214960975, + "grad_norm": 4.236764430999756, + "learning_rate": 5.310185145261851e-05, + "loss": 0.2754476547241211, + "step": 109770 + }, + { + "epoch": 0.47130848423962973, + "grad_norm": 2.255838394165039, + "learning_rate": 5.309753973250089e-05, + "loss": 0.19391024112701416, + "step": 109780 + }, + { + "epoch": 0.4713514163296498, + "grad_norm": 0.0032226841431111097, + "learning_rate": 5.309322801238327e-05, + "loss": 0.42916345596313477, + "step": 109790 + }, + { + "epoch": 0.47139434841966976, + "grad_norm": 0.003980181645601988, + "learning_rate": 5.308891629226563e-05, + "loss": 0.16325417757034302, + "step": 109800 + }, + { + "epoch": 0.47143728050968975, + "grad_norm": 0.017930980771780014, + "learning_rate": 5.308460457214801e-05, + "loss": 0.10078818798065185, + "step": 109810 + }, + { + "epoch": 0.4714802125997098, + "grad_norm": 0.7667815089225769, + "learning_rate": 5.3080292852030386e-05, + "loss": 0.13166972398757934, + "step": 109820 + }, + { + "epoch": 0.4715231446897298, + "grad_norm": 6.7686848640441895, + "learning_rate": 5.3075981131912764e-05, + "loss": 0.09097627997398376, + "step": 109830 + }, + { + "epoch": 0.47156607677974977, + "grad_norm": 0.042791981250047684, + "learning_rate": 5.307166941179514e-05, + "loss": 0.2671439409255981, + "step": 109840 + }, + { + "epoch": 0.4716090088697698, + "grad_norm": 1.2552294731140137, + "learning_rate": 5.3067357691677525e-05, + "loss": 0.17842416763305663, + "step": 109850 + }, + { + "epoch": 0.4716519409597898, + "grad_norm": 0.04525917023420334, + "learning_rate": 5.30630459715599e-05, + "loss": 0.19458953142166138, + "step": 109860 + }, + { + "epoch": 0.47169487304980984, + "grad_norm": 0.9127861857414246, + "learning_rate": 5.305873425144228e-05, + "loss": 0.2227538824081421, + "step": 109870 + }, + { + "epoch": 0.4717378051398298, + "grad_norm": 1.8166645765304565, + "learning_rate": 5.305442253132466e-05, + "loss": 0.2958853721618652, + "step": 109880 + }, + { + "epoch": 0.4717807372298498, + "grad_norm": 2.7529006004333496, + "learning_rate": 5.305011081120702e-05, + "loss": 0.3049177885055542, + "step": 109890 + }, + { + "epoch": 0.47182366931986985, + "grad_norm": 0.006838400382548571, + "learning_rate": 5.30457990910894e-05, + "loss": 0.20164821147918702, + "step": 109900 + }, + { + "epoch": 0.47186660140988984, + "grad_norm": 6.462440013885498, + "learning_rate": 5.3041487370971775e-05, + "loss": 0.3127782344818115, + "step": 109910 + }, + { + "epoch": 0.47190953349990983, + "grad_norm": 0.17948110401630402, + "learning_rate": 5.303717565085415e-05, + "loss": 0.1981052875518799, + "step": 109920 + }, + { + "epoch": 0.47195246558992987, + "grad_norm": 0.9323148131370544, + "learning_rate": 5.303286393073653e-05, + "loss": 0.38721270561218263, + "step": 109930 + }, + { + "epoch": 0.47199539767994986, + "grad_norm": 0.008327801711857319, + "learning_rate": 5.302855221061891e-05, + "loss": 0.20434670448303222, + "step": 109940 + }, + { + "epoch": 0.47203832976996984, + "grad_norm": 0.12778253853321075, + "learning_rate": 5.3024240490501285e-05, + "loss": 0.3304303646087646, + "step": 109950 + }, + { + "epoch": 0.4720812618599899, + "grad_norm": 0.2479141652584076, + "learning_rate": 5.301992877038366e-05, + "loss": 0.19963374137878417, + "step": 109960 + }, + { + "epoch": 0.4721241939500099, + "grad_norm": 1.2295243740081787, + "learning_rate": 5.301561705026603e-05, + "loss": 0.187886381149292, + "step": 109970 + }, + { + "epoch": 0.47216712604002986, + "grad_norm": 0.06274612247943878, + "learning_rate": 5.301130533014841e-05, + "loss": 0.192645800113678, + "step": 109980 + }, + { + "epoch": 0.4722100581300499, + "grad_norm": 0.0075582414865493774, + "learning_rate": 5.300699361003079e-05, + "loss": 0.14299780130386353, + "step": 109990 + }, + { + "epoch": 0.4722529902200699, + "grad_norm": 2.172623872756958, + "learning_rate": 5.3002681889913165e-05, + "loss": 0.23523974418640137, + "step": 110000 + }, + { + "epoch": 0.4722529902200699, + "eval_loss": 0.40026649832725525, + "eval_runtime": 27.1709, + "eval_samples_per_second": 3.68, + "eval_steps_per_second": 3.68, + "step": 110000 + }, + { + "epoch": 0.4722959223100899, + "grad_norm": 0.006778498645871878, + "learning_rate": 5.299837016979554e-05, + "loss": 0.2251523494720459, + "step": 110010 + }, + { + "epoch": 0.4723388544001099, + "grad_norm": 3.404278039932251, + "learning_rate": 5.299405844967792e-05, + "loss": 0.3573782920837402, + "step": 110020 + }, + { + "epoch": 0.4723817864901299, + "grad_norm": 1.1682584285736084, + "learning_rate": 5.29897467295603e-05, + "loss": 0.37567944526672364, + "step": 110030 + }, + { + "epoch": 0.4724247185801499, + "grad_norm": 0.08961494266986847, + "learning_rate": 5.2985435009442674e-05, + "loss": 0.14649477005004882, + "step": 110040 + }, + { + "epoch": 0.47246765067016994, + "grad_norm": 0.005352088250219822, + "learning_rate": 5.298112328932504e-05, + "loss": 0.13340699672698975, + "step": 110050 + }, + { + "epoch": 0.4725105827601899, + "grad_norm": 3.589823007583618, + "learning_rate": 5.2976811569207415e-05, + "loss": 0.18872573375701904, + "step": 110060 + }, + { + "epoch": 0.4725535148502099, + "grad_norm": 0.025254419073462486, + "learning_rate": 5.29724998490898e-05, + "loss": 0.011775702983140946, + "step": 110070 + }, + { + "epoch": 0.47259644694022995, + "grad_norm": 4.436582088470459, + "learning_rate": 5.296818812897218e-05, + "loss": 0.2874211072921753, + "step": 110080 + }, + { + "epoch": 0.47263937903024994, + "grad_norm": 0.1682950258255005, + "learning_rate": 5.2963876408854554e-05, + "loss": 0.21236715316772461, + "step": 110090 + }, + { + "epoch": 0.47268231112027, + "grad_norm": 0.1564854085445404, + "learning_rate": 5.295956468873693e-05, + "loss": 0.1416730046272278, + "step": 110100 + }, + { + "epoch": 0.47272524321028997, + "grad_norm": 0.7669548392295837, + "learning_rate": 5.295525296861931e-05, + "loss": 0.17509924173355101, + "step": 110110 + }, + { + "epoch": 0.47276817530030996, + "grad_norm": 0.0032598222605884075, + "learning_rate": 5.2950941248501686e-05, + "loss": 0.2041093111038208, + "step": 110120 + }, + { + "epoch": 0.47281110739033, + "grad_norm": 0.001301642507314682, + "learning_rate": 5.294662952838405e-05, + "loss": 0.1385445475578308, + "step": 110130 + }, + { + "epoch": 0.47285403948035, + "grad_norm": 1.4497944116592407, + "learning_rate": 5.294231780826643e-05, + "loss": 0.2178436517715454, + "step": 110140 + }, + { + "epoch": 0.47289697157037, + "grad_norm": 0.007498675025999546, + "learning_rate": 5.2938006088148805e-05, + "loss": 0.07806483507156373, + "step": 110150 + }, + { + "epoch": 0.47293990366039, + "grad_norm": 0.051783930510282516, + "learning_rate": 5.293369436803118e-05, + "loss": 0.23047008514404296, + "step": 110160 + }, + { + "epoch": 0.47298283575041, + "grad_norm": 0.0037600200157612562, + "learning_rate": 5.292938264791356e-05, + "loss": 0.2534470081329346, + "step": 110170 + }, + { + "epoch": 0.47302576784043, + "grad_norm": 0.002208675490692258, + "learning_rate": 5.292507092779594e-05, + "loss": 0.23899390697479247, + "step": 110180 + }, + { + "epoch": 0.47306869993045003, + "grad_norm": 0.8531462550163269, + "learning_rate": 5.2920759207678314e-05, + "loss": 0.29016637802124023, + "step": 110190 + }, + { + "epoch": 0.47311163202047, + "grad_norm": 0.02837371453642845, + "learning_rate": 5.291644748756069e-05, + "loss": 0.15528700351715088, + "step": 110200 + }, + { + "epoch": 0.47315456411049, + "grad_norm": 0.017523042857646942, + "learning_rate": 5.291213576744307e-05, + "loss": 0.0026233930140733717, + "step": 110210 + }, + { + "epoch": 0.47319749620051005, + "grad_norm": 5.990091323852539, + "learning_rate": 5.290782404732544e-05, + "loss": 0.3041208744049072, + "step": 110220 + }, + { + "epoch": 0.47324042829053004, + "grad_norm": 0.006588002201169729, + "learning_rate": 5.2903512327207817e-05, + "loss": 0.20339715480804443, + "step": 110230 + }, + { + "epoch": 0.47328336038055, + "grad_norm": 0.0035168312024325132, + "learning_rate": 5.2899200607090194e-05, + "loss": 0.09533035755157471, + "step": 110240 + }, + { + "epoch": 0.47332629247057006, + "grad_norm": 0.055465489625930786, + "learning_rate": 5.289488888697257e-05, + "loss": 0.27167162895202634, + "step": 110250 + }, + { + "epoch": 0.47336922456059005, + "grad_norm": 0.08093491941690445, + "learning_rate": 5.289057716685495e-05, + "loss": 0.38339624404907224, + "step": 110260 + }, + { + "epoch": 0.47341215665061004, + "grad_norm": 0.005503931548446417, + "learning_rate": 5.2886265446737326e-05, + "loss": 0.1939884305000305, + "step": 110270 + }, + { + "epoch": 0.4734550887406301, + "grad_norm": 2.163684129714966, + "learning_rate": 5.28819537266197e-05, + "loss": 0.14528403282165528, + "step": 110280 + }, + { + "epoch": 0.47349802083065007, + "grad_norm": 1.828765869140625, + "learning_rate": 5.287764200650208e-05, + "loss": 0.2360905170440674, + "step": 110290 + }, + { + "epoch": 0.4735409529206701, + "grad_norm": 1.3224155902862549, + "learning_rate": 5.287333028638445e-05, + "loss": 0.3001755475997925, + "step": 110300 + }, + { + "epoch": 0.4735838850106901, + "grad_norm": 0.03000202588737011, + "learning_rate": 5.286901856626683e-05, + "loss": 0.08096457123756409, + "step": 110310 + }, + { + "epoch": 0.4736268171007101, + "grad_norm": 0.008006912656128407, + "learning_rate": 5.2864706846149206e-05, + "loss": 0.13578498363494873, + "step": 110320 + }, + { + "epoch": 0.4736697491907301, + "grad_norm": 0.00391837302595377, + "learning_rate": 5.286039512603158e-05, + "loss": 0.10231612920761109, + "step": 110330 + }, + { + "epoch": 0.4737126812807501, + "grad_norm": 0.004665139596909285, + "learning_rate": 5.285608340591396e-05, + "loss": 0.07968645691871643, + "step": 110340 + }, + { + "epoch": 0.4737556133707701, + "grad_norm": 3.0334272384643555, + "learning_rate": 5.285177168579634e-05, + "loss": 0.11113240718841552, + "step": 110350 + }, + { + "epoch": 0.47379854546079014, + "grad_norm": 0.28551867604255676, + "learning_rate": 5.2847459965678715e-05, + "loss": 0.1603380799293518, + "step": 110360 + }, + { + "epoch": 0.47384147755081013, + "grad_norm": 1.029355764389038, + "learning_rate": 5.284314824556109e-05, + "loss": 0.34641385078430176, + "step": 110370 + }, + { + "epoch": 0.4738844096408301, + "grad_norm": 2.454301357269287, + "learning_rate": 5.2838836525443456e-05, + "loss": 0.26242704391479493, + "step": 110380 + }, + { + "epoch": 0.47392734173085016, + "grad_norm": 30.86275863647461, + "learning_rate": 5.2834524805325834e-05, + "loss": 0.23771564960479735, + "step": 110390 + }, + { + "epoch": 0.47397027382087015, + "grad_norm": 0.00213251612149179, + "learning_rate": 5.283021308520821e-05, + "loss": 0.1638559579849243, + "step": 110400 + }, + { + "epoch": 0.47401320591089013, + "grad_norm": 1.0422087907791138, + "learning_rate": 5.282590136509059e-05, + "loss": 0.45887227058410646, + "step": 110410 + }, + { + "epoch": 0.4740561380009102, + "grad_norm": 10.386863708496094, + "learning_rate": 5.2821589644972966e-05, + "loss": 0.2633431196212769, + "step": 110420 + }, + { + "epoch": 0.47409907009093016, + "grad_norm": 0.05543503910303116, + "learning_rate": 5.281727792485534e-05, + "loss": 0.11826821565628051, + "step": 110430 + }, + { + "epoch": 0.47414200218095015, + "grad_norm": 8.459673881530762, + "learning_rate": 5.281296620473772e-05, + "loss": 0.18388675451278685, + "step": 110440 + }, + { + "epoch": 0.4741849342709702, + "grad_norm": 0.0549483522772789, + "learning_rate": 5.2808654484620105e-05, + "loss": 0.5347713470458985, + "step": 110450 + }, + { + "epoch": 0.4742278663609902, + "grad_norm": 3.4024839401245117, + "learning_rate": 5.280434276450247e-05, + "loss": 0.15446404218673707, + "step": 110460 + }, + { + "epoch": 0.47427079845101017, + "grad_norm": 0.011814040131866932, + "learning_rate": 5.2800031044384846e-05, + "loss": 0.23509562015533447, + "step": 110470 + }, + { + "epoch": 0.4743137305410302, + "grad_norm": 0.2656792104244232, + "learning_rate": 5.279571932426722e-05, + "loss": 0.2127734661102295, + "step": 110480 + }, + { + "epoch": 0.4743566626310502, + "grad_norm": 8.172897338867188, + "learning_rate": 5.27914076041496e-05, + "loss": 0.17217317819595337, + "step": 110490 + }, + { + "epoch": 0.4743995947210702, + "grad_norm": 13.168989181518555, + "learning_rate": 5.278709588403198e-05, + "loss": 0.24612603187561036, + "step": 110500 + }, + { + "epoch": 0.4744425268110902, + "grad_norm": 3.857973337173462, + "learning_rate": 5.2782784163914355e-05, + "loss": 0.3045464754104614, + "step": 110510 + }, + { + "epoch": 0.4744854589011102, + "grad_norm": 1.6314276456832886, + "learning_rate": 5.277847244379673e-05, + "loss": 0.13810135126113893, + "step": 110520 + }, + { + "epoch": 0.47452839099113026, + "grad_norm": 0.989060640335083, + "learning_rate": 5.277416072367911e-05, + "loss": 0.26412765979766845, + "step": 110530 + }, + { + "epoch": 0.47457132308115024, + "grad_norm": 0.005572533700615168, + "learning_rate": 5.276984900356148e-05, + "loss": 0.15340116024017333, + "step": 110540 + }, + { + "epoch": 0.47461425517117023, + "grad_norm": 0.20932750403881073, + "learning_rate": 5.276553728344386e-05, + "loss": 0.32161037921905516, + "step": 110550 + }, + { + "epoch": 0.47465718726119027, + "grad_norm": 0.5605794191360474, + "learning_rate": 5.2761225563326235e-05, + "loss": 0.1692768692970276, + "step": 110560 + }, + { + "epoch": 0.47470011935121026, + "grad_norm": 0.016393939033150673, + "learning_rate": 5.275691384320861e-05, + "loss": 0.20412814617156982, + "step": 110570 + }, + { + "epoch": 0.47474305144123025, + "grad_norm": 0.30977731943130493, + "learning_rate": 5.275260212309099e-05, + "loss": 0.16929389238357545, + "step": 110580 + }, + { + "epoch": 0.4747859835312503, + "grad_norm": 0.8243743777275085, + "learning_rate": 5.274829040297337e-05, + "loss": 0.21461176872253418, + "step": 110590 + }, + { + "epoch": 0.4748289156212703, + "grad_norm": 1.380896806716919, + "learning_rate": 5.2743978682855744e-05, + "loss": 0.3001677989959717, + "step": 110600 + }, + { + "epoch": 0.47487184771129026, + "grad_norm": 4.265039920806885, + "learning_rate": 5.273966696273812e-05, + "loss": 0.2183671236038208, + "step": 110610 + }, + { + "epoch": 0.4749147798013103, + "grad_norm": 0.011042381636798382, + "learning_rate": 5.27353552426205e-05, + "loss": 0.15000921487808228, + "step": 110620 + }, + { + "epoch": 0.4749577118913303, + "grad_norm": 0.1344660520553589, + "learning_rate": 5.273104352250286e-05, + "loss": 0.3278426885604858, + "step": 110630 + }, + { + "epoch": 0.4750006439813503, + "grad_norm": 1.2693231105804443, + "learning_rate": 5.272673180238524e-05, + "loss": 0.18513789176940917, + "step": 110640 + }, + { + "epoch": 0.4750435760713703, + "grad_norm": 1.4191405773162842, + "learning_rate": 5.272242008226762e-05, + "loss": 0.31360783576965334, + "step": 110650 + }, + { + "epoch": 0.4750865081613903, + "grad_norm": 2.736665725708008, + "learning_rate": 5.2718108362149995e-05, + "loss": 0.5891505718231201, + "step": 110660 + }, + { + "epoch": 0.4751294402514103, + "grad_norm": 3.2283921241760254, + "learning_rate": 5.271379664203238e-05, + "loss": 0.4493126392364502, + "step": 110670 + }, + { + "epoch": 0.47517237234143034, + "grad_norm": 3.095923900604248, + "learning_rate": 5.2709484921914756e-05, + "loss": 0.2838698625564575, + "step": 110680 + }, + { + "epoch": 0.4752153044314503, + "grad_norm": 0.1428341418504715, + "learning_rate": 5.2705173201797134e-05, + "loss": 0.25497987270355227, + "step": 110690 + }, + { + "epoch": 0.4752582365214703, + "grad_norm": 1.663386344909668, + "learning_rate": 5.270086148167951e-05, + "loss": 0.33310327529907224, + "step": 110700 + }, + { + "epoch": 0.47530116861149035, + "grad_norm": 1.6262311935424805, + "learning_rate": 5.2696549761561875e-05, + "loss": 0.2923267841339111, + "step": 110710 + }, + { + "epoch": 0.47534410070151034, + "grad_norm": 0.10630851984024048, + "learning_rate": 5.269223804144425e-05, + "loss": 0.05050194263458252, + "step": 110720 + }, + { + "epoch": 0.4753870327915304, + "grad_norm": 3.5088255405426025, + "learning_rate": 5.268792632132663e-05, + "loss": 0.17360684871673585, + "step": 110730 + }, + { + "epoch": 0.47542996488155037, + "grad_norm": 1.5751526355743408, + "learning_rate": 5.268361460120901e-05, + "loss": 0.3811746120452881, + "step": 110740 + }, + { + "epoch": 0.47547289697157036, + "grad_norm": 0.017309105023741722, + "learning_rate": 5.2679302881091384e-05, + "loss": 0.08757068514823914, + "step": 110750 + }, + { + "epoch": 0.4755158290615904, + "grad_norm": 0.0026281701866537333, + "learning_rate": 5.267499116097376e-05, + "loss": 0.14881271123886108, + "step": 110760 + }, + { + "epoch": 0.4755587611516104, + "grad_norm": 0.1079968512058258, + "learning_rate": 5.267067944085614e-05, + "loss": 0.17866290807724, + "step": 110770 + }, + { + "epoch": 0.4756016932416304, + "grad_norm": 0.003995122853666544, + "learning_rate": 5.2666367720738516e-05, + "loss": 0.1523299217224121, + "step": 110780 + }, + { + "epoch": 0.4756446253316504, + "grad_norm": 0.0031116269528865814, + "learning_rate": 5.266205600062089e-05, + "loss": 0.3764505386352539, + "step": 110790 + }, + { + "epoch": 0.4756875574216704, + "grad_norm": 0.026464959606528282, + "learning_rate": 5.2657744280503264e-05, + "loss": 0.20965878963470458, + "step": 110800 + }, + { + "epoch": 0.4757304895116904, + "grad_norm": 0.03027081862092018, + "learning_rate": 5.265343256038564e-05, + "loss": 0.2689178705215454, + "step": 110810 + }, + { + "epoch": 0.47577342160171043, + "grad_norm": 0.015275675803422928, + "learning_rate": 5.264912084026802e-05, + "loss": 0.4521240234375, + "step": 110820 + }, + { + "epoch": 0.4758163536917304, + "grad_norm": 0.05949430167675018, + "learning_rate": 5.2644809120150396e-05, + "loss": 0.28630361557006834, + "step": 110830 + }, + { + "epoch": 0.4758592857817504, + "grad_norm": 0.05996134132146835, + "learning_rate": 5.2640497400032774e-05, + "loss": 0.252076530456543, + "step": 110840 + }, + { + "epoch": 0.47590221787177045, + "grad_norm": 0.060242872685194016, + "learning_rate": 5.263618567991515e-05, + "loss": 0.1737877368927002, + "step": 110850 + }, + { + "epoch": 0.47594514996179044, + "grad_norm": 2.192213773727417, + "learning_rate": 5.263187395979753e-05, + "loss": 0.09967674612998963, + "step": 110860 + }, + { + "epoch": 0.4759880820518104, + "grad_norm": 1.1650608777999878, + "learning_rate": 5.262756223967989e-05, + "loss": 0.1381733775138855, + "step": 110870 + }, + { + "epoch": 0.47603101414183047, + "grad_norm": 0.07127422839403152, + "learning_rate": 5.262325051956227e-05, + "loss": 0.22627367973327636, + "step": 110880 + }, + { + "epoch": 0.47607394623185045, + "grad_norm": 0.010779001750051975, + "learning_rate": 5.2618938799444653e-05, + "loss": 0.02740491330623627, + "step": 110890 + }, + { + "epoch": 0.47611687832187044, + "grad_norm": 2.1994879245758057, + "learning_rate": 5.261462707932703e-05, + "loss": 0.21756505966186523, + "step": 110900 + }, + { + "epoch": 0.4761598104118905, + "grad_norm": 0.00900544598698616, + "learning_rate": 5.261031535920941e-05, + "loss": 0.4406434535980225, + "step": 110910 + }, + { + "epoch": 0.47620274250191047, + "grad_norm": 0.3732303977012634, + "learning_rate": 5.2606003639091785e-05, + "loss": 0.17997138500213622, + "step": 110920 + }, + { + "epoch": 0.47624567459193046, + "grad_norm": 0.13721691071987152, + "learning_rate": 5.260169191897416e-05, + "loss": 0.13512275218963624, + "step": 110930 + }, + { + "epoch": 0.4762886066819505, + "grad_norm": 0.8601694107055664, + "learning_rate": 5.259738019885654e-05, + "loss": 0.24069135189056395, + "step": 110940 + }, + { + "epoch": 0.4763315387719705, + "grad_norm": 1.0583916902542114, + "learning_rate": 5.259306847873892e-05, + "loss": 0.058995991945266724, + "step": 110950 + }, + { + "epoch": 0.47637447086199053, + "grad_norm": 0.05748229846358299, + "learning_rate": 5.258875675862128e-05, + "loss": 0.2452263593673706, + "step": 110960 + }, + { + "epoch": 0.4764174029520105, + "grad_norm": 1.6908286809921265, + "learning_rate": 5.258444503850366e-05, + "loss": 0.3112839698791504, + "step": 110970 + }, + { + "epoch": 0.4764603350420305, + "grad_norm": 3.540738105773926, + "learning_rate": 5.2580133318386036e-05, + "loss": 0.36870133876800537, + "step": 110980 + }, + { + "epoch": 0.47650326713205055, + "grad_norm": 0.00804793369024992, + "learning_rate": 5.257582159826841e-05, + "loss": 0.3045008659362793, + "step": 110990 + }, + { + "epoch": 0.47654619922207053, + "grad_norm": 0.06705295294523239, + "learning_rate": 5.257150987815079e-05, + "loss": 0.12070378065109252, + "step": 111000 + }, + { + "epoch": 0.47654619922207053, + "eval_loss": 0.4230143427848816, + "eval_runtime": 27.1135, + "eval_samples_per_second": 3.688, + "eval_steps_per_second": 3.688, + "step": 111000 + }, + { + "epoch": 0.4765891313120905, + "grad_norm": 0.004154821392148733, + "learning_rate": 5.256719815803317e-05, + "loss": 0.3670050144195557, + "step": 111010 + }, + { + "epoch": 0.47663206340211056, + "grad_norm": 0.09109306335449219, + "learning_rate": 5.2562886437915545e-05, + "loss": 0.10622333288192749, + "step": 111020 + }, + { + "epoch": 0.47667499549213055, + "grad_norm": 0.0096572982147336, + "learning_rate": 5.255857471779792e-05, + "loss": 0.08712666034698487, + "step": 111030 + }, + { + "epoch": 0.47671792758215054, + "grad_norm": 0.024771859869360924, + "learning_rate": 5.255426299768029e-05, + "loss": 0.26303396224975584, + "step": 111040 + }, + { + "epoch": 0.4767608596721706, + "grad_norm": 0.7985782623291016, + "learning_rate": 5.254995127756267e-05, + "loss": 0.06067940592765808, + "step": 111050 + }, + { + "epoch": 0.47680379176219057, + "grad_norm": 1.2885322570800781, + "learning_rate": 5.254563955744505e-05, + "loss": 0.2834064483642578, + "step": 111060 + }, + { + "epoch": 0.47684672385221055, + "grad_norm": 0.044166211038827896, + "learning_rate": 5.2541327837327425e-05, + "loss": 0.15519756078720093, + "step": 111070 + }, + { + "epoch": 0.4768896559422306, + "grad_norm": 0.010398822836577892, + "learning_rate": 5.25370161172098e-05, + "loss": 0.23026049137115479, + "step": 111080 + }, + { + "epoch": 0.4769325880322506, + "grad_norm": 0.013979957439005375, + "learning_rate": 5.253270439709218e-05, + "loss": 0.19074589014053345, + "step": 111090 + }, + { + "epoch": 0.47697552012227057, + "grad_norm": 0.026473939418792725, + "learning_rate": 5.252839267697456e-05, + "loss": 0.1719115138053894, + "step": 111100 + }, + { + "epoch": 0.4770184522122906, + "grad_norm": 1.366654872894287, + "learning_rate": 5.2524080956856935e-05, + "loss": 0.18147858381271362, + "step": 111110 + }, + { + "epoch": 0.4770613843023106, + "grad_norm": 3.7447845935821533, + "learning_rate": 5.2519769236739305e-05, + "loss": 0.40271430015563964, + "step": 111120 + }, + { + "epoch": 0.4771043163923306, + "grad_norm": 3.210329532623291, + "learning_rate": 5.251545751662168e-05, + "loss": 0.19403756856918336, + "step": 111130 + }, + { + "epoch": 0.4771472484823506, + "grad_norm": 1.8033745288848877, + "learning_rate": 5.251114579650406e-05, + "loss": 0.3300994634628296, + "step": 111140 + }, + { + "epoch": 0.4771901805723706, + "grad_norm": 0.12008707970380783, + "learning_rate": 5.250683407638644e-05, + "loss": 0.33939058780670167, + "step": 111150 + }, + { + "epoch": 0.47723311266239066, + "grad_norm": 2.0061378479003906, + "learning_rate": 5.2502522356268815e-05, + "loss": 0.26279187202453613, + "step": 111160 + }, + { + "epoch": 0.47727604475241064, + "grad_norm": 0.08203338086605072, + "learning_rate": 5.249821063615119e-05, + "loss": 0.10013169050216675, + "step": 111170 + }, + { + "epoch": 0.47731897684243063, + "grad_norm": 0.15610380470752716, + "learning_rate": 5.249389891603357e-05, + "loss": 0.18954191207885743, + "step": 111180 + }, + { + "epoch": 0.4773619089324507, + "grad_norm": 4.496754169464111, + "learning_rate": 5.248958719591595e-05, + "loss": 0.1938277006149292, + "step": 111190 + }, + { + "epoch": 0.47740484102247066, + "grad_norm": 0.13328373432159424, + "learning_rate": 5.248527547579831e-05, + "loss": 0.24010121822357178, + "step": 111200 + }, + { + "epoch": 0.47744777311249065, + "grad_norm": 0.1279555857181549, + "learning_rate": 5.248096375568069e-05, + "loss": 0.11613349914550782, + "step": 111210 + }, + { + "epoch": 0.4774907052025107, + "grad_norm": 0.0011030016466975212, + "learning_rate": 5.2476652035563065e-05, + "loss": 0.07406458854675294, + "step": 111220 + }, + { + "epoch": 0.4775336372925307, + "grad_norm": 3.7059450149536133, + "learning_rate": 5.247234031544544e-05, + "loss": 0.18384324312210082, + "step": 111230 + }, + { + "epoch": 0.47757656938255066, + "grad_norm": 1.8988912105560303, + "learning_rate": 5.246802859532782e-05, + "loss": 0.407755708694458, + "step": 111240 + }, + { + "epoch": 0.4776195014725707, + "grad_norm": 2.294595956802368, + "learning_rate": 5.24637168752102e-05, + "loss": 0.3127762317657471, + "step": 111250 + }, + { + "epoch": 0.4776624335625907, + "grad_norm": 0.10534092783927917, + "learning_rate": 5.245940515509258e-05, + "loss": 0.29289281368255615, + "step": 111260 + }, + { + "epoch": 0.4777053656526107, + "grad_norm": 0.11372819542884827, + "learning_rate": 5.245509343497496e-05, + "loss": 0.1034854531288147, + "step": 111270 + }, + { + "epoch": 0.4777482977426307, + "grad_norm": 3.390481472015381, + "learning_rate": 5.245078171485732e-05, + "loss": 0.14818179607391357, + "step": 111280 + }, + { + "epoch": 0.4777912298326507, + "grad_norm": 0.8811556100845337, + "learning_rate": 5.24464699947397e-05, + "loss": 0.2420907974243164, + "step": 111290 + }, + { + "epoch": 0.4778341619226707, + "grad_norm": 34.448577880859375, + "learning_rate": 5.244215827462208e-05, + "loss": 0.21140847206115723, + "step": 111300 + }, + { + "epoch": 0.47787709401269074, + "grad_norm": 0.5432401299476624, + "learning_rate": 5.2437846554504454e-05, + "loss": 0.11056315898895264, + "step": 111310 + }, + { + "epoch": 0.4779200261027107, + "grad_norm": 0.012676320970058441, + "learning_rate": 5.243353483438683e-05, + "loss": 0.32334301471710203, + "step": 111320 + }, + { + "epoch": 0.4779629581927307, + "grad_norm": 0.014294223859906197, + "learning_rate": 5.242922311426921e-05, + "loss": 0.09135165214538574, + "step": 111330 + }, + { + "epoch": 0.47800589028275076, + "grad_norm": 0.05797756835818291, + "learning_rate": 5.2424911394151586e-05, + "loss": 0.048809555172920224, + "step": 111340 + }, + { + "epoch": 0.47804882237277074, + "grad_norm": 2.065965175628662, + "learning_rate": 5.2420599674033964e-05, + "loss": 0.1818181872367859, + "step": 111350 + }, + { + "epoch": 0.47809175446279073, + "grad_norm": 0.17510123550891876, + "learning_rate": 5.241628795391634e-05, + "loss": 0.1729954957962036, + "step": 111360 + }, + { + "epoch": 0.4781346865528108, + "grad_norm": 4.402050971984863, + "learning_rate": 5.241197623379871e-05, + "loss": 0.4581557273864746, + "step": 111370 + }, + { + "epoch": 0.47817761864283076, + "grad_norm": 1.0645898580551147, + "learning_rate": 5.240766451368109e-05, + "loss": 0.08990298509597779, + "step": 111380 + }, + { + "epoch": 0.4782205507328508, + "grad_norm": 0.0008161990554071963, + "learning_rate": 5.2403352793563466e-05, + "loss": 0.30890114307403566, + "step": 111390 + }, + { + "epoch": 0.4782634828228708, + "grad_norm": 0.07157833129167557, + "learning_rate": 5.2399041073445844e-05, + "loss": 0.3999182224273682, + "step": 111400 + }, + { + "epoch": 0.4783064149128908, + "grad_norm": 0.011867137625813484, + "learning_rate": 5.239472935332822e-05, + "loss": 0.24128921031951905, + "step": 111410 + }, + { + "epoch": 0.4783493470029108, + "grad_norm": 1.8518315553665161, + "learning_rate": 5.23904176332106e-05, + "loss": 0.22349026203155517, + "step": 111420 + }, + { + "epoch": 0.4783922790929308, + "grad_norm": 0.2319876104593277, + "learning_rate": 5.2386105913092976e-05, + "loss": 0.26111981868743894, + "step": 111430 + }, + { + "epoch": 0.4784352111829508, + "grad_norm": 0.006205259822309017, + "learning_rate": 5.238179419297535e-05, + "loss": 0.26369264125823977, + "step": 111440 + }, + { + "epoch": 0.47847814327297084, + "grad_norm": 0.11651434004306793, + "learning_rate": 5.237748247285772e-05, + "loss": 0.3667259931564331, + "step": 111450 + }, + { + "epoch": 0.4785210753629908, + "grad_norm": 0.014610327780246735, + "learning_rate": 5.2373170752740094e-05, + "loss": 0.1900045394897461, + "step": 111460 + }, + { + "epoch": 0.4785640074530108, + "grad_norm": 0.023935405537486076, + "learning_rate": 5.236885903262247e-05, + "loss": 0.177459454536438, + "step": 111470 + }, + { + "epoch": 0.47860693954303085, + "grad_norm": 7.586323261260986, + "learning_rate": 5.2364547312504856e-05, + "loss": 0.30872330665588377, + "step": 111480 + }, + { + "epoch": 0.47864987163305084, + "grad_norm": 0.0036038036923855543, + "learning_rate": 5.236023559238723e-05, + "loss": 0.14827797412872315, + "step": 111490 + }, + { + "epoch": 0.4786928037230708, + "grad_norm": 0.006241375580430031, + "learning_rate": 5.235592387226961e-05, + "loss": 0.16753029823303223, + "step": 111500 + }, + { + "epoch": 0.47873573581309087, + "grad_norm": 1.5585517883300781, + "learning_rate": 5.235161215215199e-05, + "loss": 0.1405550241470337, + "step": 111510 + }, + { + "epoch": 0.47877866790311085, + "grad_norm": 0.1075335368514061, + "learning_rate": 5.2347300432034365e-05, + "loss": 0.08326172828674316, + "step": 111520 + }, + { + "epoch": 0.47882159999313084, + "grad_norm": 0.9561521410942078, + "learning_rate": 5.234298871191673e-05, + "loss": 0.18973840475082399, + "step": 111530 + }, + { + "epoch": 0.4788645320831509, + "grad_norm": 0.9297080039978027, + "learning_rate": 5.2338676991799106e-05, + "loss": 0.13529865741729735, + "step": 111540 + }, + { + "epoch": 0.47890746417317087, + "grad_norm": 10.015151977539062, + "learning_rate": 5.2334365271681484e-05, + "loss": 0.4769240379333496, + "step": 111550 + }, + { + "epoch": 0.47895039626319086, + "grad_norm": 1.3597183227539062, + "learning_rate": 5.233005355156386e-05, + "loss": 0.30367794036865237, + "step": 111560 + }, + { + "epoch": 0.4789933283532109, + "grad_norm": 0.005315948743373156, + "learning_rate": 5.232574183144624e-05, + "loss": 0.2673523187637329, + "step": 111570 + }, + { + "epoch": 0.4790362604432309, + "grad_norm": 0.38153043389320374, + "learning_rate": 5.2321430111328616e-05, + "loss": 0.3261179685592651, + "step": 111580 + }, + { + "epoch": 0.47907919253325093, + "grad_norm": 0.05338694155216217, + "learning_rate": 5.231711839121099e-05, + "loss": 0.1568708062171936, + "step": 111590 + }, + { + "epoch": 0.4791221246232709, + "grad_norm": 0.016230806708335876, + "learning_rate": 5.231280667109337e-05, + "loss": 0.37447845935821533, + "step": 111600 + }, + { + "epoch": 0.4791650567132909, + "grad_norm": 0.18410959839820862, + "learning_rate": 5.230849495097574e-05, + "loss": 0.05639318823814392, + "step": 111610 + }, + { + "epoch": 0.47920798880331095, + "grad_norm": 1.1493514776229858, + "learning_rate": 5.230418323085812e-05, + "loss": 0.22289519309997557, + "step": 111620 + }, + { + "epoch": 0.47925092089333093, + "grad_norm": 0.036008015275001526, + "learning_rate": 5.2299871510740496e-05, + "loss": 0.011840692907571792, + "step": 111630 + }, + { + "epoch": 0.4792938529833509, + "grad_norm": 0.007809455972164869, + "learning_rate": 5.229555979062287e-05, + "loss": 0.09044709801673889, + "step": 111640 + }, + { + "epoch": 0.47933678507337096, + "grad_norm": 0.03709058091044426, + "learning_rate": 5.229124807050525e-05, + "loss": 0.042278504371643065, + "step": 111650 + }, + { + "epoch": 0.47937971716339095, + "grad_norm": 1.3962702751159668, + "learning_rate": 5.228693635038763e-05, + "loss": 0.3972173690795898, + "step": 111660 + }, + { + "epoch": 0.47942264925341094, + "grad_norm": 1.8785520792007446, + "learning_rate": 5.2282624630270005e-05, + "loss": 0.3868700504302979, + "step": 111670 + }, + { + "epoch": 0.479465581343431, + "grad_norm": 1.7913538217544556, + "learning_rate": 5.227831291015238e-05, + "loss": 0.11502017974853515, + "step": 111680 + }, + { + "epoch": 0.47950851343345097, + "grad_norm": 0.012045775540173054, + "learning_rate": 5.227400119003476e-05, + "loss": 0.1622507929801941, + "step": 111690 + }, + { + "epoch": 0.47955144552347095, + "grad_norm": 0.004656339529901743, + "learning_rate": 5.226968946991713e-05, + "loss": 0.1231924057006836, + "step": 111700 + }, + { + "epoch": 0.479594377613491, + "grad_norm": 1.5003231763839722, + "learning_rate": 5.226537774979951e-05, + "loss": 0.31814677715301515, + "step": 111710 + }, + { + "epoch": 0.479637309703511, + "grad_norm": 0.01938486658036709, + "learning_rate": 5.2261066029681885e-05, + "loss": 0.08459774255752564, + "step": 111720 + }, + { + "epoch": 0.47968024179353097, + "grad_norm": 1.3566876649856567, + "learning_rate": 5.225675430956426e-05, + "loss": 0.07076024413108825, + "step": 111730 + }, + { + "epoch": 0.479723173883551, + "grad_norm": 1.2478437423706055, + "learning_rate": 5.225244258944664e-05, + "loss": 0.2324007272720337, + "step": 111740 + }, + { + "epoch": 0.479766105973571, + "grad_norm": 1.2657784223556519, + "learning_rate": 5.224813086932902e-05, + "loss": 0.20952317714691163, + "step": 111750 + }, + { + "epoch": 0.479809038063591, + "grad_norm": 1.6232753992080688, + "learning_rate": 5.2243819149211394e-05, + "loss": 0.19485619068145751, + "step": 111760 + }, + { + "epoch": 0.47985197015361103, + "grad_norm": 0.009501464664936066, + "learning_rate": 5.223950742909377e-05, + "loss": 0.20127930641174316, + "step": 111770 + }, + { + "epoch": 0.479894902243631, + "grad_norm": 0.003903453005477786, + "learning_rate": 5.2235195708976135e-05, + "loss": 0.17025061845779418, + "step": 111780 + }, + { + "epoch": 0.479937834333651, + "grad_norm": 0.023910705000162125, + "learning_rate": 5.223088398885851e-05, + "loss": 0.2176452398300171, + "step": 111790 + }, + { + "epoch": 0.47998076642367105, + "grad_norm": 0.0150348711758852, + "learning_rate": 5.222657226874089e-05, + "loss": 0.15627822875976563, + "step": 111800 + }, + { + "epoch": 0.48002369851369103, + "grad_norm": 0.006376425735652447, + "learning_rate": 5.222226054862327e-05, + "loss": 0.2449338436126709, + "step": 111810 + }, + { + "epoch": 0.4800666306037111, + "grad_norm": 0.03453676775097847, + "learning_rate": 5.2217948828505645e-05, + "loss": 0.07500687837600709, + "step": 111820 + }, + { + "epoch": 0.48010956269373106, + "grad_norm": 0.01049389410763979, + "learning_rate": 5.221363710838802e-05, + "loss": 0.38739445209503176, + "step": 111830 + }, + { + "epoch": 0.48015249478375105, + "grad_norm": 0.003912179730832577, + "learning_rate": 5.22093253882704e-05, + "loss": 0.06761714220046997, + "step": 111840 + }, + { + "epoch": 0.4801954268737711, + "grad_norm": 1.2374663352966309, + "learning_rate": 5.2205013668152784e-05, + "loss": 0.22694122791290283, + "step": 111850 + }, + { + "epoch": 0.4802383589637911, + "grad_norm": 0.4927384853363037, + "learning_rate": 5.220070194803515e-05, + "loss": 0.02246246635913849, + "step": 111860 + }, + { + "epoch": 0.48028129105381107, + "grad_norm": 54.35266876220703, + "learning_rate": 5.2196390227917525e-05, + "loss": 0.1613282561302185, + "step": 111870 + }, + { + "epoch": 0.4803242231438311, + "grad_norm": 1.6677411794662476, + "learning_rate": 5.21920785077999e-05, + "loss": 0.036837369203567505, + "step": 111880 + }, + { + "epoch": 0.4803671552338511, + "grad_norm": 0.15668274462223053, + "learning_rate": 5.218776678768228e-05, + "loss": 0.08144662380218506, + "step": 111890 + }, + { + "epoch": 0.4804100873238711, + "grad_norm": 0.2714574933052063, + "learning_rate": 5.218345506756466e-05, + "loss": 0.1942346692085266, + "step": 111900 + }, + { + "epoch": 0.4804530194138911, + "grad_norm": 3.1457133293151855, + "learning_rate": 5.2179143347447034e-05, + "loss": 0.3857900857925415, + "step": 111910 + }, + { + "epoch": 0.4804959515039111, + "grad_norm": 2.2899222373962402, + "learning_rate": 5.217483162732941e-05, + "loss": 0.2843789577484131, + "step": 111920 + }, + { + "epoch": 0.4805388835939311, + "grad_norm": 0.030405929312109947, + "learning_rate": 5.217051990721179e-05, + "loss": 0.08297332525253295, + "step": 111930 + }, + { + "epoch": 0.48058181568395114, + "grad_norm": 0.010032770223915577, + "learning_rate": 5.216620818709416e-05, + "loss": 0.08996413946151734, + "step": 111940 + }, + { + "epoch": 0.48062474777397113, + "grad_norm": 0.0011612273519858718, + "learning_rate": 5.216189646697654e-05, + "loss": 0.2334982395172119, + "step": 111950 + }, + { + "epoch": 0.4806676798639911, + "grad_norm": 0.02428930439054966, + "learning_rate": 5.2157584746858914e-05, + "loss": 0.21660916805267333, + "step": 111960 + }, + { + "epoch": 0.48071061195401116, + "grad_norm": 0.00992624368518591, + "learning_rate": 5.215327302674129e-05, + "loss": 0.08710908889770508, + "step": 111970 + }, + { + "epoch": 0.48075354404403114, + "grad_norm": 1.4480361938476562, + "learning_rate": 5.214896130662367e-05, + "loss": 0.20752573013305664, + "step": 111980 + }, + { + "epoch": 0.48079647613405113, + "grad_norm": 0.0048288386315107346, + "learning_rate": 5.2144649586506046e-05, + "loss": 0.25462770462036133, + "step": 111990 + }, + { + "epoch": 0.4808394082240712, + "grad_norm": 3.1555609703063965, + "learning_rate": 5.214033786638842e-05, + "loss": 0.06526825428009034, + "step": 112000 + }, + { + "epoch": 0.4808394082240712, + "eval_loss": 0.4083115756511688, + "eval_runtime": 27.112, + "eval_samples_per_second": 3.688, + "eval_steps_per_second": 3.688, + "step": 112000 + }, + { + "epoch": 0.48088234031409116, + "grad_norm": 0.0345025397837162, + "learning_rate": 5.21360261462708e-05, + "loss": 0.20930685997009277, + "step": 112010 + }, + { + "epoch": 0.4809252724041112, + "grad_norm": 0.7449776530265808, + "learning_rate": 5.213171442615318e-05, + "loss": 0.521756362915039, + "step": 112020 + }, + { + "epoch": 0.4809682044941312, + "grad_norm": 0.010897096246480942, + "learning_rate": 5.212740270603554e-05, + "loss": 0.06535886526107788, + "step": 112030 + }, + { + "epoch": 0.4810111365841512, + "grad_norm": 1.3106818199157715, + "learning_rate": 5.212309098591792e-05, + "loss": 0.2568356037139893, + "step": 112040 + }, + { + "epoch": 0.4810540686741712, + "grad_norm": 1.2762078046798706, + "learning_rate": 5.2118779265800296e-05, + "loss": 0.1292945384979248, + "step": 112050 + }, + { + "epoch": 0.4810970007641912, + "grad_norm": 1.3804891109466553, + "learning_rate": 5.2114467545682674e-05, + "loss": 0.32599267959594724, + "step": 112060 + }, + { + "epoch": 0.4811399328542112, + "grad_norm": 0.0030066012404859066, + "learning_rate": 5.211015582556506e-05, + "loss": 0.33525660037994387, + "step": 112070 + }, + { + "epoch": 0.48118286494423124, + "grad_norm": 3.0664565563201904, + "learning_rate": 5.2105844105447435e-05, + "loss": 0.2135364294052124, + "step": 112080 + }, + { + "epoch": 0.4812257970342512, + "grad_norm": 1.4243364334106445, + "learning_rate": 5.210153238532981e-05, + "loss": 0.33560497760772706, + "step": 112090 + }, + { + "epoch": 0.4812687291242712, + "grad_norm": 6.101076126098633, + "learning_rate": 5.209722066521219e-05, + "loss": 0.43417649269104003, + "step": 112100 + }, + { + "epoch": 0.48131166121429125, + "grad_norm": 0.004235812928527594, + "learning_rate": 5.2092908945094554e-05, + "loss": 0.2360063076019287, + "step": 112110 + }, + { + "epoch": 0.48135459330431124, + "grad_norm": 3.2225868701934814, + "learning_rate": 5.208859722497693e-05, + "loss": 0.3149883270263672, + "step": 112120 + }, + { + "epoch": 0.4813975253943312, + "grad_norm": 0.22777503728866577, + "learning_rate": 5.208428550485931e-05, + "loss": 0.11221933364868164, + "step": 112130 + }, + { + "epoch": 0.48144045748435127, + "grad_norm": 0.0066849710419774055, + "learning_rate": 5.2079973784741686e-05, + "loss": 0.20609259605407715, + "step": 112140 + }, + { + "epoch": 0.48148338957437126, + "grad_norm": 2.6385879516601562, + "learning_rate": 5.207566206462406e-05, + "loss": 0.10182238817214966, + "step": 112150 + }, + { + "epoch": 0.48152632166439124, + "grad_norm": 6.075347900390625, + "learning_rate": 5.207135034450644e-05, + "loss": 0.1820530891418457, + "step": 112160 + }, + { + "epoch": 0.4815692537544113, + "grad_norm": 0.19385981559753418, + "learning_rate": 5.206703862438882e-05, + "loss": 0.25064907073974607, + "step": 112170 + }, + { + "epoch": 0.4816121858444313, + "grad_norm": 0.003163368906825781, + "learning_rate": 5.2062726904271195e-05, + "loss": 0.2682561159133911, + "step": 112180 + }, + { + "epoch": 0.48165511793445126, + "grad_norm": 0.0014554493827745318, + "learning_rate": 5.2058415184153566e-05, + "loss": 0.07060363292694091, + "step": 112190 + }, + { + "epoch": 0.4816980500244713, + "grad_norm": 1.6436724662780762, + "learning_rate": 5.205410346403594e-05, + "loss": 0.2483814001083374, + "step": 112200 + }, + { + "epoch": 0.4817409821144913, + "grad_norm": 0.10719986259937286, + "learning_rate": 5.204979174391832e-05, + "loss": 0.10259265899658203, + "step": 112210 + }, + { + "epoch": 0.4817839142045113, + "grad_norm": 0.07681338489055634, + "learning_rate": 5.20454800238007e-05, + "loss": 0.1327905535697937, + "step": 112220 + }, + { + "epoch": 0.4818268462945313, + "grad_norm": 2.0866005420684814, + "learning_rate": 5.2041168303683075e-05, + "loss": 0.16332383155822755, + "step": 112230 + }, + { + "epoch": 0.4818697783845513, + "grad_norm": 0.059452347457408905, + "learning_rate": 5.203685658356545e-05, + "loss": 0.25697765350341795, + "step": 112240 + }, + { + "epoch": 0.48191271047457135, + "grad_norm": 0.689960777759552, + "learning_rate": 5.203254486344783e-05, + "loss": 0.22725882530212402, + "step": 112250 + }, + { + "epoch": 0.48195564256459134, + "grad_norm": 2.9958412647247314, + "learning_rate": 5.202823314333021e-05, + "loss": 0.11253809928894043, + "step": 112260 + }, + { + "epoch": 0.4819985746546113, + "grad_norm": 0.010006138123571873, + "learning_rate": 5.202392142321257e-05, + "loss": 0.2789454936981201, + "step": 112270 + }, + { + "epoch": 0.48204150674463137, + "grad_norm": 29.61517906188965, + "learning_rate": 5.201960970309495e-05, + "loss": 0.29062979221343993, + "step": 112280 + }, + { + "epoch": 0.48208443883465135, + "grad_norm": 1.6461533308029175, + "learning_rate": 5.201529798297733e-05, + "loss": 0.26518611907958983, + "step": 112290 + }, + { + "epoch": 0.48212737092467134, + "grad_norm": 0.8490138053894043, + "learning_rate": 5.201098626285971e-05, + "loss": 0.14062355756759642, + "step": 112300 + }, + { + "epoch": 0.4821703030146914, + "grad_norm": 1.953096628189087, + "learning_rate": 5.200667454274209e-05, + "loss": 0.2001904010772705, + "step": 112310 + }, + { + "epoch": 0.48221323510471137, + "grad_norm": 0.07899369299411774, + "learning_rate": 5.2002362822624464e-05, + "loss": 0.078703773021698, + "step": 112320 + }, + { + "epoch": 0.48225616719473136, + "grad_norm": 0.0437178909778595, + "learning_rate": 5.199805110250684e-05, + "loss": 0.08275541067123413, + "step": 112330 + }, + { + "epoch": 0.4822990992847514, + "grad_norm": 0.14887292683124542, + "learning_rate": 5.199373938238922e-05, + "loss": 0.15887333154678346, + "step": 112340 + }, + { + "epoch": 0.4823420313747714, + "grad_norm": 0.015832485631108284, + "learning_rate": 5.198942766227158e-05, + "loss": 0.2449730396270752, + "step": 112350 + }, + { + "epoch": 0.48238496346479137, + "grad_norm": 1.3846042156219482, + "learning_rate": 5.198511594215396e-05, + "loss": 0.4144141674041748, + "step": 112360 + }, + { + "epoch": 0.4824278955548114, + "grad_norm": 0.39883336424827576, + "learning_rate": 5.198080422203634e-05, + "loss": 0.17197943925857545, + "step": 112370 + }, + { + "epoch": 0.4824708276448314, + "grad_norm": 0.035733215510845184, + "learning_rate": 5.1976492501918715e-05, + "loss": 0.2461705446243286, + "step": 112380 + }, + { + "epoch": 0.4825137597348514, + "grad_norm": 4.164612293243408, + "learning_rate": 5.197218078180109e-05, + "loss": 0.0967523694038391, + "step": 112390 + }, + { + "epoch": 0.48255669182487143, + "grad_norm": 1.2880102396011353, + "learning_rate": 5.196786906168347e-05, + "loss": 0.10995590686798096, + "step": 112400 + }, + { + "epoch": 0.4825996239148914, + "grad_norm": 22.918487548828125, + "learning_rate": 5.196355734156585e-05, + "loss": 0.14426627159118652, + "step": 112410 + }, + { + "epoch": 0.4826425560049114, + "grad_norm": 0.009621814824640751, + "learning_rate": 5.1959245621448224e-05, + "loss": 0.05674814581871033, + "step": 112420 + }, + { + "epoch": 0.48268548809493145, + "grad_norm": 1.0595000982284546, + "learning_rate": 5.19549339013306e-05, + "loss": 0.17759565114974976, + "step": 112430 + }, + { + "epoch": 0.48272842018495143, + "grad_norm": 0.08510831743478775, + "learning_rate": 5.195062218121297e-05, + "loss": 0.4445448875427246, + "step": 112440 + }, + { + "epoch": 0.4827713522749715, + "grad_norm": 28.908931732177734, + "learning_rate": 5.194631046109535e-05, + "loss": 0.34495205879211427, + "step": 112450 + }, + { + "epoch": 0.48281428436499146, + "grad_norm": 0.07188957184553146, + "learning_rate": 5.194199874097773e-05, + "loss": 0.23050713539123535, + "step": 112460 + }, + { + "epoch": 0.48285721645501145, + "grad_norm": 0.01424756832420826, + "learning_rate": 5.1937687020860104e-05, + "loss": 0.14939489364624023, + "step": 112470 + }, + { + "epoch": 0.4829001485450315, + "grad_norm": 1.1364645957946777, + "learning_rate": 5.193337530074248e-05, + "loss": 0.3412284851074219, + "step": 112480 + }, + { + "epoch": 0.4829430806350515, + "grad_norm": 0.003248439868912101, + "learning_rate": 5.192906358062486e-05, + "loss": 0.1077422022819519, + "step": 112490 + }, + { + "epoch": 0.48298601272507147, + "grad_norm": 0.3519462049007416, + "learning_rate": 5.1924751860507236e-05, + "loss": 0.27584822177886964, + "step": 112500 + }, + { + "epoch": 0.4830289448150915, + "grad_norm": 0.0038426381070166826, + "learning_rate": 5.1920440140389614e-05, + "loss": 0.19264932870864868, + "step": 112510 + }, + { + "epoch": 0.4830718769051115, + "grad_norm": 0.17295725643634796, + "learning_rate": 5.1916128420271984e-05, + "loss": 0.4067643642425537, + "step": 112520 + }, + { + "epoch": 0.4831148089951315, + "grad_norm": 0.014537591487169266, + "learning_rate": 5.191181670015436e-05, + "loss": 0.12884639501571654, + "step": 112530 + }, + { + "epoch": 0.4831577410851515, + "grad_norm": 3.0052669048309326, + "learning_rate": 5.190750498003674e-05, + "loss": 0.16474716663360595, + "step": 112540 + }, + { + "epoch": 0.4832006731751715, + "grad_norm": 0.0008527276222594082, + "learning_rate": 5.1903193259919116e-05, + "loss": 0.15970051288604736, + "step": 112550 + }, + { + "epoch": 0.4832436052651915, + "grad_norm": 3.4651949405670166, + "learning_rate": 5.1898881539801494e-05, + "loss": 0.3635735273361206, + "step": 112560 + }, + { + "epoch": 0.48328653735521154, + "grad_norm": 0.04663752019405365, + "learning_rate": 5.189456981968387e-05, + "loss": 0.16978216171264648, + "step": 112570 + }, + { + "epoch": 0.48332946944523153, + "grad_norm": 0.04115908965468407, + "learning_rate": 5.189025809956625e-05, + "loss": 0.18163001537322998, + "step": 112580 + }, + { + "epoch": 0.4833724015352515, + "grad_norm": 0.003294061403721571, + "learning_rate": 5.1885946379448626e-05, + "loss": 0.21141493320465088, + "step": 112590 + }, + { + "epoch": 0.48341533362527156, + "grad_norm": 0.26114872097969055, + "learning_rate": 5.188163465933099e-05, + "loss": 0.2178705930709839, + "step": 112600 + }, + { + "epoch": 0.48345826571529155, + "grad_norm": 0.08138860762119293, + "learning_rate": 5.187732293921337e-05, + "loss": 0.1656543493270874, + "step": 112610 + }, + { + "epoch": 0.48350119780531153, + "grad_norm": 0.029856372624635696, + "learning_rate": 5.1873011219095744e-05, + "loss": 0.27136709690093996, + "step": 112620 + }, + { + "epoch": 0.4835441298953316, + "grad_norm": 0.8668439388275146, + "learning_rate": 5.186869949897812e-05, + "loss": 0.06196349859237671, + "step": 112630 + }, + { + "epoch": 0.48358706198535156, + "grad_norm": 0.5561325550079346, + "learning_rate": 5.18643877788605e-05, + "loss": 0.05971835851669312, + "step": 112640 + }, + { + "epoch": 0.48362999407537155, + "grad_norm": 0.10063795745372772, + "learning_rate": 5.1860076058742876e-05, + "loss": 0.21844873428344727, + "step": 112650 + }, + { + "epoch": 0.4836729261653916, + "grad_norm": 0.0034989193081855774, + "learning_rate": 5.1855764338625253e-05, + "loss": 0.1683848023414612, + "step": 112660 + }, + { + "epoch": 0.4837158582554116, + "grad_norm": 0.005735354032367468, + "learning_rate": 5.185145261850764e-05, + "loss": 0.05226213932037353, + "step": 112670 + }, + { + "epoch": 0.4837587903454316, + "grad_norm": 0.01305412221699953, + "learning_rate": 5.184714089839e-05, + "loss": 0.1914324641227722, + "step": 112680 + }, + { + "epoch": 0.4838017224354516, + "grad_norm": 0.005273500457406044, + "learning_rate": 5.184282917827238e-05, + "loss": 0.09522543549537658, + "step": 112690 + }, + { + "epoch": 0.4838446545254716, + "grad_norm": 1.982298731803894, + "learning_rate": 5.1838517458154756e-05, + "loss": 0.1866297483444214, + "step": 112700 + }, + { + "epoch": 0.48388758661549164, + "grad_norm": 1.0073444843292236, + "learning_rate": 5.183420573803713e-05, + "loss": 0.1342653751373291, + "step": 112710 + }, + { + "epoch": 0.4839305187055116, + "grad_norm": 2.56182599067688, + "learning_rate": 5.182989401791951e-05, + "loss": 0.5220149040222168, + "step": 112720 + }, + { + "epoch": 0.4839734507955316, + "grad_norm": 0.007171725854277611, + "learning_rate": 5.182558229780189e-05, + "loss": 0.1385490655899048, + "step": 112730 + }, + { + "epoch": 0.48401638288555165, + "grad_norm": 0.1748906672000885, + "learning_rate": 5.1821270577684265e-05, + "loss": 0.19893691539764405, + "step": 112740 + }, + { + "epoch": 0.48405931497557164, + "grad_norm": 3.96621036529541, + "learning_rate": 5.181695885756664e-05, + "loss": 0.4268038272857666, + "step": 112750 + }, + { + "epoch": 0.48410224706559163, + "grad_norm": 0.04518533870577812, + "learning_rate": 5.181264713744902e-05, + "loss": 0.04139855802059174, + "step": 112760 + }, + { + "epoch": 0.48414517915561167, + "grad_norm": 0.47020474076271057, + "learning_rate": 5.180833541733139e-05, + "loss": 0.20869805812835693, + "step": 112770 + }, + { + "epoch": 0.48418811124563166, + "grad_norm": 0.0010732099181041121, + "learning_rate": 5.180402369721377e-05, + "loss": 0.18752647638320924, + "step": 112780 + }, + { + "epoch": 0.48423104333565165, + "grad_norm": 0.10524271428585052, + "learning_rate": 5.1799711977096145e-05, + "loss": 0.14028940200805665, + "step": 112790 + }, + { + "epoch": 0.4842739754256717, + "grad_norm": 0.10299362242221832, + "learning_rate": 5.179540025697852e-05, + "loss": 0.2775475740432739, + "step": 112800 + }, + { + "epoch": 0.4843169075156917, + "grad_norm": 0.15372726321220398, + "learning_rate": 5.17910885368609e-05, + "loss": 0.22584834098815917, + "step": 112810 + }, + { + "epoch": 0.48435983960571166, + "grad_norm": 0.01749584637582302, + "learning_rate": 5.178677681674328e-05, + "loss": 0.31413612365722654, + "step": 112820 + }, + { + "epoch": 0.4844027716957317, + "grad_norm": 0.005402527749538422, + "learning_rate": 5.1782465096625655e-05, + "loss": 0.3106074810028076, + "step": 112830 + }, + { + "epoch": 0.4844457037857517, + "grad_norm": 2.2965476512908936, + "learning_rate": 5.177815337650803e-05, + "loss": 0.2918811798095703, + "step": 112840 + }, + { + "epoch": 0.4844886358757717, + "grad_norm": 0.01444154977798462, + "learning_rate": 5.1773841656390396e-05, + "loss": 0.1380111336708069, + "step": 112850 + }, + { + "epoch": 0.4845315679657917, + "grad_norm": 0.0188368521630764, + "learning_rate": 5.176952993627277e-05, + "loss": 0.20080742835998536, + "step": 112860 + }, + { + "epoch": 0.4845745000558117, + "grad_norm": 0.36231333017349243, + "learning_rate": 5.176521821615515e-05, + "loss": 0.223512601852417, + "step": 112870 + }, + { + "epoch": 0.48461743214583175, + "grad_norm": 0.07586616277694702, + "learning_rate": 5.176090649603753e-05, + "loss": 0.07849234938621522, + "step": 112880 + }, + { + "epoch": 0.48466036423585174, + "grad_norm": 0.36311033368110657, + "learning_rate": 5.175659477591991e-05, + "loss": 0.28073141574859617, + "step": 112890 + }, + { + "epoch": 0.4847032963258717, + "grad_norm": 0.022957701236009598, + "learning_rate": 5.175228305580229e-05, + "loss": 0.31559345722198484, + "step": 112900 + }, + { + "epoch": 0.48474622841589177, + "grad_norm": 1.596835970878601, + "learning_rate": 5.174797133568467e-05, + "loss": 0.2369527578353882, + "step": 112910 + }, + { + "epoch": 0.48478916050591175, + "grad_norm": 0.09815307706594467, + "learning_rate": 5.1743659615567044e-05, + "loss": 0.3749623537063599, + "step": 112920 + }, + { + "epoch": 0.48483209259593174, + "grad_norm": 0.024481289088726044, + "learning_rate": 5.173934789544941e-05, + "loss": 0.10471458435058593, + "step": 112930 + }, + { + "epoch": 0.4848750246859518, + "grad_norm": 2.419257164001465, + "learning_rate": 5.1735036175331785e-05, + "loss": 0.15412943363189696, + "step": 112940 + }, + { + "epoch": 0.48491795677597177, + "grad_norm": 0.015406905673444271, + "learning_rate": 5.173072445521416e-05, + "loss": 0.1519034743309021, + "step": 112950 + }, + { + "epoch": 0.48496088886599176, + "grad_norm": 0.03381875902414322, + "learning_rate": 5.172641273509654e-05, + "loss": 0.40987367630004884, + "step": 112960 + }, + { + "epoch": 0.4850038209560118, + "grad_norm": 0.03254738822579384, + "learning_rate": 5.172210101497892e-05, + "loss": 0.1492979645729065, + "step": 112970 + }, + { + "epoch": 0.4850467530460318, + "grad_norm": 0.060563910752534866, + "learning_rate": 5.1717789294861295e-05, + "loss": 0.22253785133361817, + "step": 112980 + }, + { + "epoch": 0.4850896851360518, + "grad_norm": 0.05428497865796089, + "learning_rate": 5.171347757474367e-05, + "loss": 0.1761980414390564, + "step": 112990 + }, + { + "epoch": 0.4851326172260718, + "grad_norm": 7.6052141189575195, + "learning_rate": 5.170916585462605e-05, + "loss": 0.02835783362388611, + "step": 113000 + }, + { + "epoch": 0.4851326172260718, + "eval_loss": 0.40955492854118347, + "eval_runtime": 27.2062, + "eval_samples_per_second": 3.676, + "eval_steps_per_second": 3.676, + "step": 113000 + }, + { + "epoch": 0.4851755493160918, + "grad_norm": 0.03771714121103287, + "learning_rate": 5.170485413450842e-05, + "loss": 0.18397490978240966, + "step": 113010 + }, + { + "epoch": 0.4852184814061118, + "grad_norm": 0.0032267896458506584, + "learning_rate": 5.17005424143908e-05, + "loss": 0.044160327315330504, + "step": 113020 + }, + { + "epoch": 0.48526141349613183, + "grad_norm": 1.068603754043579, + "learning_rate": 5.1696230694273174e-05, + "loss": 0.15256195068359374, + "step": 113030 + }, + { + "epoch": 0.4853043455861518, + "grad_norm": 6.162492275238037, + "learning_rate": 5.169191897415555e-05, + "loss": 0.18141252994537355, + "step": 113040 + }, + { + "epoch": 0.4853472776761718, + "grad_norm": 0.0070702810771763325, + "learning_rate": 5.168760725403793e-05, + "loss": 0.49767394065856935, + "step": 113050 + }, + { + "epoch": 0.48539020976619185, + "grad_norm": 0.0628747045993805, + "learning_rate": 5.1683295533920306e-05, + "loss": 0.1891782522201538, + "step": 113060 + }, + { + "epoch": 0.48543314185621184, + "grad_norm": 0.00538325309753418, + "learning_rate": 5.1678983813802684e-05, + "loss": 0.12313296794891357, + "step": 113070 + }, + { + "epoch": 0.4854760739462318, + "grad_norm": 0.3890695869922638, + "learning_rate": 5.167467209368506e-05, + "loss": 0.007072269916534424, + "step": 113080 + }, + { + "epoch": 0.48551900603625187, + "grad_norm": 0.1812918335199356, + "learning_rate": 5.1670360373567425e-05, + "loss": 0.16834324598312378, + "step": 113090 + }, + { + "epoch": 0.48556193812627185, + "grad_norm": 1.4702471494674683, + "learning_rate": 5.16660486534498e-05, + "loss": 0.11569063663482666, + "step": 113100 + }, + { + "epoch": 0.4856048702162919, + "grad_norm": 0.1837807446718216, + "learning_rate": 5.1661736933332186e-05, + "loss": 0.12595237493515016, + "step": 113110 + }, + { + "epoch": 0.4856478023063119, + "grad_norm": 3.104374885559082, + "learning_rate": 5.1657425213214564e-05, + "loss": 0.2997664213180542, + "step": 113120 + }, + { + "epoch": 0.48569073439633187, + "grad_norm": 0.06540167331695557, + "learning_rate": 5.165311349309694e-05, + "loss": 0.15600260496139526, + "step": 113130 + }, + { + "epoch": 0.4857336664863519, + "grad_norm": 2.046344041824341, + "learning_rate": 5.164880177297932e-05, + "loss": 0.1797309160232544, + "step": 113140 + }, + { + "epoch": 0.4857765985763719, + "grad_norm": 0.003572971560060978, + "learning_rate": 5.1644490052861696e-05, + "loss": 0.2054067850112915, + "step": 113150 + }, + { + "epoch": 0.4858195306663919, + "grad_norm": 0.017250074073672295, + "learning_rate": 5.164017833274407e-05, + "loss": 0.40507144927978517, + "step": 113160 + }, + { + "epoch": 0.48586246275641193, + "grad_norm": 0.5369052290916443, + "learning_rate": 5.163586661262645e-05, + "loss": 0.448713493347168, + "step": 113170 + }, + { + "epoch": 0.4859053948464319, + "grad_norm": 0.4330086410045624, + "learning_rate": 5.1631554892508814e-05, + "loss": 0.1150307297706604, + "step": 113180 + }, + { + "epoch": 0.4859483269364519, + "grad_norm": 0.06005656346678734, + "learning_rate": 5.162724317239119e-05, + "loss": 0.5557120323181153, + "step": 113190 + }, + { + "epoch": 0.48599125902647194, + "grad_norm": 0.04298854619264603, + "learning_rate": 5.162293145227357e-05, + "loss": 0.14865396022796631, + "step": 113200 + }, + { + "epoch": 0.48603419111649193, + "grad_norm": 0.14638099074363708, + "learning_rate": 5.1618619732155946e-05, + "loss": 0.14868471622467042, + "step": 113210 + }, + { + "epoch": 0.4860771232065119, + "grad_norm": 0.03624096140265465, + "learning_rate": 5.1614308012038324e-05, + "loss": 0.20731801986694337, + "step": 113220 + }, + { + "epoch": 0.48612005529653196, + "grad_norm": 0.004342042841017246, + "learning_rate": 5.16099962919207e-05, + "loss": 0.11572116613388062, + "step": 113230 + }, + { + "epoch": 0.48616298738655195, + "grad_norm": 1.1985704898834229, + "learning_rate": 5.160568457180308e-05, + "loss": 0.11593867540359497, + "step": 113240 + }, + { + "epoch": 0.48620591947657193, + "grad_norm": 0.6859614849090576, + "learning_rate": 5.1601372851685456e-05, + "loss": 0.17292320728302002, + "step": 113250 + }, + { + "epoch": 0.486248851566592, + "grad_norm": 1.7073107957839966, + "learning_rate": 5.1597061131567826e-05, + "loss": 0.32567050457000735, + "step": 113260 + }, + { + "epoch": 0.48629178365661196, + "grad_norm": 1.3878463506698608, + "learning_rate": 5.1592749411450204e-05, + "loss": 0.32795984745025636, + "step": 113270 + }, + { + "epoch": 0.48633471574663195, + "grad_norm": 0.05851823836565018, + "learning_rate": 5.158843769133258e-05, + "loss": 0.1408895492553711, + "step": 113280 + }, + { + "epoch": 0.486377647836652, + "grad_norm": 0.006685543339699507, + "learning_rate": 5.158412597121496e-05, + "loss": 0.16352103948593139, + "step": 113290 + }, + { + "epoch": 0.486420579926672, + "grad_norm": 0.2041519284248352, + "learning_rate": 5.1579814251097336e-05, + "loss": 0.3879366397857666, + "step": 113300 + }, + { + "epoch": 0.486463512016692, + "grad_norm": 1.1903445720672607, + "learning_rate": 5.157550253097971e-05, + "loss": 0.3377037525177002, + "step": 113310 + }, + { + "epoch": 0.486506444106712, + "grad_norm": 2.3848395347595215, + "learning_rate": 5.157119081086209e-05, + "loss": 0.18910210132598876, + "step": 113320 + }, + { + "epoch": 0.486549376196732, + "grad_norm": 1.192986249923706, + "learning_rate": 5.156687909074447e-05, + "loss": 0.24098713397979737, + "step": 113330 + }, + { + "epoch": 0.48659230828675204, + "grad_norm": 0.9725649356842041, + "learning_rate": 5.156256737062684e-05, + "loss": 0.3244537115097046, + "step": 113340 + }, + { + "epoch": 0.486635240376772, + "grad_norm": 0.1657193899154663, + "learning_rate": 5.1558255650509216e-05, + "loss": 0.12726542949676514, + "step": 113350 + }, + { + "epoch": 0.486678172466792, + "grad_norm": 0.17325559258460999, + "learning_rate": 5.155394393039159e-05, + "loss": 0.2331669569015503, + "step": 113360 + }, + { + "epoch": 0.48672110455681206, + "grad_norm": 1.8775490522384644, + "learning_rate": 5.154963221027397e-05, + "loss": 0.15301434993743895, + "step": 113370 + }, + { + "epoch": 0.48676403664683204, + "grad_norm": 0.09411582350730896, + "learning_rate": 5.154532049015635e-05, + "loss": 0.15416059494018555, + "step": 113380 + }, + { + "epoch": 0.48680696873685203, + "grad_norm": 2.0111148357391357, + "learning_rate": 5.1541008770038725e-05, + "loss": 0.3471565008163452, + "step": 113390 + }, + { + "epoch": 0.4868499008268721, + "grad_norm": 0.0025355094112455845, + "learning_rate": 5.15366970499211e-05, + "loss": 0.38851518630981446, + "step": 113400 + }, + { + "epoch": 0.48689283291689206, + "grad_norm": 0.3443334996700287, + "learning_rate": 5.153238532980348e-05, + "loss": 0.2016951322555542, + "step": 113410 + }, + { + "epoch": 0.48693576500691205, + "grad_norm": 0.005935294553637505, + "learning_rate": 5.1528073609685843e-05, + "loss": 0.3032489538192749, + "step": 113420 + }, + { + "epoch": 0.4869786970969321, + "grad_norm": 5.571764945983887, + "learning_rate": 5.152376188956822e-05, + "loss": 0.32014267444610595, + "step": 113430 + }, + { + "epoch": 0.4870216291869521, + "grad_norm": 0.26585423946380615, + "learning_rate": 5.15194501694506e-05, + "loss": 0.09606272578239441, + "step": 113440 + }, + { + "epoch": 0.48706456127697206, + "grad_norm": 5.361762523651123, + "learning_rate": 5.1515138449332975e-05, + "loss": 0.3446836233139038, + "step": 113450 + }, + { + "epoch": 0.4871074933669921, + "grad_norm": 1.5857845544815063, + "learning_rate": 5.151082672921535e-05, + "loss": 0.08704413771629334, + "step": 113460 + }, + { + "epoch": 0.4871504254570121, + "grad_norm": 0.07909522950649261, + "learning_rate": 5.150651500909773e-05, + "loss": 0.10852447748184205, + "step": 113470 + }, + { + "epoch": 0.4871933575470321, + "grad_norm": 0.011265222914516926, + "learning_rate": 5.1502203288980114e-05, + "loss": 0.03509455025196075, + "step": 113480 + }, + { + "epoch": 0.4872362896370521, + "grad_norm": 0.5867186784744263, + "learning_rate": 5.149789156886249e-05, + "loss": 0.2819956302642822, + "step": 113490 + }, + { + "epoch": 0.4872792217270721, + "grad_norm": 0.015352309681475163, + "learning_rate": 5.149357984874487e-05, + "loss": 0.17799873352050782, + "step": 113500 + }, + { + "epoch": 0.4873221538170921, + "grad_norm": 3.170710802078247, + "learning_rate": 5.148926812862723e-05, + "loss": 0.2149796962738037, + "step": 113510 + }, + { + "epoch": 0.48736508590711214, + "grad_norm": 0.02258378453552723, + "learning_rate": 5.148495640850961e-05, + "loss": 0.16186734437942504, + "step": 113520 + }, + { + "epoch": 0.4874080179971321, + "grad_norm": 0.015534556470811367, + "learning_rate": 5.148064468839199e-05, + "loss": 0.281885027885437, + "step": 113530 + }, + { + "epoch": 0.48745095008715217, + "grad_norm": 0.7893404364585876, + "learning_rate": 5.1476332968274365e-05, + "loss": 0.13327269554138182, + "step": 113540 + }, + { + "epoch": 0.48749388217717216, + "grad_norm": 0.028485532850027084, + "learning_rate": 5.147202124815674e-05, + "loss": 0.27498056888580324, + "step": 113550 + }, + { + "epoch": 0.48753681426719214, + "grad_norm": 3.629746437072754, + "learning_rate": 5.146770952803912e-05, + "loss": 0.19867525100708008, + "step": 113560 + }, + { + "epoch": 0.4875797463572122, + "grad_norm": 1.8920320272445679, + "learning_rate": 5.14633978079215e-05, + "loss": 0.30650577545166013, + "step": 113570 + }, + { + "epoch": 0.48762267844723217, + "grad_norm": 3.0940048694610596, + "learning_rate": 5.1459086087803874e-05, + "loss": 0.19545323848724366, + "step": 113580 + }, + { + "epoch": 0.48766561053725216, + "grad_norm": 0.041216179728507996, + "learning_rate": 5.1454774367686245e-05, + "loss": 0.2117311477661133, + "step": 113590 + }, + { + "epoch": 0.4877085426272722, + "grad_norm": 0.07402265071868896, + "learning_rate": 5.145046264756862e-05, + "loss": 0.21154932975769042, + "step": 113600 + }, + { + "epoch": 0.4877514747172922, + "grad_norm": 0.004615292884409428, + "learning_rate": 5.1446150927451e-05, + "loss": 0.2221027135848999, + "step": 113610 + }, + { + "epoch": 0.4877944068073122, + "grad_norm": 0.05810505896806717, + "learning_rate": 5.144183920733338e-05, + "loss": 0.12148820161819458, + "step": 113620 + }, + { + "epoch": 0.4878373388973322, + "grad_norm": 0.13232535123825073, + "learning_rate": 5.1437527487215754e-05, + "loss": 0.3908156633377075, + "step": 113630 + }, + { + "epoch": 0.4878802709873522, + "grad_norm": 0.018233170732855797, + "learning_rate": 5.143321576709813e-05, + "loss": 0.19533765316009521, + "step": 113640 + }, + { + "epoch": 0.4879232030773722, + "grad_norm": 0.09772051870822906, + "learning_rate": 5.142890404698051e-05, + "loss": 0.2228538751602173, + "step": 113650 + }, + { + "epoch": 0.48796613516739223, + "grad_norm": 0.013814345002174377, + "learning_rate": 5.1424592326862886e-05, + "loss": 0.40313119888305665, + "step": 113660 + }, + { + "epoch": 0.4880090672574122, + "grad_norm": 0.018356265500187874, + "learning_rate": 5.142028060674525e-05, + "loss": 0.3107442855834961, + "step": 113670 + }, + { + "epoch": 0.4880519993474322, + "grad_norm": 1.6710891723632812, + "learning_rate": 5.141596888662763e-05, + "loss": 0.11449255943298339, + "step": 113680 + }, + { + "epoch": 0.48809493143745225, + "grad_norm": 0.003830577014014125, + "learning_rate": 5.1411657166510005e-05, + "loss": 0.1889503240585327, + "step": 113690 + }, + { + "epoch": 0.48813786352747224, + "grad_norm": 0.1663198471069336, + "learning_rate": 5.140734544639239e-05, + "loss": 0.2108973741531372, + "step": 113700 + }, + { + "epoch": 0.4881807956174922, + "grad_norm": 0.09352283924818039, + "learning_rate": 5.1403033726274766e-05, + "loss": 0.342281436920166, + "step": 113710 + }, + { + "epoch": 0.48822372770751227, + "grad_norm": 16.00942039489746, + "learning_rate": 5.139872200615714e-05, + "loss": 0.16771682500839233, + "step": 113720 + }, + { + "epoch": 0.48826665979753225, + "grad_norm": 0.3389633595943451, + "learning_rate": 5.139441028603952e-05, + "loss": 0.2200084686279297, + "step": 113730 + }, + { + "epoch": 0.4883095918875523, + "grad_norm": 0.07837370038032532, + "learning_rate": 5.13900985659219e-05, + "loss": 0.10341674089431763, + "step": 113740 + }, + { + "epoch": 0.4883525239775723, + "grad_norm": 0.8534498810768127, + "learning_rate": 5.138578684580426e-05, + "loss": 0.19216973781585694, + "step": 113750 + }, + { + "epoch": 0.48839545606759227, + "grad_norm": 0.02206382155418396, + "learning_rate": 5.138147512568664e-05, + "loss": 0.23950448036193847, + "step": 113760 + }, + { + "epoch": 0.4884383881576123, + "grad_norm": 0.008670583367347717, + "learning_rate": 5.1377163405569017e-05, + "loss": 0.4655170440673828, + "step": 113770 + }, + { + "epoch": 0.4884813202476323, + "grad_norm": 1.3391717672348022, + "learning_rate": 5.1372851685451394e-05, + "loss": 0.3201841115951538, + "step": 113780 + }, + { + "epoch": 0.4885242523376523, + "grad_norm": 6.023839473724365, + "learning_rate": 5.136853996533377e-05, + "loss": 0.3036703586578369, + "step": 113790 + }, + { + "epoch": 0.48856718442767233, + "grad_norm": 0.001943222712725401, + "learning_rate": 5.136422824521615e-05, + "loss": 0.34198386669158937, + "step": 113800 + }, + { + "epoch": 0.4886101165176923, + "grad_norm": 3.020825147628784, + "learning_rate": 5.1359916525098526e-05, + "loss": 0.10895280838012696, + "step": 113810 + }, + { + "epoch": 0.4886530486077123, + "grad_norm": 0.034754056483507156, + "learning_rate": 5.13556048049809e-05, + "loss": 0.055017054080963135, + "step": 113820 + }, + { + "epoch": 0.48869598069773235, + "grad_norm": 1.155048131942749, + "learning_rate": 5.1351293084863274e-05, + "loss": 0.11602849960327148, + "step": 113830 + }, + { + "epoch": 0.48873891278775233, + "grad_norm": 0.03798946365714073, + "learning_rate": 5.134698136474565e-05, + "loss": 0.09914991855621338, + "step": 113840 + }, + { + "epoch": 0.4887818448777723, + "grad_norm": 0.05045421048998833, + "learning_rate": 5.134266964462803e-05, + "loss": 0.18398324251174927, + "step": 113850 + }, + { + "epoch": 0.48882477696779236, + "grad_norm": 1.2780303955078125, + "learning_rate": 5.1338357924510406e-05, + "loss": 0.11606618165969848, + "step": 113860 + }, + { + "epoch": 0.48886770905781235, + "grad_norm": 0.3048155605792999, + "learning_rate": 5.133404620439278e-05, + "loss": 0.3404239177703857, + "step": 113870 + }, + { + "epoch": 0.48891064114783234, + "grad_norm": 1.7724405527114868, + "learning_rate": 5.132973448427516e-05, + "loss": 0.3235398530960083, + "step": 113880 + }, + { + "epoch": 0.4889535732378524, + "grad_norm": 2.0320141315460205, + "learning_rate": 5.132542276415754e-05, + "loss": 0.4618537902832031, + "step": 113890 + }, + { + "epoch": 0.48899650532787237, + "grad_norm": 0.7048388719558716, + "learning_rate": 5.1321111044039915e-05, + "loss": 0.26695349216461184, + "step": 113900 + }, + { + "epoch": 0.48903943741789235, + "grad_norm": 0.5743983387947083, + "learning_rate": 5.131679932392229e-05, + "loss": 0.034299665689468385, + "step": 113910 + }, + { + "epoch": 0.4890823695079124, + "grad_norm": 0.0030826705042272806, + "learning_rate": 5.131248760380466e-05, + "loss": 0.25197324752807615, + "step": 113920 + }, + { + "epoch": 0.4891253015979324, + "grad_norm": 0.0842275321483612, + "learning_rate": 5.130817588368704e-05, + "loss": 0.2050330400466919, + "step": 113930 + }, + { + "epoch": 0.48916823368795237, + "grad_norm": 0.2123071551322937, + "learning_rate": 5.130386416356942e-05, + "loss": 0.3663323402404785, + "step": 113940 + }, + { + "epoch": 0.4892111657779724, + "grad_norm": 0.0033182615879923105, + "learning_rate": 5.1299552443451795e-05, + "loss": 0.2355727195739746, + "step": 113950 + }, + { + "epoch": 0.4892540978679924, + "grad_norm": 0.00881041120737791, + "learning_rate": 5.129524072333417e-05, + "loss": 0.1516958475112915, + "step": 113960 + }, + { + "epoch": 0.48929702995801244, + "grad_norm": 1.4162713289260864, + "learning_rate": 5.129092900321655e-05, + "loss": 0.28292834758758545, + "step": 113970 + }, + { + "epoch": 0.48933996204803243, + "grad_norm": 0.005841487552970648, + "learning_rate": 5.128661728309893e-05, + "loss": 0.19476345777511597, + "step": 113980 + }, + { + "epoch": 0.4893828941380524, + "grad_norm": 0.00311907846480608, + "learning_rate": 5.1282305562981305e-05, + "loss": 0.19814854860305786, + "step": 113990 + }, + { + "epoch": 0.48942582622807246, + "grad_norm": 5.6313557624816895, + "learning_rate": 5.127799384286367e-05, + "loss": 0.23403961658477784, + "step": 114000 + }, + { + "epoch": 0.48942582622807246, + "eval_loss": 0.40528884530067444, + "eval_runtime": 27.1409, + "eval_samples_per_second": 3.684, + "eval_steps_per_second": 3.684, + "step": 114000 + }, + { + "epoch": 0.48946875831809245, + "grad_norm": 1.033115267753601, + "learning_rate": 5.1273682122746046e-05, + "loss": 0.35855731964111326, + "step": 114010 + }, + { + "epoch": 0.48951169040811243, + "grad_norm": 0.8171420097351074, + "learning_rate": 5.126937040262842e-05, + "loss": 0.1914085030555725, + "step": 114020 + }, + { + "epoch": 0.4895546224981325, + "grad_norm": 0.03207477182149887, + "learning_rate": 5.12650586825108e-05, + "loss": 0.21447515487670898, + "step": 114030 + }, + { + "epoch": 0.48959755458815246, + "grad_norm": 0.037823960185050964, + "learning_rate": 5.126074696239318e-05, + "loss": 0.24335236549377443, + "step": 114040 + }, + { + "epoch": 0.48964048667817245, + "grad_norm": 5.856334686279297, + "learning_rate": 5.1256435242275555e-05, + "loss": 0.20237469673156738, + "step": 114050 + }, + { + "epoch": 0.4896834187681925, + "grad_norm": 0.7282851338386536, + "learning_rate": 5.125212352215793e-05, + "loss": 0.15984612703323364, + "step": 114060 + }, + { + "epoch": 0.4897263508582125, + "grad_norm": 0.05635266751050949, + "learning_rate": 5.124781180204031e-05, + "loss": 0.14566596746444702, + "step": 114070 + }, + { + "epoch": 0.48976928294823247, + "grad_norm": 0.4587521255016327, + "learning_rate": 5.124350008192268e-05, + "loss": 0.03238787353038788, + "step": 114080 + }, + { + "epoch": 0.4898122150382525, + "grad_norm": 0.017040731385350227, + "learning_rate": 5.123918836180506e-05, + "loss": 0.0728563904762268, + "step": 114090 + }, + { + "epoch": 0.4898551471282725, + "grad_norm": 1.607393503189087, + "learning_rate": 5.1234876641687435e-05, + "loss": 0.05377238988876343, + "step": 114100 + }, + { + "epoch": 0.4898980792182925, + "grad_norm": 0.1536511927843094, + "learning_rate": 5.123056492156981e-05, + "loss": 0.25159103870391847, + "step": 114110 + }, + { + "epoch": 0.4899410113083125, + "grad_norm": 0.09835078567266464, + "learning_rate": 5.122625320145219e-05, + "loss": 0.1219517469406128, + "step": 114120 + }, + { + "epoch": 0.4899839433983325, + "grad_norm": 0.20577646791934967, + "learning_rate": 5.122194148133457e-05, + "loss": 0.19758754968643188, + "step": 114130 + }, + { + "epoch": 0.4900268754883525, + "grad_norm": 0.0003962178307119757, + "learning_rate": 5.1217629761216944e-05, + "loss": 0.1164817452430725, + "step": 114140 + }, + { + "epoch": 0.49006980757837254, + "grad_norm": 36.232933044433594, + "learning_rate": 5.121331804109932e-05, + "loss": 0.262998104095459, + "step": 114150 + }, + { + "epoch": 0.4901127396683925, + "grad_norm": 0.004099687095731497, + "learning_rate": 5.120900632098169e-05, + "loss": 0.2619943141937256, + "step": 114160 + }, + { + "epoch": 0.49015567175841257, + "grad_norm": 1.8435301780700684, + "learning_rate": 5.120469460086407e-05, + "loss": 0.3348772764205933, + "step": 114170 + }, + { + "epoch": 0.49019860384843256, + "grad_norm": 0.10621852427721024, + "learning_rate": 5.120038288074645e-05, + "loss": 0.26117191314697263, + "step": 114180 + }, + { + "epoch": 0.49024153593845254, + "grad_norm": 1.9179303646087646, + "learning_rate": 5.1196071160628824e-05, + "loss": 0.3860593318939209, + "step": 114190 + }, + { + "epoch": 0.4902844680284726, + "grad_norm": 0.0544576533138752, + "learning_rate": 5.11917594405112e-05, + "loss": 0.33986380100250246, + "step": 114200 + }, + { + "epoch": 0.4903274001184926, + "grad_norm": 0.003706451505422592, + "learning_rate": 5.118744772039358e-05, + "loss": 0.12367761135101318, + "step": 114210 + }, + { + "epoch": 0.49037033220851256, + "grad_norm": 0.1084350049495697, + "learning_rate": 5.1183136000275956e-05, + "loss": 0.2694986343383789, + "step": 114220 + }, + { + "epoch": 0.4904132642985326, + "grad_norm": 1.461536169052124, + "learning_rate": 5.1178824280158334e-05, + "loss": 0.17770261764526368, + "step": 114230 + }, + { + "epoch": 0.4904561963885526, + "grad_norm": 5.399726390838623, + "learning_rate": 5.117451256004071e-05, + "loss": 0.3205733299255371, + "step": 114240 + }, + { + "epoch": 0.4904991284785726, + "grad_norm": 0.01629946567118168, + "learning_rate": 5.1170200839923075e-05, + "loss": 0.047853922843933104, + "step": 114250 + }, + { + "epoch": 0.4905420605685926, + "grad_norm": 0.0395156666636467, + "learning_rate": 5.116588911980545e-05, + "loss": 0.3599144458770752, + "step": 114260 + }, + { + "epoch": 0.4905849926586126, + "grad_norm": 3.736130475997925, + "learning_rate": 5.116157739968783e-05, + "loss": 0.299530553817749, + "step": 114270 + }, + { + "epoch": 0.4906279247486326, + "grad_norm": 0.013340250588953495, + "learning_rate": 5.115726567957021e-05, + "loss": 0.1913688898086548, + "step": 114280 + }, + { + "epoch": 0.49067085683865264, + "grad_norm": 3.088780403137207, + "learning_rate": 5.1152953959452584e-05, + "loss": 0.37713470458984377, + "step": 114290 + }, + { + "epoch": 0.4907137889286726, + "grad_norm": 1.7537235021591187, + "learning_rate": 5.114864223933497e-05, + "loss": 0.2248607873916626, + "step": 114300 + }, + { + "epoch": 0.4907567210186926, + "grad_norm": 0.20872154831886292, + "learning_rate": 5.1144330519217346e-05, + "loss": 0.3299898147583008, + "step": 114310 + }, + { + "epoch": 0.49079965310871265, + "grad_norm": 0.05676392465829849, + "learning_rate": 5.114001879909972e-05, + "loss": 0.16998063325881957, + "step": 114320 + }, + { + "epoch": 0.49084258519873264, + "grad_norm": 1.6964046955108643, + "learning_rate": 5.113570707898209e-05, + "loss": 0.21034905910491944, + "step": 114330 + }, + { + "epoch": 0.4908855172887526, + "grad_norm": 0.03953026235103607, + "learning_rate": 5.1131395358864464e-05, + "loss": 0.4823922634124756, + "step": 114340 + }, + { + "epoch": 0.49092844937877267, + "grad_norm": 0.2652646005153656, + "learning_rate": 5.112708363874684e-05, + "loss": 0.28086440563201903, + "step": 114350 + }, + { + "epoch": 0.49097138146879266, + "grad_norm": 0.023406682536005974, + "learning_rate": 5.112277191862922e-05, + "loss": 0.14792462587356567, + "step": 114360 + }, + { + "epoch": 0.49101431355881264, + "grad_norm": 0.017988061532378197, + "learning_rate": 5.1118460198511596e-05, + "loss": 0.10092716217041016, + "step": 114370 + }, + { + "epoch": 0.4910572456488327, + "grad_norm": 0.343831866979599, + "learning_rate": 5.1114148478393973e-05, + "loss": 0.06460964083671569, + "step": 114380 + }, + { + "epoch": 0.4911001777388527, + "grad_norm": 0.30238035321235657, + "learning_rate": 5.110983675827635e-05, + "loss": 0.2701455354690552, + "step": 114390 + }, + { + "epoch": 0.4911431098288727, + "grad_norm": 0.0027301537338644266, + "learning_rate": 5.110552503815873e-05, + "loss": 0.21025633811950684, + "step": 114400 + }, + { + "epoch": 0.4911860419188927, + "grad_norm": 0.024237489327788353, + "learning_rate": 5.11012133180411e-05, + "loss": 0.2406532049179077, + "step": 114410 + }, + { + "epoch": 0.4912289740089127, + "grad_norm": 0.09928843379020691, + "learning_rate": 5.1096901597923476e-05, + "loss": 0.19101529121398925, + "step": 114420 + }, + { + "epoch": 0.49127190609893273, + "grad_norm": 0.013309850357472897, + "learning_rate": 5.1092589877805853e-05, + "loss": 0.21105690002441407, + "step": 114430 + }, + { + "epoch": 0.4913148381889527, + "grad_norm": 0.041601624339818954, + "learning_rate": 5.108827815768823e-05, + "loss": 0.22969114780426025, + "step": 114440 + }, + { + "epoch": 0.4913577702789727, + "grad_norm": 0.0371188260614872, + "learning_rate": 5.108396643757061e-05, + "loss": 0.15212260484695433, + "step": 114450 + }, + { + "epoch": 0.49140070236899275, + "grad_norm": 0.01270721573382616, + "learning_rate": 5.1079654717452985e-05, + "loss": 0.1889193534851074, + "step": 114460 + }, + { + "epoch": 0.49144363445901273, + "grad_norm": 0.2520737946033478, + "learning_rate": 5.107534299733536e-05, + "loss": 0.2561648845672607, + "step": 114470 + }, + { + "epoch": 0.4914865665490327, + "grad_norm": 2.448040008544922, + "learning_rate": 5.107103127721774e-05, + "loss": 0.33474059104919435, + "step": 114480 + }, + { + "epoch": 0.49152949863905276, + "grad_norm": 0.042958565056324005, + "learning_rate": 5.1066719557100104e-05, + "loss": 0.19774354696273805, + "step": 114490 + }, + { + "epoch": 0.49157243072907275, + "grad_norm": 0.07219936698675156, + "learning_rate": 5.106240783698248e-05, + "loss": 0.14528067111968995, + "step": 114500 + }, + { + "epoch": 0.49161536281909274, + "grad_norm": 2.0706522464752197, + "learning_rate": 5.105809611686486e-05, + "loss": 0.11780478954315185, + "step": 114510 + }, + { + "epoch": 0.4916582949091128, + "grad_norm": 0.023876963183283806, + "learning_rate": 5.105378439674724e-05, + "loss": 0.10775841474533081, + "step": 114520 + }, + { + "epoch": 0.49170122699913277, + "grad_norm": 0.006041590124368668, + "learning_rate": 5.104947267662962e-05, + "loss": 0.0707572877407074, + "step": 114530 + }, + { + "epoch": 0.49174415908915275, + "grad_norm": 0.09596258401870728, + "learning_rate": 5.1045160956512e-05, + "loss": 0.23318018913269042, + "step": 114540 + }, + { + "epoch": 0.4917870911791728, + "grad_norm": 5.521082878112793, + "learning_rate": 5.1040849236394375e-05, + "loss": 0.3243843078613281, + "step": 114550 + }, + { + "epoch": 0.4918300232691928, + "grad_norm": 6.089881896972656, + "learning_rate": 5.103653751627675e-05, + "loss": 0.15074145793914795, + "step": 114560 + }, + { + "epoch": 0.49187295535921277, + "grad_norm": 0.09942862391471863, + "learning_rate": 5.103222579615913e-05, + "loss": 0.1948152780532837, + "step": 114570 + }, + { + "epoch": 0.4919158874492328, + "grad_norm": 0.05901400372385979, + "learning_rate": 5.102791407604149e-05, + "loss": 0.27257773876190183, + "step": 114580 + }, + { + "epoch": 0.4919588195392528, + "grad_norm": 0.6826162934303284, + "learning_rate": 5.102360235592387e-05, + "loss": 0.25472943782806395, + "step": 114590 + }, + { + "epoch": 0.49200175162927284, + "grad_norm": 3.5605061054229736, + "learning_rate": 5.101929063580625e-05, + "loss": 0.33192465305328367, + "step": 114600 + }, + { + "epoch": 0.49204468371929283, + "grad_norm": 1.9935866594314575, + "learning_rate": 5.1014978915688625e-05, + "loss": 0.27500338554382325, + "step": 114610 + }, + { + "epoch": 0.4920876158093128, + "grad_norm": 0.005706002935767174, + "learning_rate": 5.1010667195571e-05, + "loss": 0.31219775676727296, + "step": 114620 + }, + { + "epoch": 0.49213054789933286, + "grad_norm": 0.11540130525827408, + "learning_rate": 5.100635547545338e-05, + "loss": 0.22799878120422362, + "step": 114630 + }, + { + "epoch": 0.49217347998935285, + "grad_norm": 5.40323543548584, + "learning_rate": 5.100204375533576e-05, + "loss": 0.23047494888305664, + "step": 114640 + }, + { + "epoch": 0.49221641207937283, + "grad_norm": 0.0065496861934661865, + "learning_rate": 5.0997732035218135e-05, + "loss": 0.11466679573059083, + "step": 114650 + }, + { + "epoch": 0.4922593441693929, + "grad_norm": 0.027168719097971916, + "learning_rate": 5.0993420315100505e-05, + "loss": 0.12369179725646973, + "step": 114660 + }, + { + "epoch": 0.49230227625941286, + "grad_norm": 0.31811249256134033, + "learning_rate": 5.098910859498288e-05, + "loss": 0.1178821325302124, + "step": 114670 + }, + { + "epoch": 0.49234520834943285, + "grad_norm": 0.0057378895580768585, + "learning_rate": 5.098479687486526e-05, + "loss": 0.3201324462890625, + "step": 114680 + }, + { + "epoch": 0.4923881404394529, + "grad_norm": 0.03739020228385925, + "learning_rate": 5.098048515474764e-05, + "loss": 0.28364293575286864, + "step": 114690 + }, + { + "epoch": 0.4924310725294729, + "grad_norm": 0.014455210417509079, + "learning_rate": 5.0976173434630015e-05, + "loss": 0.1237752914428711, + "step": 114700 + }, + { + "epoch": 0.49247400461949287, + "grad_norm": 0.05205453932285309, + "learning_rate": 5.097186171451239e-05, + "loss": 0.18404200077056884, + "step": 114710 + }, + { + "epoch": 0.4925169367095129, + "grad_norm": 12.226811408996582, + "learning_rate": 5.096754999439477e-05, + "loss": 0.4771873474121094, + "step": 114720 + }, + { + "epoch": 0.4925598687995329, + "grad_norm": 2.7584354877471924, + "learning_rate": 5.0963238274277147e-05, + "loss": 0.2770639657974243, + "step": 114730 + }, + { + "epoch": 0.4926028008895529, + "grad_norm": 0.0015160737093538046, + "learning_rate": 5.095892655415952e-05, + "loss": 0.2827488422393799, + "step": 114740 + }, + { + "epoch": 0.4926457329795729, + "grad_norm": 0.006125219166278839, + "learning_rate": 5.0954614834041894e-05, + "loss": 0.06660739183425904, + "step": 114750 + }, + { + "epoch": 0.4926886650695929, + "grad_norm": 0.1934569776058197, + "learning_rate": 5.095030311392427e-05, + "loss": 0.13013845682144165, + "step": 114760 + }, + { + "epoch": 0.4927315971596129, + "grad_norm": 0.5882942080497742, + "learning_rate": 5.094599139380665e-05, + "loss": 0.19378734827041627, + "step": 114770 + }, + { + "epoch": 0.49277452924963294, + "grad_norm": 0.19520607590675354, + "learning_rate": 5.0941679673689027e-05, + "loss": 0.24061408042907714, + "step": 114780 + }, + { + "epoch": 0.49281746133965293, + "grad_norm": 0.09184468537569046, + "learning_rate": 5.0937367953571404e-05, + "loss": 0.12484440803527833, + "step": 114790 + }, + { + "epoch": 0.4928603934296729, + "grad_norm": 0.0007245758315548301, + "learning_rate": 5.093305623345378e-05, + "loss": 0.0343919962644577, + "step": 114800 + }, + { + "epoch": 0.49290332551969296, + "grad_norm": 0.10586666315793991, + "learning_rate": 5.092874451333616e-05, + "loss": 0.3540113210678101, + "step": 114810 + }, + { + "epoch": 0.49294625760971295, + "grad_norm": 7.368129730224609, + "learning_rate": 5.092443279321852e-05, + "loss": 0.12444863319396973, + "step": 114820 + }, + { + "epoch": 0.492989189699733, + "grad_norm": 1.8489593267440796, + "learning_rate": 5.09201210731009e-05, + "loss": 0.10474470853805543, + "step": 114830 + }, + { + "epoch": 0.493032121789753, + "grad_norm": 2.487107515335083, + "learning_rate": 5.091580935298328e-05, + "loss": 0.22773370742797852, + "step": 114840 + }, + { + "epoch": 0.49307505387977296, + "grad_norm": 0.15341585874557495, + "learning_rate": 5.0911497632865654e-05, + "loss": 0.15825777053833007, + "step": 114850 + }, + { + "epoch": 0.493117985969793, + "grad_norm": 1.0390082597732544, + "learning_rate": 5.090718591274803e-05, + "loss": 0.14906710386276245, + "step": 114860 + }, + { + "epoch": 0.493160918059813, + "grad_norm": 4.846892833709717, + "learning_rate": 5.090287419263041e-05, + "loss": 0.31097846031188964, + "step": 114870 + }, + { + "epoch": 0.493203850149833, + "grad_norm": 4.312889099121094, + "learning_rate": 5.0898562472512786e-05, + "loss": 0.33429646492004395, + "step": 114880 + }, + { + "epoch": 0.493246782239853, + "grad_norm": 0.00027503733872435987, + "learning_rate": 5.089425075239517e-05, + "loss": 0.06513903737068176, + "step": 114890 + }, + { + "epoch": 0.493289714329873, + "grad_norm": 2.0348081588745117, + "learning_rate": 5.0889939032277534e-05, + "loss": 0.3529532432556152, + "step": 114900 + }, + { + "epoch": 0.493332646419893, + "grad_norm": 0.02989332005381584, + "learning_rate": 5.088562731215991e-05, + "loss": 0.14261656999588013, + "step": 114910 + }, + { + "epoch": 0.49337557850991304, + "grad_norm": 0.09872865676879883, + "learning_rate": 5.088131559204229e-05, + "loss": 0.23729093074798585, + "step": 114920 + }, + { + "epoch": 0.493418510599933, + "grad_norm": 0.004361944738775492, + "learning_rate": 5.0877003871924666e-05, + "loss": 0.1671282172203064, + "step": 114930 + }, + { + "epoch": 0.493461442689953, + "grad_norm": 0.19537287950515747, + "learning_rate": 5.0872692151807044e-05, + "loss": 0.27259490489959715, + "step": 114940 + }, + { + "epoch": 0.49350437477997305, + "grad_norm": 0.21257755160331726, + "learning_rate": 5.086838043168942e-05, + "loss": 0.136817467212677, + "step": 114950 + }, + { + "epoch": 0.49354730686999304, + "grad_norm": 1.6787389516830444, + "learning_rate": 5.08640687115718e-05, + "loss": 0.24155523777008056, + "step": 114960 + }, + { + "epoch": 0.49359023896001303, + "grad_norm": 6.152478218078613, + "learning_rate": 5.0859756991454176e-05, + "loss": 0.09717947244644165, + "step": 114970 + }, + { + "epoch": 0.49363317105003307, + "grad_norm": 1.884873390197754, + "learning_rate": 5.085544527133655e-05, + "loss": 0.3864275455474854, + "step": 114980 + }, + { + "epoch": 0.49367610314005306, + "grad_norm": 2.2125508785247803, + "learning_rate": 5.0851133551218924e-05, + "loss": 0.1695851445198059, + "step": 114990 + }, + { + "epoch": 0.49371903523007304, + "grad_norm": 1.7585374116897583, + "learning_rate": 5.08468218311013e-05, + "loss": 0.2917444705963135, + "step": 115000 + }, + { + "epoch": 0.49371903523007304, + "eval_loss": 0.4183075726032257, + "eval_runtime": 27.1776, + "eval_samples_per_second": 3.68, + "eval_steps_per_second": 3.68, + "step": 115000 + }, + { + "epoch": 0.4937619673200931, + "grad_norm": 0.01179379504173994, + "learning_rate": 5.084251011098368e-05, + "loss": 0.3183038949966431, + "step": 115010 + }, + { + "epoch": 0.4938048994101131, + "grad_norm": 0.91953045129776, + "learning_rate": 5.0838198390866056e-05, + "loss": 0.2725319147109985, + "step": 115020 + }, + { + "epoch": 0.4938478315001331, + "grad_norm": 0.1682545691728592, + "learning_rate": 5.083388667074843e-05, + "loss": 0.1873003602027893, + "step": 115030 + }, + { + "epoch": 0.4938907635901531, + "grad_norm": 1.7073339223861694, + "learning_rate": 5.082957495063081e-05, + "loss": 0.3962739944458008, + "step": 115040 + }, + { + "epoch": 0.4939336956801731, + "grad_norm": 1.3014336824417114, + "learning_rate": 5.082526323051319e-05, + "loss": 0.24986536502838136, + "step": 115050 + }, + { + "epoch": 0.49397662777019313, + "grad_norm": 0.003316489513963461, + "learning_rate": 5.0820951510395565e-05, + "loss": 0.06500194072723389, + "step": 115060 + }, + { + "epoch": 0.4940195598602131, + "grad_norm": 5.03993034362793, + "learning_rate": 5.081663979027793e-05, + "loss": 0.2883466720581055, + "step": 115070 + }, + { + "epoch": 0.4940624919502331, + "grad_norm": 0.00388818490318954, + "learning_rate": 5.0812328070160306e-05, + "loss": 0.25894618034362793, + "step": 115080 + }, + { + "epoch": 0.49410542404025315, + "grad_norm": 3.0344207286834717, + "learning_rate": 5.0808016350042683e-05, + "loss": 0.16486616134643556, + "step": 115090 + }, + { + "epoch": 0.49414835613027314, + "grad_norm": 1.1234501600265503, + "learning_rate": 5.080370462992506e-05, + "loss": 0.14668926000595092, + "step": 115100 + }, + { + "epoch": 0.4941912882202931, + "grad_norm": 2.564479351043701, + "learning_rate": 5.0799392909807445e-05, + "loss": 0.23735263347625732, + "step": 115110 + }, + { + "epoch": 0.49423422031031317, + "grad_norm": 0.11280937492847443, + "learning_rate": 5.079508118968982e-05, + "loss": 0.4572554111480713, + "step": 115120 + }, + { + "epoch": 0.49427715240033315, + "grad_norm": 3.4304449558258057, + "learning_rate": 5.07907694695722e-05, + "loss": 0.21845850944519044, + "step": 115130 + }, + { + "epoch": 0.49432008449035314, + "grad_norm": 3.999896764755249, + "learning_rate": 5.078645774945458e-05, + "loss": 0.30247633457183837, + "step": 115140 + }, + { + "epoch": 0.4943630165803732, + "grad_norm": 1.523688793182373, + "learning_rate": 5.078214602933694e-05, + "loss": 0.28164350986480713, + "step": 115150 + }, + { + "epoch": 0.49440594867039317, + "grad_norm": 1.8061667680740356, + "learning_rate": 5.077783430921932e-05, + "loss": 0.14227923154830932, + "step": 115160 + }, + { + "epoch": 0.49444888076041316, + "grad_norm": 6.9911417961120605, + "learning_rate": 5.0773522589101695e-05, + "loss": 0.2907358169555664, + "step": 115170 + }, + { + "epoch": 0.4944918128504332, + "grad_norm": 0.6293550133705139, + "learning_rate": 5.076921086898407e-05, + "loss": 0.31427788734436035, + "step": 115180 + }, + { + "epoch": 0.4945347449404532, + "grad_norm": 3.471299648284912, + "learning_rate": 5.076489914886645e-05, + "loss": 0.15946595668792723, + "step": 115190 + }, + { + "epoch": 0.4945776770304732, + "grad_norm": 0.005677805282175541, + "learning_rate": 5.076058742874883e-05, + "loss": 0.10743122100830078, + "step": 115200 + }, + { + "epoch": 0.4946206091204932, + "grad_norm": 2.375127077102661, + "learning_rate": 5.0756275708631205e-05, + "loss": 0.2899677515029907, + "step": 115210 + }, + { + "epoch": 0.4946635412105132, + "grad_norm": 0.005286522675305605, + "learning_rate": 5.075196398851358e-05, + "loss": 0.2951892137527466, + "step": 115220 + }, + { + "epoch": 0.4947064733005332, + "grad_norm": 0.008677645586431026, + "learning_rate": 5.074765226839595e-05, + "loss": 0.10036700963973999, + "step": 115230 + }, + { + "epoch": 0.49474940539055323, + "grad_norm": 0.16775937378406525, + "learning_rate": 5.074334054827833e-05, + "loss": 0.4316593647003174, + "step": 115240 + }, + { + "epoch": 0.4947923374805732, + "grad_norm": 0.08770100772380829, + "learning_rate": 5.073902882816071e-05, + "loss": 0.2387186050415039, + "step": 115250 + }, + { + "epoch": 0.49483526957059326, + "grad_norm": 1.3633723258972168, + "learning_rate": 5.0734717108043085e-05, + "loss": 0.22010915279388427, + "step": 115260 + }, + { + "epoch": 0.49487820166061325, + "grad_norm": 1.4718842506408691, + "learning_rate": 5.073040538792546e-05, + "loss": 0.15852314233779907, + "step": 115270 + }, + { + "epoch": 0.49492113375063324, + "grad_norm": 0.005069954786449671, + "learning_rate": 5.072609366780784e-05, + "loss": 0.28095600605010984, + "step": 115280 + }, + { + "epoch": 0.4949640658406533, + "grad_norm": 0.18669170141220093, + "learning_rate": 5.072178194769022e-05, + "loss": 0.13922568559646606, + "step": 115290 + }, + { + "epoch": 0.49500699793067326, + "grad_norm": 1.5393754243850708, + "learning_rate": 5.0717470227572594e-05, + "loss": 0.2517578363418579, + "step": 115300 + }, + { + "epoch": 0.49504993002069325, + "grad_norm": 1.222355842590332, + "learning_rate": 5.071315850745497e-05, + "loss": 0.1524641990661621, + "step": 115310 + }, + { + "epoch": 0.4950928621107133, + "grad_norm": 0.27816998958587646, + "learning_rate": 5.0708846787337335e-05, + "loss": 0.20404572486877443, + "step": 115320 + }, + { + "epoch": 0.4951357942007333, + "grad_norm": 0.2993142008781433, + "learning_rate": 5.070453506721972e-05, + "loss": 0.20290870666503907, + "step": 115330 + }, + { + "epoch": 0.49517872629075327, + "grad_norm": 2.6680591106414795, + "learning_rate": 5.07002233471021e-05, + "loss": 0.05664870738983154, + "step": 115340 + }, + { + "epoch": 0.4952216583807733, + "grad_norm": 0.09985325485467911, + "learning_rate": 5.0695911626984474e-05, + "loss": 0.21010010242462157, + "step": 115350 + }, + { + "epoch": 0.4952645904707933, + "grad_norm": 2.5624895095825195, + "learning_rate": 5.069159990686685e-05, + "loss": 0.23997178077697753, + "step": 115360 + }, + { + "epoch": 0.4953075225608133, + "grad_norm": 0.13102348148822784, + "learning_rate": 5.068728818674923e-05, + "loss": 0.27090566158294677, + "step": 115370 + }, + { + "epoch": 0.4953504546508333, + "grad_norm": 0.15313896536827087, + "learning_rate": 5.0682976466631606e-05, + "loss": 0.2939656496047974, + "step": 115380 + }, + { + "epoch": 0.4953933867408533, + "grad_norm": 0.003520481288433075, + "learning_rate": 5.0678664746513983e-05, + "loss": 0.18922423124313353, + "step": 115390 + }, + { + "epoch": 0.4954363188308733, + "grad_norm": 10.516127586364746, + "learning_rate": 5.067435302639635e-05, + "loss": 0.27016212940216067, + "step": 115400 + }, + { + "epoch": 0.49547925092089334, + "grad_norm": 18.63870620727539, + "learning_rate": 5.0670041306278725e-05, + "loss": 0.1411109447479248, + "step": 115410 + }, + { + "epoch": 0.49552218301091333, + "grad_norm": 1.1475532054901123, + "learning_rate": 5.06657295861611e-05, + "loss": 0.2901994466781616, + "step": 115420 + }, + { + "epoch": 0.4955651151009333, + "grad_norm": 4.191442489624023, + "learning_rate": 5.066141786604348e-05, + "loss": 0.34753267765045165, + "step": 115430 + }, + { + "epoch": 0.49560804719095336, + "grad_norm": 0.3751876950263977, + "learning_rate": 5.0657106145925857e-05, + "loss": 0.2544296979904175, + "step": 115440 + }, + { + "epoch": 0.49565097928097335, + "grad_norm": 1.2921435832977295, + "learning_rate": 5.0652794425808234e-05, + "loss": 0.12484211921691894, + "step": 115450 + }, + { + "epoch": 0.4956939113709934, + "grad_norm": 3.32179594039917, + "learning_rate": 5.064848270569061e-05, + "loss": 0.3070408821105957, + "step": 115460 + }, + { + "epoch": 0.4957368434610134, + "grad_norm": 0.002367328619584441, + "learning_rate": 5.064417098557299e-05, + "loss": 0.1736527442932129, + "step": 115470 + }, + { + "epoch": 0.49577977555103336, + "grad_norm": 0.16558293998241425, + "learning_rate": 5.063985926545536e-05, + "loss": 0.3311375856399536, + "step": 115480 + }, + { + "epoch": 0.4958227076410534, + "grad_norm": 3.545715093612671, + "learning_rate": 5.0635547545337737e-05, + "loss": 0.3532984495162964, + "step": 115490 + }, + { + "epoch": 0.4958656397310734, + "grad_norm": 0.015997091308236122, + "learning_rate": 5.0631235825220114e-05, + "loss": 0.12923310995101928, + "step": 115500 + }, + { + "epoch": 0.4959085718210934, + "grad_norm": 4.145458698272705, + "learning_rate": 5.062692410510249e-05, + "loss": 0.29781742095947267, + "step": 115510 + }, + { + "epoch": 0.4959515039111134, + "grad_norm": 0.6753315329551697, + "learning_rate": 5.062261238498487e-05, + "loss": 0.2975428342819214, + "step": 115520 + }, + { + "epoch": 0.4959944360011334, + "grad_norm": 0.40239396691322327, + "learning_rate": 5.0618300664867246e-05, + "loss": 0.17573122978210448, + "step": 115530 + }, + { + "epoch": 0.4960373680911534, + "grad_norm": 1.7181788682937622, + "learning_rate": 5.061398894474962e-05, + "loss": 0.26423892974853513, + "step": 115540 + }, + { + "epoch": 0.49608030018117344, + "grad_norm": 0.3651997447013855, + "learning_rate": 5.0609677224632e-05, + "loss": 0.20962002277374267, + "step": 115550 + }, + { + "epoch": 0.4961232322711934, + "grad_norm": 22.775222778320312, + "learning_rate": 5.060536550451437e-05, + "loss": 0.2838634967803955, + "step": 115560 + }, + { + "epoch": 0.4961661643612134, + "grad_norm": 2.0066092014312744, + "learning_rate": 5.060105378439675e-05, + "loss": 0.3719503164291382, + "step": 115570 + }, + { + "epoch": 0.49620909645123346, + "grad_norm": 0.005026548635214567, + "learning_rate": 5.0596742064279126e-05, + "loss": 0.3048412322998047, + "step": 115580 + }, + { + "epoch": 0.49625202854125344, + "grad_norm": 2.4710865020751953, + "learning_rate": 5.05924303441615e-05, + "loss": 0.2783297061920166, + "step": 115590 + }, + { + "epoch": 0.49629496063127343, + "grad_norm": 2.661207914352417, + "learning_rate": 5.058811862404388e-05, + "loss": 0.18164483308792115, + "step": 115600 + }, + { + "epoch": 0.49633789272129347, + "grad_norm": 8.226025965996087e-05, + "learning_rate": 5.058380690392626e-05, + "loss": 0.2600426197052002, + "step": 115610 + }, + { + "epoch": 0.49638082481131346, + "grad_norm": 7.507086277008057, + "learning_rate": 5.0579495183808635e-05, + "loss": 0.08841606378555297, + "step": 115620 + }, + { + "epoch": 0.49642375690133345, + "grad_norm": 0.0025435436982661486, + "learning_rate": 5.057518346369101e-05, + "loss": 0.32819669246673583, + "step": 115630 + }, + { + "epoch": 0.4964666889913535, + "grad_norm": 0.012275975197553635, + "learning_rate": 5.0570871743573376e-05, + "loss": 0.11880701780319214, + "step": 115640 + }, + { + "epoch": 0.4965096210813735, + "grad_norm": 2.4064557552337646, + "learning_rate": 5.0566560023455754e-05, + "loss": 0.06984534859657288, + "step": 115650 + }, + { + "epoch": 0.49655255317139346, + "grad_norm": 0.04519079998135567, + "learning_rate": 5.056224830333813e-05, + "loss": 0.3047942638397217, + "step": 115660 + }, + { + "epoch": 0.4965954852614135, + "grad_norm": 0.003867541207000613, + "learning_rate": 5.055793658322051e-05, + "loss": 0.1949278235435486, + "step": 115670 + }, + { + "epoch": 0.4966384173514335, + "grad_norm": 29.170747756958008, + "learning_rate": 5.0553624863102886e-05, + "loss": 0.17468286752700807, + "step": 115680 + }, + { + "epoch": 0.49668134944145353, + "grad_norm": 0.8490044474601746, + "learning_rate": 5.054931314298526e-05, + "loss": 0.10209113359451294, + "step": 115690 + }, + { + "epoch": 0.4967242815314735, + "grad_norm": 0.07221094518899918, + "learning_rate": 5.054500142286765e-05, + "loss": 0.26395204067230227, + "step": 115700 + }, + { + "epoch": 0.4967672136214935, + "grad_norm": 2.139655828475952, + "learning_rate": 5.0540689702750025e-05, + "loss": 0.25615296363830564, + "step": 115710 + }, + { + "epoch": 0.49681014571151355, + "grad_norm": 3.531961441040039, + "learning_rate": 5.05363779826324e-05, + "loss": 0.34113330841064454, + "step": 115720 + }, + { + "epoch": 0.49685307780153354, + "grad_norm": 0.0022067560348659754, + "learning_rate": 5.0532066262514766e-05, + "loss": 0.2792416334152222, + "step": 115730 + }, + { + "epoch": 0.4968960098915535, + "grad_norm": 0.0736042708158493, + "learning_rate": 5.052775454239714e-05, + "loss": 0.03103959560394287, + "step": 115740 + }, + { + "epoch": 0.49693894198157357, + "grad_norm": 0.21198052167892456, + "learning_rate": 5.052344282227952e-05, + "loss": 0.16917411088943482, + "step": 115750 + }, + { + "epoch": 0.49698187407159355, + "grad_norm": 0.041175056248903275, + "learning_rate": 5.05191311021619e-05, + "loss": 0.2969489336013794, + "step": 115760 + }, + { + "epoch": 0.49702480616161354, + "grad_norm": 0.0025716687086969614, + "learning_rate": 5.0514819382044275e-05, + "loss": 0.27678828239440917, + "step": 115770 + }, + { + "epoch": 0.4970677382516336, + "grad_norm": 0.03726640343666077, + "learning_rate": 5.051050766192665e-05, + "loss": 0.21982161998748778, + "step": 115780 + }, + { + "epoch": 0.49711067034165357, + "grad_norm": 0.061146777123212814, + "learning_rate": 5.050619594180903e-05, + "loss": 0.16412194967269897, + "step": 115790 + }, + { + "epoch": 0.49715360243167356, + "grad_norm": 0.6972399353981018, + "learning_rate": 5.050188422169141e-05, + "loss": 0.19004242420196532, + "step": 115800 + }, + { + "epoch": 0.4971965345216936, + "grad_norm": 0.057269349694252014, + "learning_rate": 5.049757250157378e-05, + "loss": 0.12597191333770752, + "step": 115810 + }, + { + "epoch": 0.4972394666117136, + "grad_norm": 23.783647537231445, + "learning_rate": 5.0493260781456155e-05, + "loss": 0.23004698753356934, + "step": 115820 + }, + { + "epoch": 0.4972823987017336, + "grad_norm": 0.4776040315628052, + "learning_rate": 5.048894906133853e-05, + "loss": 0.16874227523803711, + "step": 115830 + }, + { + "epoch": 0.4973253307917536, + "grad_norm": 1.4436566829681396, + "learning_rate": 5.048463734122091e-05, + "loss": 0.09769206047058106, + "step": 115840 + }, + { + "epoch": 0.4973682628817736, + "grad_norm": 0.0008209854131564498, + "learning_rate": 5.048032562110329e-05, + "loss": 0.21059079170227052, + "step": 115850 + }, + { + "epoch": 0.4974111949717936, + "grad_norm": 0.003916463814675808, + "learning_rate": 5.0476013900985664e-05, + "loss": 0.006223606318235398, + "step": 115860 + }, + { + "epoch": 0.49745412706181363, + "grad_norm": 0.16059304773807526, + "learning_rate": 5.047170218086804e-05, + "loss": 0.2492811918258667, + "step": 115870 + }, + { + "epoch": 0.4974970591518336, + "grad_norm": 5.057432651519775, + "learning_rate": 5.046739046075042e-05, + "loss": 0.3591012477874756, + "step": 115880 + }, + { + "epoch": 0.49753999124185366, + "grad_norm": 0.0025596225168555975, + "learning_rate": 5.046307874063278e-05, + "loss": 0.17509062290191652, + "step": 115890 + }, + { + "epoch": 0.49758292333187365, + "grad_norm": 0.0018866428872570395, + "learning_rate": 5.045876702051516e-05, + "loss": 0.1546018362045288, + "step": 115900 + }, + { + "epoch": 0.49762585542189364, + "grad_norm": 0.004940045066177845, + "learning_rate": 5.045445530039754e-05, + "loss": 0.270506763458252, + "step": 115910 + }, + { + "epoch": 0.4976687875119137, + "grad_norm": 0.15914833545684814, + "learning_rate": 5.045014358027992e-05, + "loss": 0.3475140333175659, + "step": 115920 + }, + { + "epoch": 0.49771171960193367, + "grad_norm": 2.368710994720459, + "learning_rate": 5.04458318601623e-05, + "loss": 0.11058902740478516, + "step": 115930 + }, + { + "epoch": 0.49775465169195365, + "grad_norm": 1.7389672994613647, + "learning_rate": 5.0441520140044676e-05, + "loss": 0.07232906222343445, + "step": 115940 + }, + { + "epoch": 0.4977975837819737, + "grad_norm": 0.010259340517222881, + "learning_rate": 5.0437208419927054e-05, + "loss": 0.27948570251464844, + "step": 115950 + }, + { + "epoch": 0.4978405158719937, + "grad_norm": 0.05925685167312622, + "learning_rate": 5.043289669980943e-05, + "loss": 0.16085286140441896, + "step": 115960 + }, + { + "epoch": 0.49788344796201367, + "grad_norm": 2.66933536529541, + "learning_rate": 5.0428584979691795e-05, + "loss": 0.313973593711853, + "step": 115970 + }, + { + "epoch": 0.4979263800520337, + "grad_norm": 1.533821940422058, + "learning_rate": 5.042427325957417e-05, + "loss": 0.3402239799499512, + "step": 115980 + }, + { + "epoch": 0.4979693121420537, + "grad_norm": 0.11309437453746796, + "learning_rate": 5.041996153945655e-05, + "loss": 0.37814791202545167, + "step": 115990 + }, + { + "epoch": 0.4980122442320737, + "grad_norm": 1.377073884010315, + "learning_rate": 5.041564981933893e-05, + "loss": 0.2134047269821167, + "step": 116000 + }, + { + "epoch": 0.4980122442320737, + "eval_loss": 0.413227915763855, + "eval_runtime": 27.1799, + "eval_samples_per_second": 3.679, + "eval_steps_per_second": 3.679, + "step": 116000 + }, + { + "epoch": 0.49805517632209373, + "grad_norm": 1.5735690593719482, + "learning_rate": 5.0411338099221304e-05, + "loss": 0.18733230829238892, + "step": 116010 + }, + { + "epoch": 0.4980981084121137, + "grad_norm": 3.668860673904419, + "learning_rate": 5.040702637910368e-05, + "loss": 0.2786614179611206, + "step": 116020 + }, + { + "epoch": 0.4981410405021337, + "grad_norm": 0.0029571440536528826, + "learning_rate": 5.040271465898606e-05, + "loss": 0.21538405418395995, + "step": 116030 + }, + { + "epoch": 0.49818397259215375, + "grad_norm": 0.014692588709294796, + "learning_rate": 5.0398402938868436e-05, + "loss": 0.2976950407028198, + "step": 116040 + }, + { + "epoch": 0.49822690468217373, + "grad_norm": 0.007402835413813591, + "learning_rate": 5.0394091218750814e-05, + "loss": 0.05344715118408203, + "step": 116050 + }, + { + "epoch": 0.4982698367721937, + "grad_norm": 3.5247769355773926, + "learning_rate": 5.0389779498633184e-05, + "loss": 0.05012491941452026, + "step": 116060 + }, + { + "epoch": 0.49831276886221376, + "grad_norm": 0.0088571198284626, + "learning_rate": 5.038546777851556e-05, + "loss": 0.2535048484802246, + "step": 116070 + }, + { + "epoch": 0.49835570095223375, + "grad_norm": 0.02746264822781086, + "learning_rate": 5.038115605839794e-05, + "loss": 0.2725075721740723, + "step": 116080 + }, + { + "epoch": 0.49839863304225374, + "grad_norm": 0.002486494602635503, + "learning_rate": 5.0376844338280316e-05, + "loss": 0.2322972297668457, + "step": 116090 + }, + { + "epoch": 0.4984415651322738, + "grad_norm": 4.655661106109619, + "learning_rate": 5.0372532618162693e-05, + "loss": 0.36279830932617185, + "step": 116100 + }, + { + "epoch": 0.49848449722229377, + "grad_norm": 0.012788623571395874, + "learning_rate": 5.036822089804507e-05, + "loss": 0.2191479206085205, + "step": 116110 + }, + { + "epoch": 0.4985274293123138, + "grad_norm": 5.198680877685547, + "learning_rate": 5.036390917792745e-05, + "loss": 0.4196025371551514, + "step": 116120 + }, + { + "epoch": 0.4985703614023338, + "grad_norm": 0.01135294046252966, + "learning_rate": 5.0359597457809826e-05, + "loss": 0.29812934398651125, + "step": 116130 + }, + { + "epoch": 0.4986132934923538, + "grad_norm": 1.5958086252212524, + "learning_rate": 5.0355285737692196e-05, + "loss": 0.24831123352050782, + "step": 116140 + }, + { + "epoch": 0.4986562255823738, + "grad_norm": 2.2128734588623047, + "learning_rate": 5.0350974017574573e-05, + "loss": 0.13145445585250853, + "step": 116150 + }, + { + "epoch": 0.4986991576723938, + "grad_norm": 40.567691802978516, + "learning_rate": 5.034666229745695e-05, + "loss": 0.2323824405670166, + "step": 116160 + }, + { + "epoch": 0.4987420897624138, + "grad_norm": 0.033427074551582336, + "learning_rate": 5.034235057733933e-05, + "loss": 0.04265216588973999, + "step": 116170 + }, + { + "epoch": 0.49878502185243384, + "grad_norm": 0.03815077617764473, + "learning_rate": 5.0338038857221705e-05, + "loss": 0.2679791212081909, + "step": 116180 + }, + { + "epoch": 0.49882795394245383, + "grad_norm": 0.0019072276772931218, + "learning_rate": 5.033372713710408e-05, + "loss": 0.17380179166793824, + "step": 116190 + }, + { + "epoch": 0.4988708860324738, + "grad_norm": 0.010305742733180523, + "learning_rate": 5.032941541698646e-05, + "loss": 0.33670363426208494, + "step": 116200 + }, + { + "epoch": 0.49891381812249386, + "grad_norm": 0.07689042389392853, + "learning_rate": 5.032510369686884e-05, + "loss": 0.16827105283737182, + "step": 116210 + }, + { + "epoch": 0.49895675021251384, + "grad_norm": 0.10213097184896469, + "learning_rate": 5.03207919767512e-05, + "loss": 0.3205841541290283, + "step": 116220 + }, + { + "epoch": 0.49899968230253383, + "grad_norm": 0.0006489379447884858, + "learning_rate": 5.031648025663358e-05, + "loss": 0.4195698738098145, + "step": 116230 + }, + { + "epoch": 0.4990426143925539, + "grad_norm": 3.99538254737854, + "learning_rate": 5.0312168536515956e-05, + "loss": 0.32759990692138674, + "step": 116240 + }, + { + "epoch": 0.49908554648257386, + "grad_norm": 1.1936728954315186, + "learning_rate": 5.030785681639833e-05, + "loss": 0.4055778503417969, + "step": 116250 + }, + { + "epoch": 0.49912847857259385, + "grad_norm": 0.5046306848526001, + "learning_rate": 5.030354509628071e-05, + "loss": 0.200126576423645, + "step": 116260 + }, + { + "epoch": 0.4991714106626139, + "grad_norm": 0.042749445885419846, + "learning_rate": 5.029923337616309e-05, + "loss": 0.33313579559326173, + "step": 116270 + }, + { + "epoch": 0.4992143427526339, + "grad_norm": 3.7502360343933105, + "learning_rate": 5.0294921656045465e-05, + "loss": 0.15824503898620607, + "step": 116280 + }, + { + "epoch": 0.49925727484265386, + "grad_norm": 1.967415690422058, + "learning_rate": 5.029060993592784e-05, + "loss": 0.19596970081329346, + "step": 116290 + }, + { + "epoch": 0.4993002069326739, + "grad_norm": 28.932861328125, + "learning_rate": 5.028629821581021e-05, + "loss": 0.1746343493461609, + "step": 116300 + }, + { + "epoch": 0.4993431390226939, + "grad_norm": 3.073617696762085, + "learning_rate": 5.028198649569259e-05, + "loss": 0.2759540557861328, + "step": 116310 + }, + { + "epoch": 0.49938607111271394, + "grad_norm": 1.3334208726882935, + "learning_rate": 5.027767477557497e-05, + "loss": 0.29316024780273436, + "step": 116320 + }, + { + "epoch": 0.4994290032027339, + "grad_norm": 0.10985347628593445, + "learning_rate": 5.0273363055457345e-05, + "loss": 0.359764838218689, + "step": 116330 + }, + { + "epoch": 0.4994719352927539, + "grad_norm": 0.09878139942884445, + "learning_rate": 5.026905133533972e-05, + "loss": 0.4100006103515625, + "step": 116340 + }, + { + "epoch": 0.49951486738277395, + "grad_norm": 0.025054529309272766, + "learning_rate": 5.02647396152221e-05, + "loss": 0.2300884246826172, + "step": 116350 + }, + { + "epoch": 0.49955779947279394, + "grad_norm": 3.4533677101135254, + "learning_rate": 5.026042789510448e-05, + "loss": 0.38540575504302976, + "step": 116360 + }, + { + "epoch": 0.4996007315628139, + "grad_norm": 0.10880227386951447, + "learning_rate": 5.0256116174986855e-05, + "loss": 0.4489396095275879, + "step": 116370 + }, + { + "epoch": 0.49964366365283397, + "grad_norm": 0.20102038979530334, + "learning_rate": 5.0251804454869225e-05, + "loss": 0.12855820655822753, + "step": 116380 + }, + { + "epoch": 0.49968659574285396, + "grad_norm": 1.3258622884750366, + "learning_rate": 5.02474927347516e-05, + "loss": 0.3025193452835083, + "step": 116390 + }, + { + "epoch": 0.49972952783287394, + "grad_norm": 3.7796823978424072, + "learning_rate": 5.024318101463398e-05, + "loss": 0.1849290609359741, + "step": 116400 + }, + { + "epoch": 0.499772459922894, + "grad_norm": 4.854727745056152, + "learning_rate": 5.023886929451636e-05, + "loss": 0.3261184930801392, + "step": 116410 + }, + { + "epoch": 0.499815392012914, + "grad_norm": 1.9094047546386719, + "learning_rate": 5.0234557574398735e-05, + "loss": 0.2567267656326294, + "step": 116420 + }, + { + "epoch": 0.49985832410293396, + "grad_norm": 1.9529017210006714, + "learning_rate": 5.023024585428111e-05, + "loss": 0.24922068119049073, + "step": 116430 + }, + { + "epoch": 0.499901256192954, + "grad_norm": 2.076061964035034, + "learning_rate": 5.022593413416349e-05, + "loss": 0.3763160467147827, + "step": 116440 + }, + { + "epoch": 0.499944188282974, + "grad_norm": 0.1453094780445099, + "learning_rate": 5.0221622414045867e-05, + "loss": 0.24987459182739258, + "step": 116450 + }, + { + "epoch": 0.499987120372994, + "grad_norm": 0.044320449233055115, + "learning_rate": 5.0217310693928244e-05, + "loss": 0.13236111402511597, + "step": 116460 + }, + { + "epoch": 0.500030052463014, + "grad_norm": 0.0024830172769725323, + "learning_rate": 5.021299897381061e-05, + "loss": 0.2829590320587158, + "step": 116470 + }, + { + "epoch": 0.5000729845530341, + "grad_norm": 0.11648912727832794, + "learning_rate": 5.0208687253692985e-05, + "loss": 0.1768946409225464, + "step": 116480 + }, + { + "epoch": 0.500115916643054, + "grad_norm": 2.029343605041504, + "learning_rate": 5.020437553357536e-05, + "loss": 0.38065204620361326, + "step": 116490 + }, + { + "epoch": 0.500158848733074, + "grad_norm": 0.008621471002697945, + "learning_rate": 5.020006381345774e-05, + "loss": 0.21182801723480224, + "step": 116500 + }, + { + "epoch": 0.5002017808230941, + "grad_norm": 0.057470615953207016, + "learning_rate": 5.019575209334012e-05, + "loss": 0.06660780906677247, + "step": 116510 + }, + { + "epoch": 0.500244712913114, + "grad_norm": 0.17172105610370636, + "learning_rate": 5.01914403732225e-05, + "loss": 0.31643340587615965, + "step": 116520 + }, + { + "epoch": 0.500287645003134, + "grad_norm": 2.378706216812134, + "learning_rate": 5.018712865310488e-05, + "loss": 0.33999631404876707, + "step": 116530 + }, + { + "epoch": 0.5003305770931541, + "grad_norm": 0.3760903477668762, + "learning_rate": 5.0182816932987256e-05, + "loss": 0.12946596145629882, + "step": 116540 + }, + { + "epoch": 0.500373509183174, + "grad_norm": 0.08980166167020798, + "learning_rate": 5.017850521286962e-05, + "loss": 0.036372536420822145, + "step": 116550 + }, + { + "epoch": 0.5004164412731941, + "grad_norm": 0.9005488157272339, + "learning_rate": 5.0174193492752e-05, + "loss": 0.22982125282287597, + "step": 116560 + }, + { + "epoch": 0.5004593733632141, + "grad_norm": 0.0490412674844265, + "learning_rate": 5.0169881772634374e-05, + "loss": 0.17876067161560058, + "step": 116570 + }, + { + "epoch": 0.500502305453234, + "grad_norm": 0.03490692749619484, + "learning_rate": 5.016557005251675e-05, + "loss": 0.23765029907226562, + "step": 116580 + }, + { + "epoch": 0.5005452375432541, + "grad_norm": 1.0722746849060059, + "learning_rate": 5.016125833239913e-05, + "loss": 0.359777045249939, + "step": 116590 + }, + { + "epoch": 0.5005881696332741, + "grad_norm": 4.997632026672363, + "learning_rate": 5.0156946612281506e-05, + "loss": 0.5328747272491455, + "step": 116600 + }, + { + "epoch": 0.5006311017232941, + "grad_norm": 4.145293712615967, + "learning_rate": 5.0152634892163884e-05, + "loss": 0.1871986985206604, + "step": 116610 + }, + { + "epoch": 0.5006740338133141, + "grad_norm": 2.7955899238586426, + "learning_rate": 5.014832317204626e-05, + "loss": 0.3289355993270874, + "step": 116620 + }, + { + "epoch": 0.5007169659033341, + "grad_norm": 0.015839189291000366, + "learning_rate": 5.014401145192863e-05, + "loss": 0.18843262195587157, + "step": 116630 + }, + { + "epoch": 0.5007598979933541, + "grad_norm": 2.8927204608917236, + "learning_rate": 5.013969973181101e-05, + "loss": 0.35386366844177247, + "step": 116640 + }, + { + "epoch": 0.5008028300833741, + "grad_norm": 1.1801815032958984, + "learning_rate": 5.0135388011693386e-05, + "loss": 0.3398331642150879, + "step": 116650 + }, + { + "epoch": 0.5008457621733942, + "grad_norm": 0.26834312081336975, + "learning_rate": 5.0131076291575764e-05, + "loss": 0.16700916290283202, + "step": 116660 + }, + { + "epoch": 0.5008886942634141, + "grad_norm": 0.03399025276303291, + "learning_rate": 5.012676457145814e-05, + "loss": 0.26661252975463867, + "step": 116670 + }, + { + "epoch": 0.5009316263534341, + "grad_norm": 0.6200994253158569, + "learning_rate": 5.012245285134052e-05, + "loss": 0.21167898178100586, + "step": 116680 + }, + { + "epoch": 0.5009745584434542, + "grad_norm": 0.026551874354481697, + "learning_rate": 5.0118141131222896e-05, + "loss": 0.052762043476104734, + "step": 116690 + }, + { + "epoch": 0.5010174905334741, + "grad_norm": 1.6990082263946533, + "learning_rate": 5.011382941110527e-05, + "loss": 0.36727452278137207, + "step": 116700 + }, + { + "epoch": 0.5010604226234942, + "grad_norm": 0.5137987732887268, + "learning_rate": 5.010951769098764e-05, + "loss": 0.16924527883529664, + "step": 116710 + }, + { + "epoch": 0.5011033547135142, + "grad_norm": 2.603691577911377, + "learning_rate": 5.0105205970870014e-05, + "loss": 0.13312125205993652, + "step": 116720 + }, + { + "epoch": 0.5011462868035341, + "grad_norm": 2.065608263015747, + "learning_rate": 5.010089425075239e-05, + "loss": 0.36140103340148927, + "step": 116730 + }, + { + "epoch": 0.5011892188935542, + "grad_norm": 0.2081688642501831, + "learning_rate": 5.0096582530634776e-05, + "loss": 0.07831467390060425, + "step": 116740 + }, + { + "epoch": 0.5012321509835742, + "grad_norm": 2.544560432434082, + "learning_rate": 5.009227081051715e-05, + "loss": 0.35781326293945315, + "step": 116750 + }, + { + "epoch": 0.5012750830735941, + "grad_norm": 0.03773084655404091, + "learning_rate": 5.008795909039953e-05, + "loss": 0.18783485889434814, + "step": 116760 + }, + { + "epoch": 0.5013180151636142, + "grad_norm": 0.04964379593729973, + "learning_rate": 5.008364737028191e-05, + "loss": 0.14863022565841674, + "step": 116770 + }, + { + "epoch": 0.5013609472536342, + "grad_norm": 0.17665715515613556, + "learning_rate": 5.0079335650164285e-05, + "loss": 0.25568058490753176, + "step": 116780 + }, + { + "epoch": 0.5014038793436542, + "grad_norm": 2.9642350673675537, + "learning_rate": 5.007502393004666e-05, + "loss": 0.21143810749053954, + "step": 116790 + }, + { + "epoch": 0.5014468114336742, + "grad_norm": 1.360346794128418, + "learning_rate": 5.0070712209929026e-05, + "loss": 0.12712502479553223, + "step": 116800 + }, + { + "epoch": 0.5014897435236942, + "grad_norm": 1.1762696504592896, + "learning_rate": 5.0066400489811404e-05, + "loss": 0.3075927972793579, + "step": 116810 + }, + { + "epoch": 0.5015326756137142, + "grad_norm": 0.009866449050605297, + "learning_rate": 5.006208876969378e-05, + "loss": 0.3283320665359497, + "step": 116820 + }, + { + "epoch": 0.5015756077037342, + "grad_norm": 1.7027781009674072, + "learning_rate": 5.005777704957616e-05, + "loss": 0.09247349500656128, + "step": 116830 + }, + { + "epoch": 0.5016185397937543, + "grad_norm": 1.6753307580947876, + "learning_rate": 5.0053465329458536e-05, + "loss": 0.05889575481414795, + "step": 116840 + }, + { + "epoch": 0.5016614718837742, + "grad_norm": 3.2303237915039062, + "learning_rate": 5.004915360934091e-05, + "loss": 0.21741180419921874, + "step": 116850 + }, + { + "epoch": 0.5017044039737942, + "grad_norm": 0.03603691607713699, + "learning_rate": 5.004484188922329e-05, + "loss": 0.013999029994010925, + "step": 116860 + }, + { + "epoch": 0.5017473360638143, + "grad_norm": 3.4006402492523193, + "learning_rate": 5.004053016910567e-05, + "loss": 0.14681098461151124, + "step": 116870 + }, + { + "epoch": 0.5017902681538343, + "grad_norm": 0.019151249900460243, + "learning_rate": 5.003621844898804e-05, + "loss": 0.052489012479782104, + "step": 116880 + }, + { + "epoch": 0.5018332002438542, + "grad_norm": 0.04653005301952362, + "learning_rate": 5.0031906728870415e-05, + "loss": 0.2115783929824829, + "step": 116890 + }, + { + "epoch": 0.5018761323338743, + "grad_norm": 0.05448725074529648, + "learning_rate": 5.002759500875279e-05, + "loss": 0.22183005809783934, + "step": 116900 + }, + { + "epoch": 0.5019190644238943, + "grad_norm": 0.02877284586429596, + "learning_rate": 5.002328328863517e-05, + "loss": 0.29645206928253176, + "step": 116910 + }, + { + "epoch": 0.5019619965139143, + "grad_norm": 0.5961487293243408, + "learning_rate": 5.001897156851755e-05, + "loss": 0.12437942028045654, + "step": 116920 + }, + { + "epoch": 0.5020049286039343, + "grad_norm": 0.03502122312784195, + "learning_rate": 5.0014659848399925e-05, + "loss": 0.31128199100494386, + "step": 116930 + }, + { + "epoch": 0.5020478606939544, + "grad_norm": 0.007991598919034004, + "learning_rate": 5.00103481282823e-05, + "loss": 0.36748878955841063, + "step": 116940 + }, + { + "epoch": 0.5020907927839743, + "grad_norm": 1.8723878860473633, + "learning_rate": 5.000603640816468e-05, + "loss": 0.16076483726501464, + "step": 116950 + }, + { + "epoch": 0.5021337248739943, + "grad_norm": 0.004238491412252188, + "learning_rate": 5.000172468804705e-05, + "loss": 0.2704092264175415, + "step": 116960 + }, + { + "epoch": 0.5021766569640144, + "grad_norm": 0.09497665613889694, + "learning_rate": 4.999741296792943e-05, + "loss": 0.00860070288181305, + "step": 116970 + }, + { + "epoch": 0.5022195890540343, + "grad_norm": 0.5038676857948303, + "learning_rate": 4.9993101247811805e-05, + "loss": 0.14574838876724244, + "step": 116980 + }, + { + "epoch": 0.5022625211440543, + "grad_norm": 0.05492497235536575, + "learning_rate": 4.998878952769418e-05, + "loss": 0.31498830318450927, + "step": 116990 + }, + { + "epoch": 0.5023054532340744, + "grad_norm": 0.40970903635025024, + "learning_rate": 4.998447780757656e-05, + "loss": 0.24021148681640625, + "step": 117000 + }, + { + "epoch": 0.5023054532340744, + "eval_loss": 0.41057536005973816, + "eval_runtime": 27.1683, + "eval_samples_per_second": 3.681, + "eval_steps_per_second": 3.681, + "step": 117000 + }, + { + "epoch": 0.5023483853240943, + "grad_norm": 0.010372740216553211, + "learning_rate": 4.998016608745894e-05, + "loss": 0.24350297451019287, + "step": 117010 + }, + { + "epoch": 0.5023913174141144, + "grad_norm": 0.09824706614017487, + "learning_rate": 4.997585436734131e-05, + "loss": 0.22170097827911378, + "step": 117020 + }, + { + "epoch": 0.5024342495041344, + "grad_norm": 0.6410292387008667, + "learning_rate": 4.9971542647223685e-05, + "loss": 0.22455050945281982, + "step": 117030 + }, + { + "epoch": 0.5024771815941543, + "grad_norm": 9.528117179870605, + "learning_rate": 4.996723092710606e-05, + "loss": 0.16655960083007812, + "step": 117040 + }, + { + "epoch": 0.5025201136841744, + "grad_norm": 0.033820074051618576, + "learning_rate": 4.996291920698844e-05, + "loss": 0.2521155118942261, + "step": 117050 + }, + { + "epoch": 0.5025630457741944, + "grad_norm": 0.9928510189056396, + "learning_rate": 4.995860748687081e-05, + "loss": 0.2452853202819824, + "step": 117060 + }, + { + "epoch": 0.5026059778642143, + "grad_norm": 0.0058114430867135525, + "learning_rate": 4.995429576675319e-05, + "loss": 0.05519225597381592, + "step": 117070 + }, + { + "epoch": 0.5026489099542344, + "grad_norm": 0.09580115973949432, + "learning_rate": 4.9949984046635565e-05, + "loss": 0.14227598905563354, + "step": 117080 + }, + { + "epoch": 0.5026918420442544, + "grad_norm": 0.11800723522901535, + "learning_rate": 4.994567232651794e-05, + "loss": 0.0920566439628601, + "step": 117090 + }, + { + "epoch": 0.5027347741342744, + "grad_norm": 4.133057594299316, + "learning_rate": 4.994136060640032e-05, + "loss": 0.22397398948669434, + "step": 117100 + }, + { + "epoch": 0.5027777062242944, + "grad_norm": 0.14109517633914948, + "learning_rate": 4.99370488862827e-05, + "loss": 0.1774275779724121, + "step": 117110 + }, + { + "epoch": 0.5028206383143144, + "grad_norm": 1.8681215047836304, + "learning_rate": 4.9932737166165074e-05, + "loss": 0.37117633819580076, + "step": 117120 + }, + { + "epoch": 0.5028635704043344, + "grad_norm": 0.02314554899930954, + "learning_rate": 4.992842544604745e-05, + "loss": 0.0521328866481781, + "step": 117130 + }, + { + "epoch": 0.5029065024943544, + "grad_norm": 0.8494148850440979, + "learning_rate": 4.992411372592983e-05, + "loss": 0.30360457897186277, + "step": 117140 + }, + { + "epoch": 0.5029494345843745, + "grad_norm": 0.003587879240512848, + "learning_rate": 4.99198020058122e-05, + "loss": 0.06846604943275451, + "step": 117150 + }, + { + "epoch": 0.5029923666743944, + "grad_norm": 24.003463745117188, + "learning_rate": 4.991549028569458e-05, + "loss": 0.16479328870773316, + "step": 117160 + }, + { + "epoch": 0.5030352987644144, + "grad_norm": 15.54176139831543, + "learning_rate": 4.9911178565576954e-05, + "loss": 0.16109709739685057, + "step": 117170 + }, + { + "epoch": 0.5030782308544345, + "grad_norm": 0.03265627101063728, + "learning_rate": 4.990686684545933e-05, + "loss": 0.12660821676254272, + "step": 117180 + }, + { + "epoch": 0.5031211629444544, + "grad_norm": 0.37020328640937805, + "learning_rate": 4.99025551253417e-05, + "loss": 0.13424829244613648, + "step": 117190 + }, + { + "epoch": 0.5031640950344745, + "grad_norm": 0.03218246251344681, + "learning_rate": 4.989824340522408e-05, + "loss": 0.2562605857849121, + "step": 117200 + }, + { + "epoch": 0.5032070271244945, + "grad_norm": 0.22651711106300354, + "learning_rate": 4.9893931685106457e-05, + "loss": 0.13423599004745485, + "step": 117210 + }, + { + "epoch": 0.5032499592145144, + "grad_norm": 1.5119297504425049, + "learning_rate": 4.988961996498884e-05, + "loss": 0.11704769134521484, + "step": 117220 + }, + { + "epoch": 0.5032928913045345, + "grad_norm": 4.174872875213623, + "learning_rate": 4.988530824487121e-05, + "loss": 0.3308922290802002, + "step": 117230 + }, + { + "epoch": 0.5033358233945545, + "grad_norm": 0.8240285515785217, + "learning_rate": 4.988099652475359e-05, + "loss": 0.2682643890380859, + "step": 117240 + }, + { + "epoch": 0.5033787554845744, + "grad_norm": 0.32659050822257996, + "learning_rate": 4.9876684804635966e-05, + "loss": 0.33257737159729006, + "step": 117250 + }, + { + "epoch": 0.5034216875745945, + "grad_norm": 0.6228490471839905, + "learning_rate": 4.987237308451834e-05, + "loss": 0.19845781326293946, + "step": 117260 + }, + { + "epoch": 0.5034646196646145, + "grad_norm": 1.7231818437576294, + "learning_rate": 4.9868061364400714e-05, + "loss": 0.3892753839492798, + "step": 117270 + }, + { + "epoch": 0.5035075517546345, + "grad_norm": 0.023025069385766983, + "learning_rate": 4.986374964428309e-05, + "loss": 0.16233171224594117, + "step": 117280 + }, + { + "epoch": 0.5035504838446545, + "grad_norm": 0.018026838079094887, + "learning_rate": 4.985943792416547e-05, + "loss": 0.16559386253356934, + "step": 117290 + }, + { + "epoch": 0.5035934159346745, + "grad_norm": 0.025355422869324684, + "learning_rate": 4.9855126204047846e-05, + "loss": 0.2693866014480591, + "step": 117300 + }, + { + "epoch": 0.5036363480246946, + "grad_norm": 0.15633586049079895, + "learning_rate": 4.9850814483930216e-05, + "loss": 0.2386932611465454, + "step": 117310 + }, + { + "epoch": 0.5036792801147145, + "grad_norm": 0.18697188794612885, + "learning_rate": 4.9846502763812594e-05, + "loss": 0.09443596601486207, + "step": 117320 + }, + { + "epoch": 0.5037222122047346, + "grad_norm": 1.1669023036956787, + "learning_rate": 4.984219104369498e-05, + "loss": 0.41175551414489747, + "step": 117330 + }, + { + "epoch": 0.5037651442947546, + "grad_norm": 2.8192169666290283, + "learning_rate": 4.9837879323577355e-05, + "loss": 0.16009420156478882, + "step": 117340 + }, + { + "epoch": 0.5038080763847745, + "grad_norm": 1.1283766031265259, + "learning_rate": 4.9833567603459726e-05, + "loss": 0.31158037185668946, + "step": 117350 + }, + { + "epoch": 0.5038510084747946, + "grad_norm": 2.5143420696258545, + "learning_rate": 4.98292558833421e-05, + "loss": 0.38306190967559817, + "step": 117360 + }, + { + "epoch": 0.5038939405648146, + "grad_norm": 0.010936361737549305, + "learning_rate": 4.982494416322448e-05, + "loss": 0.1006517767906189, + "step": 117370 + }, + { + "epoch": 0.5039368726548346, + "grad_norm": 1.180523157119751, + "learning_rate": 4.982063244310686e-05, + "loss": 0.14739004373550416, + "step": 117380 + }, + { + "epoch": 0.5039798047448546, + "grad_norm": 1.164581537246704, + "learning_rate": 4.981632072298923e-05, + "loss": 0.2527391672134399, + "step": 117390 + }, + { + "epoch": 0.5040227368348746, + "grad_norm": 0.006947158835828304, + "learning_rate": 4.9812009002871606e-05, + "loss": 0.1402176260948181, + "step": 117400 + }, + { + "epoch": 0.5040656689248946, + "grad_norm": 0.014355388469994068, + "learning_rate": 4.980769728275398e-05, + "loss": 0.3870328426361084, + "step": 117410 + }, + { + "epoch": 0.5041086010149146, + "grad_norm": 3.5852952003479004, + "learning_rate": 4.980338556263636e-05, + "loss": 0.21521148681640626, + "step": 117420 + }, + { + "epoch": 0.5041515331049347, + "grad_norm": 1.0364006757736206, + "learning_rate": 4.979907384251873e-05, + "loss": 0.1514366865158081, + "step": 117430 + }, + { + "epoch": 0.5041944651949546, + "grad_norm": 0.036176733672618866, + "learning_rate": 4.9794762122401115e-05, + "loss": 0.37396066188812255, + "step": 117440 + }, + { + "epoch": 0.5042373972849746, + "grad_norm": 1.1058789491653442, + "learning_rate": 4.979045040228349e-05, + "loss": 0.30147347450256345, + "step": 117450 + }, + { + "epoch": 0.5042803293749947, + "grad_norm": 0.6830658912658691, + "learning_rate": 4.978613868216587e-05, + "loss": 0.2363212823867798, + "step": 117460 + }, + { + "epoch": 0.5043232614650146, + "grad_norm": 1.5272681713104248, + "learning_rate": 4.978182696204824e-05, + "loss": 0.14829931259155274, + "step": 117470 + }, + { + "epoch": 0.5043661935550346, + "grad_norm": 0.2594696283340454, + "learning_rate": 4.977751524193062e-05, + "loss": 0.19520469903945922, + "step": 117480 + }, + { + "epoch": 0.5044091256450547, + "grad_norm": 0.1625552475452423, + "learning_rate": 4.9773203521812995e-05, + "loss": 0.25294084548950196, + "step": 117490 + }, + { + "epoch": 0.5044520577350746, + "grad_norm": 0.07057518512010574, + "learning_rate": 4.976889180169537e-05, + "loss": 0.2937706232070923, + "step": 117500 + }, + { + "epoch": 0.5044949898250947, + "grad_norm": 2.936040163040161, + "learning_rate": 4.976458008157775e-05, + "loss": 0.3090040683746338, + "step": 117510 + }, + { + "epoch": 0.5045379219151147, + "grad_norm": 0.03770938143134117, + "learning_rate": 4.976026836146012e-05, + "loss": 0.004101923853158951, + "step": 117520 + }, + { + "epoch": 0.5045808540051346, + "grad_norm": 0.001742324442602694, + "learning_rate": 4.97559566413425e-05, + "loss": 0.21159942150115968, + "step": 117530 + }, + { + "epoch": 0.5046237860951547, + "grad_norm": 3.898867607116699, + "learning_rate": 4.9751644921224875e-05, + "loss": 0.29637346267700193, + "step": 117540 + }, + { + "epoch": 0.5046667181851747, + "grad_norm": 1.8550221920013428, + "learning_rate": 4.974733320110725e-05, + "loss": 0.3649588108062744, + "step": 117550 + }, + { + "epoch": 0.5047096502751947, + "grad_norm": 2.0680346488952637, + "learning_rate": 4.974302148098963e-05, + "loss": 0.3192767143249512, + "step": 117560 + }, + { + "epoch": 0.5047525823652147, + "grad_norm": 0.9255982637405396, + "learning_rate": 4.973870976087201e-05, + "loss": 0.31056628227233884, + "step": 117570 + }, + { + "epoch": 0.5047955144552347, + "grad_norm": 1.4304261207580566, + "learning_rate": 4.9734398040754384e-05, + "loss": 0.30201447010040283, + "step": 117580 + }, + { + "epoch": 0.5048384465452547, + "grad_norm": 0.030385779216885567, + "learning_rate": 4.973008632063676e-05, + "loss": 0.2219762086868286, + "step": 117590 + }, + { + "epoch": 0.5048813786352747, + "grad_norm": 0.004098663106560707, + "learning_rate": 4.972577460051913e-05, + "loss": 0.14338167905807495, + "step": 117600 + }, + { + "epoch": 0.5049243107252948, + "grad_norm": 0.2958376705646515, + "learning_rate": 4.972146288040151e-05, + "loss": 0.09196502566337586, + "step": 117610 + }, + { + "epoch": 0.5049672428153147, + "grad_norm": 1.395702600479126, + "learning_rate": 4.971715116028389e-05, + "loss": 0.24472627639770508, + "step": 117620 + }, + { + "epoch": 0.5050101749053347, + "grad_norm": 8.358769416809082, + "learning_rate": 4.9712839440166264e-05, + "loss": 0.28646397590637207, + "step": 117630 + }, + { + "epoch": 0.5050531069953548, + "grad_norm": 0.11586333066225052, + "learning_rate": 4.9708527720048635e-05, + "loss": 0.26342833042144775, + "step": 117640 + }, + { + "epoch": 0.5050960390853747, + "grad_norm": 0.17295314371585846, + "learning_rate": 4.970421599993101e-05, + "loss": 0.29183268547058105, + "step": 117650 + }, + { + "epoch": 0.5051389711753947, + "grad_norm": 0.013368518091738224, + "learning_rate": 4.969990427981339e-05, + "loss": 0.16364799737930297, + "step": 117660 + }, + { + "epoch": 0.5051819032654148, + "grad_norm": 1.6267229318618774, + "learning_rate": 4.969559255969577e-05, + "loss": 0.050443482398986814, + "step": 117670 + }, + { + "epoch": 0.5052248353554347, + "grad_norm": 0.510202944278717, + "learning_rate": 4.9691280839578144e-05, + "loss": 0.122452712059021, + "step": 117680 + }, + { + "epoch": 0.5052677674454548, + "grad_norm": 6.765218257904053, + "learning_rate": 4.968696911946052e-05, + "loss": 0.20582473278045654, + "step": 117690 + }, + { + "epoch": 0.5053106995354748, + "grad_norm": 1.7956032752990723, + "learning_rate": 4.96826573993429e-05, + "loss": 0.18374691009521485, + "step": 117700 + }, + { + "epoch": 0.5053536316254947, + "grad_norm": 2.8887641429901123, + "learning_rate": 4.9678345679225276e-05, + "loss": 0.3921834945678711, + "step": 117710 + }, + { + "epoch": 0.5053965637155148, + "grad_norm": 0.5638045072555542, + "learning_rate": 4.967403395910765e-05, + "loss": 0.40248904228210447, + "step": 117720 + }, + { + "epoch": 0.5054394958055348, + "grad_norm": 0.07434771209955215, + "learning_rate": 4.9669722238990024e-05, + "loss": 0.39189608097076417, + "step": 117730 + }, + { + "epoch": 0.5054824278955549, + "grad_norm": 0.026078205555677414, + "learning_rate": 4.96654105188724e-05, + "loss": 0.1321608304977417, + "step": 117740 + }, + { + "epoch": 0.5055253599855748, + "grad_norm": 0.03281189501285553, + "learning_rate": 4.966109879875478e-05, + "loss": 0.11588050127029419, + "step": 117750 + }, + { + "epoch": 0.5055682920755948, + "grad_norm": 0.013210115022957325, + "learning_rate": 4.965678707863715e-05, + "loss": 0.12477741241455079, + "step": 117760 + }, + { + "epoch": 0.5056112241656149, + "grad_norm": 3.3516409397125244, + "learning_rate": 4.965247535851953e-05, + "loss": 0.04821877479553223, + "step": 117770 + }, + { + "epoch": 0.5056541562556348, + "grad_norm": 0.0025698766112327576, + "learning_rate": 4.9648163638401904e-05, + "loss": 0.1576859474182129, + "step": 117780 + }, + { + "epoch": 0.5056970883456549, + "grad_norm": 0.23797300457954407, + "learning_rate": 4.964385191828428e-05, + "loss": 0.26349759101867676, + "step": 117790 + }, + { + "epoch": 0.5057400204356749, + "grad_norm": 0.005504325032234192, + "learning_rate": 4.963954019816666e-05, + "loss": 0.06040756106376648, + "step": 117800 + }, + { + "epoch": 0.5057829525256948, + "grad_norm": 0.004711473826318979, + "learning_rate": 4.9635228478049036e-05, + "loss": 0.16386046409606933, + "step": 117810 + }, + { + "epoch": 0.5058258846157149, + "grad_norm": 0.019762564450502396, + "learning_rate": 4.9630916757931414e-05, + "loss": 0.1610340356826782, + "step": 117820 + }, + { + "epoch": 0.5058688167057349, + "grad_norm": 0.07730504125356674, + "learning_rate": 4.962660503781379e-05, + "loss": 0.29164934158325195, + "step": 117830 + }, + { + "epoch": 0.5059117487957548, + "grad_norm": 0.011327949352562428, + "learning_rate": 4.962229331769617e-05, + "loss": 0.10377846956253052, + "step": 117840 + }, + { + "epoch": 0.5059546808857749, + "grad_norm": 0.00887768529355526, + "learning_rate": 4.961798159757854e-05, + "loss": 0.2585068941116333, + "step": 117850 + }, + { + "epoch": 0.5059976129757949, + "grad_norm": 0.10135412216186523, + "learning_rate": 4.9613669877460916e-05, + "loss": 0.17118458747863768, + "step": 117860 + }, + { + "epoch": 0.5060405450658149, + "grad_norm": 0.12075807899236679, + "learning_rate": 4.9609358157343293e-05, + "loss": 0.16684346199035643, + "step": 117870 + }, + { + "epoch": 0.5060834771558349, + "grad_norm": 0.00484396331012249, + "learning_rate": 4.960504643722567e-05, + "loss": 0.17456436157226562, + "step": 117880 + }, + { + "epoch": 0.506126409245855, + "grad_norm": 0.04089738428592682, + "learning_rate": 4.960073471710804e-05, + "loss": 0.21289370059967042, + "step": 117890 + }, + { + "epoch": 0.5061693413358749, + "grad_norm": 0.032251860946416855, + "learning_rate": 4.959642299699042e-05, + "loss": 0.1787124514579773, + "step": 117900 + }, + { + "epoch": 0.5062122734258949, + "grad_norm": 0.22456131875514984, + "learning_rate": 4.9592111276872796e-05, + "loss": 0.2632343530654907, + "step": 117910 + }, + { + "epoch": 0.506255205515915, + "grad_norm": 4.649175643920898, + "learning_rate": 4.958779955675517e-05, + "loss": 0.2298372507095337, + "step": 117920 + }, + { + "epoch": 0.5062981376059349, + "grad_norm": 0.06047212332487106, + "learning_rate": 4.958348783663755e-05, + "loss": 0.20818607807159423, + "step": 117930 + }, + { + "epoch": 0.5063410696959549, + "grad_norm": 0.00024737833882682025, + "learning_rate": 4.957917611651993e-05, + "loss": 0.17201005220413207, + "step": 117940 + }, + { + "epoch": 0.506384001785975, + "grad_norm": 3.057046413421631, + "learning_rate": 4.9574864396402305e-05, + "loss": 0.35734987258911133, + "step": 117950 + }, + { + "epoch": 0.5064269338759949, + "grad_norm": 0.15315313637256622, + "learning_rate": 4.957055267628468e-05, + "loss": 0.09107869863510132, + "step": 117960 + }, + { + "epoch": 0.506469865966015, + "grad_norm": 0.0059364596381783485, + "learning_rate": 4.956624095616705e-05, + "loss": 0.1393264651298523, + "step": 117970 + }, + { + "epoch": 0.506512798056035, + "grad_norm": 13.115792274475098, + "learning_rate": 4.956192923604943e-05, + "loss": 0.3588475942611694, + "step": 117980 + }, + { + "epoch": 0.5065557301460549, + "grad_norm": 0.02436887100338936, + "learning_rate": 4.955761751593181e-05, + "loss": 0.005839229002594948, + "step": 117990 + }, + { + "epoch": 0.506598662236075, + "grad_norm": 0.24447183310985565, + "learning_rate": 4.9553305795814185e-05, + "loss": 0.3777074575424194, + "step": 118000 + }, + { + "epoch": 0.506598662236075, + "eval_loss": 0.403300017118454, + "eval_runtime": 27.2572, + "eval_samples_per_second": 3.669, + "eval_steps_per_second": 3.669, + "step": 118000 + }, + { + "epoch": 0.506641594326095, + "grad_norm": 0.008575129322707653, + "learning_rate": 4.9548994075696556e-05, + "loss": 0.1603256344795227, + "step": 118010 + }, + { + "epoch": 0.5066845264161149, + "grad_norm": 0.004096082877367735, + "learning_rate": 4.954468235557893e-05, + "loss": 0.31450352668762205, + "step": 118020 + }, + { + "epoch": 0.506727458506135, + "grad_norm": 0.07715465873479843, + "learning_rate": 4.954037063546131e-05, + "loss": 0.11344932317733765, + "step": 118030 + }, + { + "epoch": 0.506770390596155, + "grad_norm": 0.030346719548106194, + "learning_rate": 4.9536058915343695e-05, + "loss": 0.1910154104232788, + "step": 118040 + }, + { + "epoch": 0.506813322686175, + "grad_norm": 2.2040464878082275, + "learning_rate": 4.9531747195226065e-05, + "loss": 0.12421665191650391, + "step": 118050 + }, + { + "epoch": 0.506856254776195, + "grad_norm": 0.10436452925205231, + "learning_rate": 4.952743547510844e-05, + "loss": 0.18091598749160767, + "step": 118060 + }, + { + "epoch": 0.506899186866215, + "grad_norm": 0.2847166061401367, + "learning_rate": 4.952312375499082e-05, + "loss": 0.3907632827758789, + "step": 118070 + }, + { + "epoch": 0.506942118956235, + "grad_norm": 0.19107811152935028, + "learning_rate": 4.95188120348732e-05, + "loss": 0.16047124862670897, + "step": 118080 + }, + { + "epoch": 0.506985051046255, + "grad_norm": 0.03470180928707123, + "learning_rate": 4.951450031475557e-05, + "loss": 0.09653306007385254, + "step": 118090 + }, + { + "epoch": 0.5070279831362751, + "grad_norm": 0.01914292760193348, + "learning_rate": 4.9510188594637945e-05, + "loss": 0.1548427700996399, + "step": 118100 + }, + { + "epoch": 0.507070915226295, + "grad_norm": 4.845828533172607, + "learning_rate": 4.950587687452032e-05, + "loss": 0.23194398880004882, + "step": 118110 + }, + { + "epoch": 0.507113847316315, + "grad_norm": 0.6345359086990356, + "learning_rate": 4.95015651544027e-05, + "loss": 0.14568690061569214, + "step": 118120 + }, + { + "epoch": 0.5071567794063351, + "grad_norm": 0.39725396037101746, + "learning_rate": 4.949725343428507e-05, + "loss": 0.199809730052948, + "step": 118130 + }, + { + "epoch": 0.507199711496355, + "grad_norm": 0.0025165737606585026, + "learning_rate": 4.949294171416745e-05, + "loss": 0.19158271551132203, + "step": 118140 + }, + { + "epoch": 0.507242643586375, + "grad_norm": 0.005341788753867149, + "learning_rate": 4.948862999404983e-05, + "loss": 0.11959943771362305, + "step": 118150 + }, + { + "epoch": 0.5072855756763951, + "grad_norm": 0.16199614107608795, + "learning_rate": 4.948431827393221e-05, + "loss": 0.08734560012817383, + "step": 118160 + }, + { + "epoch": 0.5073285077664151, + "grad_norm": 0.012718594633042812, + "learning_rate": 4.948000655381458e-05, + "loss": 0.19825737476348876, + "step": 118170 + }, + { + "epoch": 0.5073714398564351, + "grad_norm": 0.008708270266652107, + "learning_rate": 4.947569483369696e-05, + "loss": 0.3302353620529175, + "step": 118180 + }, + { + "epoch": 0.5074143719464551, + "grad_norm": 0.02464340068399906, + "learning_rate": 4.9471383113579335e-05, + "loss": 0.21059913635253907, + "step": 118190 + }, + { + "epoch": 0.5074573040364752, + "grad_norm": 1.7081856727600098, + "learning_rate": 4.946707139346171e-05, + "loss": 0.3520483493804932, + "step": 118200 + }, + { + "epoch": 0.5075002361264951, + "grad_norm": 2.243417739868164, + "learning_rate": 4.946275967334409e-05, + "loss": 0.10270900726318359, + "step": 118210 + }, + { + "epoch": 0.5075431682165151, + "grad_norm": 0.32691332697868347, + "learning_rate": 4.945844795322646e-05, + "loss": 0.22291224002838134, + "step": 118220 + }, + { + "epoch": 0.5075861003065352, + "grad_norm": 0.038161855190992355, + "learning_rate": 4.945413623310884e-05, + "loss": 0.340277099609375, + "step": 118230 + }, + { + "epoch": 0.5076290323965551, + "grad_norm": 0.5222945213317871, + "learning_rate": 4.9449824512991214e-05, + "loss": 0.1506492018699646, + "step": 118240 + }, + { + "epoch": 0.5076719644865751, + "grad_norm": 0.043260689824819565, + "learning_rate": 4.944551279287359e-05, + "loss": 0.10362763404846191, + "step": 118250 + }, + { + "epoch": 0.5077148965765952, + "grad_norm": 0.003030435647815466, + "learning_rate": 4.944120107275597e-05, + "loss": 0.1216086745262146, + "step": 118260 + }, + { + "epoch": 0.5077578286666151, + "grad_norm": 0.0064033265225589275, + "learning_rate": 4.9436889352638347e-05, + "loss": 0.0058049742132425305, + "step": 118270 + }, + { + "epoch": 0.5078007607566352, + "grad_norm": 0.030727287754416466, + "learning_rate": 4.9432577632520724e-05, + "loss": 0.03154313564300537, + "step": 118280 + }, + { + "epoch": 0.5078436928466552, + "grad_norm": 0.0021809639874845743, + "learning_rate": 4.94282659124031e-05, + "loss": 0.3597226142883301, + "step": 118290 + }, + { + "epoch": 0.5078866249366751, + "grad_norm": 1.5678521394729614, + "learning_rate": 4.942395419228547e-05, + "loss": 0.09091430306434631, + "step": 118300 + }, + { + "epoch": 0.5079295570266952, + "grad_norm": 0.012192213907837868, + "learning_rate": 4.941964247216785e-05, + "loss": 0.15386734008789063, + "step": 118310 + }, + { + "epoch": 0.5079724891167152, + "grad_norm": 0.06318014115095139, + "learning_rate": 4.9415330752050226e-05, + "loss": 0.27775509357452394, + "step": 118320 + }, + { + "epoch": 0.5080154212067352, + "grad_norm": 0.0260726660490036, + "learning_rate": 4.9411019031932604e-05, + "loss": 0.2071290969848633, + "step": 118330 + }, + { + "epoch": 0.5080583532967552, + "grad_norm": 0.01756739243865013, + "learning_rate": 4.9406707311814974e-05, + "loss": 0.18228111267089844, + "step": 118340 + }, + { + "epoch": 0.5081012853867752, + "grad_norm": 2.401149034500122, + "learning_rate": 4.940239559169735e-05, + "loss": 0.06631351113319398, + "step": 118350 + }, + { + "epoch": 0.5081442174767952, + "grad_norm": 0.49134570360183716, + "learning_rate": 4.939808387157973e-05, + "loss": 0.3181680917739868, + "step": 118360 + }, + { + "epoch": 0.5081871495668152, + "grad_norm": 0.1767713576555252, + "learning_rate": 4.9393772151462106e-05, + "loss": 0.3349648714065552, + "step": 118370 + }, + { + "epoch": 0.5082300816568353, + "grad_norm": 0.002477077068760991, + "learning_rate": 4.9389460431344484e-05, + "loss": 0.3104588270187378, + "step": 118380 + }, + { + "epoch": 0.5082730137468552, + "grad_norm": 1.2304719686508179, + "learning_rate": 4.938514871122686e-05, + "loss": 0.18686811923980712, + "step": 118390 + }, + { + "epoch": 0.5083159458368752, + "grad_norm": 1.6199240684509277, + "learning_rate": 4.938083699110924e-05, + "loss": 0.6678519725799561, + "step": 118400 + }, + { + "epoch": 0.5083588779268953, + "grad_norm": 0.006245411932468414, + "learning_rate": 4.9376525270991616e-05, + "loss": 0.13646771907806396, + "step": 118410 + }, + { + "epoch": 0.5084018100169152, + "grad_norm": 1.9105803966522217, + "learning_rate": 4.9372213550873986e-05, + "loss": 0.15426579713821412, + "step": 118420 + }, + { + "epoch": 0.5084447421069352, + "grad_norm": 0.1612144559621811, + "learning_rate": 4.9367901830756364e-05, + "loss": 0.06480223536491395, + "step": 118430 + }, + { + "epoch": 0.5084876741969553, + "grad_norm": 0.00061203254153952, + "learning_rate": 4.936359011063874e-05, + "loss": 0.21237497329711913, + "step": 118440 + }, + { + "epoch": 0.5085306062869752, + "grad_norm": 18.185203552246094, + "learning_rate": 4.935927839052112e-05, + "loss": 0.5285263538360596, + "step": 118450 + }, + { + "epoch": 0.5085735383769953, + "grad_norm": 0.09754689037799835, + "learning_rate": 4.935496667040349e-05, + "loss": 0.3198472261428833, + "step": 118460 + }, + { + "epoch": 0.5086164704670153, + "grad_norm": 0.00233909348025918, + "learning_rate": 4.9350654950285866e-05, + "loss": 0.12812756299972533, + "step": 118470 + }, + { + "epoch": 0.5086594025570352, + "grad_norm": 0.07711761444807053, + "learning_rate": 4.9346343230168244e-05, + "loss": 0.30146214962005613, + "step": 118480 + }, + { + "epoch": 0.5087023346470553, + "grad_norm": 1.0213673114776611, + "learning_rate": 4.934203151005062e-05, + "loss": 0.17265766859054565, + "step": 118490 + }, + { + "epoch": 0.5087452667370753, + "grad_norm": 3.939648151397705, + "learning_rate": 4.9337719789933e-05, + "loss": 0.10305318832397461, + "step": 118500 + }, + { + "epoch": 0.5087881988270953, + "grad_norm": 0.01822766847908497, + "learning_rate": 4.9333408069815376e-05, + "loss": 0.22157607078552247, + "step": 118510 + }, + { + "epoch": 0.5088311309171153, + "grad_norm": 0.013578972779214382, + "learning_rate": 4.932909634969775e-05, + "loss": 0.03819190561771393, + "step": 118520 + }, + { + "epoch": 0.5088740630071353, + "grad_norm": 3.911149263381958, + "learning_rate": 4.932478462958013e-05, + "loss": 0.10772855281829834, + "step": 118530 + }, + { + "epoch": 0.5089169950971553, + "grad_norm": 0.11129100620746613, + "learning_rate": 4.93204729094625e-05, + "loss": 0.021247430145740508, + "step": 118540 + }, + { + "epoch": 0.5089599271871753, + "grad_norm": 0.060637425631284714, + "learning_rate": 4.931616118934488e-05, + "loss": 0.19166574478149415, + "step": 118550 + }, + { + "epoch": 0.5090028592771954, + "grad_norm": 0.006027820520102978, + "learning_rate": 4.9311849469227256e-05, + "loss": 0.09662819504737855, + "step": 118560 + }, + { + "epoch": 0.5090457913672153, + "grad_norm": 0.03252384066581726, + "learning_rate": 4.930753774910963e-05, + "loss": 0.17281004190444946, + "step": 118570 + }, + { + "epoch": 0.5090887234572353, + "grad_norm": 0.010324299335479736, + "learning_rate": 4.930322602899201e-05, + "loss": 0.22369747161865233, + "step": 118580 + }, + { + "epoch": 0.5091316555472554, + "grad_norm": 0.9110094904899597, + "learning_rate": 4.929891430887438e-05, + "loss": 0.1740880250930786, + "step": 118590 + }, + { + "epoch": 0.5091745876372754, + "grad_norm": 1.0386021137237549, + "learning_rate": 4.929460258875676e-05, + "loss": 0.19195722341537474, + "step": 118600 + }, + { + "epoch": 0.5092175197272953, + "grad_norm": 1.992875337600708, + "learning_rate": 4.9290290868639136e-05, + "loss": 0.31609842777252195, + "step": 118610 + }, + { + "epoch": 0.5092604518173154, + "grad_norm": 14.978772163391113, + "learning_rate": 4.928597914852151e-05, + "loss": 0.34784390926361086, + "step": 118620 + }, + { + "epoch": 0.5093033839073354, + "grad_norm": 0.0040539707988500595, + "learning_rate": 4.928166742840389e-05, + "loss": 0.2548795223236084, + "step": 118630 + }, + { + "epoch": 0.5093463159973554, + "grad_norm": 0.03759922459721565, + "learning_rate": 4.927735570828627e-05, + "loss": 0.13416372537612914, + "step": 118640 + }, + { + "epoch": 0.5093892480873754, + "grad_norm": 1.1706840991973877, + "learning_rate": 4.9273043988168645e-05, + "loss": 0.28877017498016355, + "step": 118650 + }, + { + "epoch": 0.5094321801773954, + "grad_norm": 0.03230566531419754, + "learning_rate": 4.926873226805102e-05, + "loss": 0.16305909156799317, + "step": 118660 + }, + { + "epoch": 0.5094751122674154, + "grad_norm": 0.005947391968220472, + "learning_rate": 4.926442054793339e-05, + "loss": 0.1977899193763733, + "step": 118670 + }, + { + "epoch": 0.5095180443574354, + "grad_norm": 0.025694016367197037, + "learning_rate": 4.926010882781577e-05, + "loss": 0.3352708339691162, + "step": 118680 + }, + { + "epoch": 0.5095609764474555, + "grad_norm": 0.11599358171224594, + "learning_rate": 4.925579710769815e-05, + "loss": 0.1837789535522461, + "step": 118690 + }, + { + "epoch": 0.5096039085374754, + "grad_norm": 0.008364609442651272, + "learning_rate": 4.9251485387580525e-05, + "loss": 0.25064802169799805, + "step": 118700 + }, + { + "epoch": 0.5096468406274954, + "grad_norm": 1.6999173164367676, + "learning_rate": 4.9247173667462895e-05, + "loss": 0.16257811784744264, + "step": 118710 + }, + { + "epoch": 0.5096897727175155, + "grad_norm": 1.4505658149719238, + "learning_rate": 4.924286194734527e-05, + "loss": 0.22113513946533203, + "step": 118720 + }, + { + "epoch": 0.5097327048075354, + "grad_norm": 1.9441550970077515, + "learning_rate": 4.923855022722765e-05, + "loss": 0.28859150409698486, + "step": 118730 + }, + { + "epoch": 0.5097756368975555, + "grad_norm": 0.007706368342041969, + "learning_rate": 4.9234238507110034e-05, + "loss": 0.1654113531112671, + "step": 118740 + }, + { + "epoch": 0.5098185689875755, + "grad_norm": 0.0734504833817482, + "learning_rate": 4.9229926786992405e-05, + "loss": 0.21147520542144777, + "step": 118750 + }, + { + "epoch": 0.5098615010775954, + "grad_norm": 0.9500401616096497, + "learning_rate": 4.922561506687478e-05, + "loss": 0.12635742425918578, + "step": 118760 + }, + { + "epoch": 0.5099044331676155, + "grad_norm": 1.7449133396148682, + "learning_rate": 4.922130334675716e-05, + "loss": 0.2086487293243408, + "step": 118770 + }, + { + "epoch": 0.5099473652576355, + "grad_norm": 0.0021773355547338724, + "learning_rate": 4.921699162663954e-05, + "loss": 0.4723085880279541, + "step": 118780 + }, + { + "epoch": 0.5099902973476554, + "grad_norm": 0.013304928317666054, + "learning_rate": 4.921267990652191e-05, + "loss": 0.24113748073577881, + "step": 118790 + }, + { + "epoch": 0.5100332294376755, + "grad_norm": 1.3635761737823486, + "learning_rate": 4.9208368186404285e-05, + "loss": 0.2260056495666504, + "step": 118800 + }, + { + "epoch": 0.5100761615276955, + "grad_norm": 1.5315487384796143, + "learning_rate": 4.920405646628666e-05, + "loss": 0.19219486713409423, + "step": 118810 + }, + { + "epoch": 0.5101190936177155, + "grad_norm": 1.452343225479126, + "learning_rate": 4.919974474616904e-05, + "loss": 0.24570739269256592, + "step": 118820 + }, + { + "epoch": 0.5101620257077355, + "grad_norm": 0.014296891167759895, + "learning_rate": 4.919543302605141e-05, + "loss": 0.1545376181602478, + "step": 118830 + }, + { + "epoch": 0.5102049577977555, + "grad_norm": 0.02429923228919506, + "learning_rate": 4.919112130593379e-05, + "loss": 0.30917627811431886, + "step": 118840 + }, + { + "epoch": 0.5102478898877755, + "grad_norm": 2.3836729526519775, + "learning_rate": 4.918680958581617e-05, + "loss": 0.2750249862670898, + "step": 118850 + }, + { + "epoch": 0.5102908219777955, + "grad_norm": 0.08615285903215408, + "learning_rate": 4.918249786569855e-05, + "loss": 0.0027405740693211555, + "step": 118860 + }, + { + "epoch": 0.5103337540678156, + "grad_norm": 2.3817460536956787, + "learning_rate": 4.917818614558092e-05, + "loss": 0.17307817935943604, + "step": 118870 + }, + { + "epoch": 0.5103766861578355, + "grad_norm": 0.009556726552546024, + "learning_rate": 4.91738744254633e-05, + "loss": 0.13647642135620117, + "step": 118880 + }, + { + "epoch": 0.5104196182478555, + "grad_norm": 0.01113821566104889, + "learning_rate": 4.9169562705345674e-05, + "loss": 0.21420552730560302, + "step": 118890 + }, + { + "epoch": 0.5104625503378756, + "grad_norm": 0.1573408842086792, + "learning_rate": 4.916525098522805e-05, + "loss": 0.08282997012138367, + "step": 118900 + }, + { + "epoch": 0.5105054824278955, + "grad_norm": 0.009088823571801186, + "learning_rate": 4.916093926511042e-05, + "loss": 0.4955763339996338, + "step": 118910 + }, + { + "epoch": 0.5105484145179155, + "grad_norm": 3.0080113410949707, + "learning_rate": 4.91566275449928e-05, + "loss": 0.18253889083862304, + "step": 118920 + }, + { + "epoch": 0.5105913466079356, + "grad_norm": 0.040299657732248306, + "learning_rate": 4.9152315824875177e-05, + "loss": 0.13436200618743896, + "step": 118930 + }, + { + "epoch": 0.5106342786979555, + "grad_norm": 2.8525309562683105, + "learning_rate": 4.9148004104757554e-05, + "loss": 0.15410553216934203, + "step": 118940 + }, + { + "epoch": 0.5106772107879756, + "grad_norm": 2.061488151550293, + "learning_rate": 4.914369238463993e-05, + "loss": 0.2232752561569214, + "step": 118950 + }, + { + "epoch": 0.5107201428779956, + "grad_norm": 0.007045496255159378, + "learning_rate": 4.913938066452231e-05, + "loss": 0.15050796270370484, + "step": 118960 + }, + { + "epoch": 0.5107630749680155, + "grad_norm": 1.3432217836380005, + "learning_rate": 4.9135068944404686e-05, + "loss": 0.4139756202697754, + "step": 118970 + }, + { + "epoch": 0.5108060070580356, + "grad_norm": 3.406526803970337, + "learning_rate": 4.913075722428706e-05, + "loss": 0.17657510042190552, + "step": 118980 + }, + { + "epoch": 0.5108489391480556, + "grad_norm": 0.16910117864608765, + "learning_rate": 4.912644550416944e-05, + "loss": 0.30909993648529055, + "step": 118990 + }, + { + "epoch": 0.5108918712380756, + "grad_norm": 0.06404928863048553, + "learning_rate": 4.912213378405181e-05, + "loss": 0.11059070825576782, + "step": 119000 + }, + { + "epoch": 0.5108918712380756, + "eval_loss": 0.40124770998954773, + "eval_runtime": 27.1649, + "eval_samples_per_second": 3.681, + "eval_steps_per_second": 3.681, + "step": 119000 + }, + { + "epoch": 0.5109348033280956, + "grad_norm": 0.006009700242429972, + "learning_rate": 4.911782206393419e-05, + "loss": 0.35542969703674315, + "step": 119010 + }, + { + "epoch": 0.5109777354181156, + "grad_norm": 4.224801540374756, + "learning_rate": 4.9113510343816566e-05, + "loss": 0.11748298406600952, + "step": 119020 + }, + { + "epoch": 0.5110206675081357, + "grad_norm": 2.7295689582824707, + "learning_rate": 4.910919862369894e-05, + "loss": 0.2602656602859497, + "step": 119030 + }, + { + "epoch": 0.5110635995981556, + "grad_norm": 0.4950346052646637, + "learning_rate": 4.9104886903581314e-05, + "loss": 0.16591581106185913, + "step": 119040 + }, + { + "epoch": 0.5111065316881757, + "grad_norm": 0.09243180602788925, + "learning_rate": 4.910057518346369e-05, + "loss": 0.09594988822937012, + "step": 119050 + }, + { + "epoch": 0.5111494637781957, + "grad_norm": 0.20909126102924347, + "learning_rate": 4.909626346334607e-05, + "loss": 0.21844382286071778, + "step": 119060 + }, + { + "epoch": 0.5111923958682156, + "grad_norm": 0.017204085364937782, + "learning_rate": 4.9091951743228446e-05, + "loss": 0.035818496346473695, + "step": 119070 + }, + { + "epoch": 0.5112353279582357, + "grad_norm": 0.016038501635193825, + "learning_rate": 4.908764002311082e-05, + "loss": 0.15340156555175782, + "step": 119080 + }, + { + "epoch": 0.5112782600482557, + "grad_norm": 0.12767556309700012, + "learning_rate": 4.90833283029932e-05, + "loss": 0.1794809341430664, + "step": 119090 + }, + { + "epoch": 0.5113211921382756, + "grad_norm": 1.6308916807174683, + "learning_rate": 4.907901658287558e-05, + "loss": 0.28574433326721194, + "step": 119100 + }, + { + "epoch": 0.5113641242282957, + "grad_norm": 0.07135585695505142, + "learning_rate": 4.9074704862757955e-05, + "loss": 0.10939698219299317, + "step": 119110 + }, + { + "epoch": 0.5114070563183157, + "grad_norm": 0.0036214394494891167, + "learning_rate": 4.9070393142640326e-05, + "loss": 0.02490311712026596, + "step": 119120 + }, + { + "epoch": 0.5114499884083357, + "grad_norm": 0.07109871506690979, + "learning_rate": 4.90660814225227e-05, + "loss": 0.22565762996673583, + "step": 119130 + }, + { + "epoch": 0.5114929204983557, + "grad_norm": 0.05331380292773247, + "learning_rate": 4.906176970240508e-05, + "loss": 0.05728686451911926, + "step": 119140 + }, + { + "epoch": 0.5115358525883758, + "grad_norm": 2.1476547718048096, + "learning_rate": 4.905745798228746e-05, + "loss": 0.18775432109832763, + "step": 119150 + }, + { + "epoch": 0.5115787846783957, + "grad_norm": 0.019761990755796432, + "learning_rate": 4.905314626216983e-05, + "loss": 0.07299734950065613, + "step": 119160 + }, + { + "epoch": 0.5116217167684157, + "grad_norm": 6.146413803100586, + "learning_rate": 4.9048834542052206e-05, + "loss": 0.331719183921814, + "step": 119170 + }, + { + "epoch": 0.5116646488584358, + "grad_norm": 0.03874233737587929, + "learning_rate": 4.904452282193458e-05, + "loss": 0.17403292655944824, + "step": 119180 + }, + { + "epoch": 0.5117075809484557, + "grad_norm": 0.5045718550682068, + "learning_rate": 4.904021110181696e-05, + "loss": 0.14510732889175415, + "step": 119190 + }, + { + "epoch": 0.5117505130384757, + "grad_norm": 1.7479007244110107, + "learning_rate": 4.903589938169934e-05, + "loss": 0.4036064147949219, + "step": 119200 + }, + { + "epoch": 0.5117934451284958, + "grad_norm": 0.01966957002878189, + "learning_rate": 4.9031587661581715e-05, + "loss": 0.28596360683441163, + "step": 119210 + }, + { + "epoch": 0.5118363772185157, + "grad_norm": 0.0067664021626114845, + "learning_rate": 4.902727594146409e-05, + "loss": 0.1794750690460205, + "step": 119220 + }, + { + "epoch": 0.5118793093085358, + "grad_norm": 0.01229294016957283, + "learning_rate": 4.902296422134647e-05, + "loss": 0.26773154735565186, + "step": 119230 + }, + { + "epoch": 0.5119222413985558, + "grad_norm": 1.3651846647262573, + "learning_rate": 4.901865250122884e-05, + "loss": 0.25625336170196533, + "step": 119240 + }, + { + "epoch": 0.5119651734885757, + "grad_norm": 0.006748523097485304, + "learning_rate": 4.901434078111122e-05, + "loss": 0.21946640014648439, + "step": 119250 + }, + { + "epoch": 0.5120081055785958, + "grad_norm": 7.10434627532959, + "learning_rate": 4.9010029060993595e-05, + "loss": 0.1346738815307617, + "step": 119260 + }, + { + "epoch": 0.5120510376686158, + "grad_norm": 0.001597086084075272, + "learning_rate": 4.900571734087597e-05, + "loss": 0.3982970714569092, + "step": 119270 + }, + { + "epoch": 0.5120939697586357, + "grad_norm": 6.853429794311523, + "learning_rate": 4.900140562075834e-05, + "loss": 0.37209444046020507, + "step": 119280 + }, + { + "epoch": 0.5121369018486558, + "grad_norm": 0.052766378968954086, + "learning_rate": 4.899709390064072e-05, + "loss": 0.150531005859375, + "step": 119290 + }, + { + "epoch": 0.5121798339386758, + "grad_norm": 1.664138913154602, + "learning_rate": 4.89927821805231e-05, + "loss": 0.2832683324813843, + "step": 119300 + }, + { + "epoch": 0.5122227660286958, + "grad_norm": 0.017171716317534447, + "learning_rate": 4.8988470460405475e-05, + "loss": 0.047049257159233096, + "step": 119310 + }, + { + "epoch": 0.5122656981187158, + "grad_norm": 3.7572007179260254, + "learning_rate": 4.898415874028785e-05, + "loss": 0.19532525539398193, + "step": 119320 + }, + { + "epoch": 0.5123086302087358, + "grad_norm": 2.1995935440063477, + "learning_rate": 4.897984702017023e-05, + "loss": 0.29763474464416506, + "step": 119330 + }, + { + "epoch": 0.5123515622987558, + "grad_norm": 0.06697755306959152, + "learning_rate": 4.897553530005261e-05, + "loss": 0.16395822763442994, + "step": 119340 + }, + { + "epoch": 0.5123944943887758, + "grad_norm": 2.4659860134124756, + "learning_rate": 4.8971223579934984e-05, + "loss": 0.3857898712158203, + "step": 119350 + }, + { + "epoch": 0.5124374264787959, + "grad_norm": 4.263999938964844, + "learning_rate": 4.896691185981736e-05, + "loss": 0.34842002391815186, + "step": 119360 + }, + { + "epoch": 0.5124803585688158, + "grad_norm": 0.004874889738857746, + "learning_rate": 4.896260013969973e-05, + "loss": 0.2140347480773926, + "step": 119370 + }, + { + "epoch": 0.5125232906588358, + "grad_norm": 0.015676314011216164, + "learning_rate": 4.895828841958211e-05, + "loss": 0.15982836484909058, + "step": 119380 + }, + { + "epoch": 0.5125662227488559, + "grad_norm": 0.0671662986278534, + "learning_rate": 4.895397669946449e-05, + "loss": 0.37576262950897216, + "step": 119390 + }, + { + "epoch": 0.5126091548388758, + "grad_norm": 0.5890264511108398, + "learning_rate": 4.8949664979346864e-05, + "loss": 0.2299262046813965, + "step": 119400 + }, + { + "epoch": 0.5126520869288959, + "grad_norm": 0.02277068980038166, + "learning_rate": 4.8945353259229235e-05, + "loss": 0.3137866735458374, + "step": 119410 + }, + { + "epoch": 0.5126950190189159, + "grad_norm": 1.7423510551452637, + "learning_rate": 4.894104153911161e-05, + "loss": 0.42569632530212403, + "step": 119420 + }, + { + "epoch": 0.5127379511089358, + "grad_norm": 0.08310827612876892, + "learning_rate": 4.893672981899399e-05, + "loss": 0.09291516542434693, + "step": 119430 + }, + { + "epoch": 0.5127808831989559, + "grad_norm": 0.0016216287622228265, + "learning_rate": 4.8932418098876374e-05, + "loss": 0.031064292788505553, + "step": 119440 + }, + { + "epoch": 0.5128238152889759, + "grad_norm": 4.090322017669678, + "learning_rate": 4.8928106378758744e-05, + "loss": 0.43431825637817384, + "step": 119450 + }, + { + "epoch": 0.512866747378996, + "grad_norm": 0.022212985903024673, + "learning_rate": 4.892379465864112e-05, + "loss": 0.07562644481658935, + "step": 119460 + }, + { + "epoch": 0.5129096794690159, + "grad_norm": 1.6900726556777954, + "learning_rate": 4.89194829385235e-05, + "loss": 0.17154940366744995, + "step": 119470 + }, + { + "epoch": 0.5129526115590359, + "grad_norm": 0.9105339646339417, + "learning_rate": 4.8915171218405876e-05, + "loss": 0.05909296274185181, + "step": 119480 + }, + { + "epoch": 0.512995543649056, + "grad_norm": 4.826657295227051, + "learning_rate": 4.891085949828825e-05, + "loss": 0.21760139465332032, + "step": 119490 + }, + { + "epoch": 0.5130384757390759, + "grad_norm": 0.024427048861980438, + "learning_rate": 4.8906547778170624e-05, + "loss": 0.21957478523254395, + "step": 119500 + }, + { + "epoch": 0.513081407829096, + "grad_norm": 1.8900254964828491, + "learning_rate": 4.8902236058053e-05, + "loss": 0.24639198780059815, + "step": 119510 + }, + { + "epoch": 0.513124339919116, + "grad_norm": 0.015116676688194275, + "learning_rate": 4.889792433793538e-05, + "loss": 0.07427915930747986, + "step": 119520 + }, + { + "epoch": 0.5131672720091359, + "grad_norm": 1.8460825681686401, + "learning_rate": 4.889361261781775e-05, + "loss": 0.29505224227905275, + "step": 119530 + }, + { + "epoch": 0.513210204099156, + "grad_norm": 0.009372652508318424, + "learning_rate": 4.888930089770013e-05, + "loss": 0.17351727485656737, + "step": 119540 + }, + { + "epoch": 0.513253136189176, + "grad_norm": 9.46724796295166, + "learning_rate": 4.888498917758251e-05, + "loss": 0.2226557493209839, + "step": 119550 + }, + { + "epoch": 0.5132960682791959, + "grad_norm": 0.013306156732141972, + "learning_rate": 4.888067745746489e-05, + "loss": 0.2522123336791992, + "step": 119560 + }, + { + "epoch": 0.513339000369216, + "grad_norm": 2.3092589378356934, + "learning_rate": 4.887636573734726e-05, + "loss": 0.31043570041656493, + "step": 119570 + }, + { + "epoch": 0.513381932459236, + "grad_norm": 1.2920349836349487, + "learning_rate": 4.8872054017229636e-05, + "loss": 0.26676900386810304, + "step": 119580 + }, + { + "epoch": 0.513424864549256, + "grad_norm": 2.2212910652160645, + "learning_rate": 4.8867742297112013e-05, + "loss": 0.19261443614959717, + "step": 119590 + }, + { + "epoch": 0.513467796639276, + "grad_norm": 0.011586138047277927, + "learning_rate": 4.886343057699439e-05, + "loss": 0.09315774440765381, + "step": 119600 + }, + { + "epoch": 0.513510728729296, + "grad_norm": 4.446086883544922, + "learning_rate": 4.885911885687676e-05, + "loss": 0.11330181360244751, + "step": 119610 + }, + { + "epoch": 0.513553660819316, + "grad_norm": 3.613748550415039, + "learning_rate": 4.885480713675914e-05, + "loss": 0.36722991466522215, + "step": 119620 + }, + { + "epoch": 0.513596592909336, + "grad_norm": 0.012964880093932152, + "learning_rate": 4.8850495416641516e-05, + "loss": 0.21476566791534424, + "step": 119630 + }, + { + "epoch": 0.5136395249993561, + "grad_norm": 0.008148579858243465, + "learning_rate": 4.8846183696523893e-05, + "loss": 0.3132286310195923, + "step": 119640 + }, + { + "epoch": 0.513682457089376, + "grad_norm": 10.44156551361084, + "learning_rate": 4.8841871976406264e-05, + "loss": 0.3538014888763428, + "step": 119650 + }, + { + "epoch": 0.513725389179396, + "grad_norm": 0.010352738201618195, + "learning_rate": 4.883756025628865e-05, + "loss": 0.09612705111503601, + "step": 119660 + }, + { + "epoch": 0.5137683212694161, + "grad_norm": 0.028367938473820686, + "learning_rate": 4.8833248536171025e-05, + "loss": 0.12962062358856202, + "step": 119670 + }, + { + "epoch": 0.513811253359436, + "grad_norm": 1.7438544034957886, + "learning_rate": 4.88289368160534e-05, + "loss": 0.09261025190353393, + "step": 119680 + }, + { + "epoch": 0.513854185449456, + "grad_norm": 0.004251221194863319, + "learning_rate": 4.882462509593578e-05, + "loss": 0.169364333152771, + "step": 119690 + }, + { + "epoch": 0.5138971175394761, + "grad_norm": 2.1003313064575195, + "learning_rate": 4.882031337581815e-05, + "loss": 0.26846721172332766, + "step": 119700 + }, + { + "epoch": 0.513940049629496, + "grad_norm": 2.0436689853668213, + "learning_rate": 4.881600165570053e-05, + "loss": 0.25663022994995116, + "step": 119710 + }, + { + "epoch": 0.5139829817195161, + "grad_norm": 0.13745620846748352, + "learning_rate": 4.8811689935582905e-05, + "loss": 0.13834238052368164, + "step": 119720 + }, + { + "epoch": 0.5140259138095361, + "grad_norm": 1.7277699708938599, + "learning_rate": 4.880737821546528e-05, + "loss": 0.3672468662261963, + "step": 119730 + }, + { + "epoch": 0.514068845899556, + "grad_norm": 1.5249468088150024, + "learning_rate": 4.880306649534765e-05, + "loss": 0.26362793445587157, + "step": 119740 + }, + { + "epoch": 0.5141117779895761, + "grad_norm": 0.02339192107319832, + "learning_rate": 4.879875477523003e-05, + "loss": 0.12202959060668946, + "step": 119750 + }, + { + "epoch": 0.5141547100795961, + "grad_norm": 0.012677889317274094, + "learning_rate": 4.879444305511241e-05, + "loss": 0.2084029197692871, + "step": 119760 + }, + { + "epoch": 0.514197642169616, + "grad_norm": 0.8021616339683533, + "learning_rate": 4.8790131334994785e-05, + "loss": 0.11093568801879883, + "step": 119770 + }, + { + "epoch": 0.5142405742596361, + "grad_norm": 0.0014070516917854548, + "learning_rate": 4.878581961487716e-05, + "loss": 0.35231709480285645, + "step": 119780 + }, + { + "epoch": 0.5142835063496561, + "grad_norm": 5.40916109085083, + "learning_rate": 4.878150789475954e-05, + "loss": 0.19839794635772706, + "step": 119790 + }, + { + "epoch": 0.5143264384396761, + "grad_norm": 0.730078935623169, + "learning_rate": 4.877719617464192e-05, + "loss": 0.050247853994369505, + "step": 119800 + }, + { + "epoch": 0.5143693705296961, + "grad_norm": 0.39397040009498596, + "learning_rate": 4.8772884454524295e-05, + "loss": 0.06387539505958557, + "step": 119810 + }, + { + "epoch": 0.5144123026197162, + "grad_norm": 0.00646138796582818, + "learning_rate": 4.8768572734406665e-05, + "loss": 0.19105199575424195, + "step": 119820 + }, + { + "epoch": 0.5144552347097361, + "grad_norm": 0.008075837977230549, + "learning_rate": 4.876426101428904e-05, + "loss": 0.1513899803161621, + "step": 119830 + }, + { + "epoch": 0.5144981667997561, + "grad_norm": 1.2805685997009277, + "learning_rate": 4.875994929417142e-05, + "loss": 0.18917036056518555, + "step": 119840 + }, + { + "epoch": 0.5145410988897762, + "grad_norm": 0.06163414567708969, + "learning_rate": 4.87556375740538e-05, + "loss": 0.1583377242088318, + "step": 119850 + }, + { + "epoch": 0.5145840309797961, + "grad_norm": 1.6436272859573364, + "learning_rate": 4.875132585393617e-05, + "loss": 0.17201461791992187, + "step": 119860 + }, + { + "epoch": 0.5146269630698161, + "grad_norm": 1.2979214191436768, + "learning_rate": 4.8747014133818545e-05, + "loss": 0.2652168273925781, + "step": 119870 + }, + { + "epoch": 0.5146698951598362, + "grad_norm": 2.1574161052703857, + "learning_rate": 4.874270241370092e-05, + "loss": 0.2809084415435791, + "step": 119880 + }, + { + "epoch": 0.5147128272498562, + "grad_norm": 0.0017827115952968597, + "learning_rate": 4.87383906935833e-05, + "loss": 0.3289920806884766, + "step": 119890 + }, + { + "epoch": 0.5147557593398762, + "grad_norm": 3.049043655395508, + "learning_rate": 4.873407897346568e-05, + "loss": 0.2237234592437744, + "step": 119900 + }, + { + "epoch": 0.5147986914298962, + "grad_norm": 0.01375525165349245, + "learning_rate": 4.8729767253348055e-05, + "loss": 0.13331135511398315, + "step": 119910 + }, + { + "epoch": 0.5148416235199162, + "grad_norm": 0.5105711817741394, + "learning_rate": 4.872545553323043e-05, + "loss": 0.22515881061553955, + "step": 119920 + }, + { + "epoch": 0.5148845556099362, + "grad_norm": 0.6324796676635742, + "learning_rate": 4.872114381311281e-05, + "loss": 0.20807430744171143, + "step": 119930 + }, + { + "epoch": 0.5149274876999562, + "grad_norm": 1.271294355392456, + "learning_rate": 4.871683209299518e-05, + "loss": 0.10505075454711914, + "step": 119940 + }, + { + "epoch": 0.5149704197899763, + "grad_norm": 0.09919047355651855, + "learning_rate": 4.871252037287756e-05, + "loss": 0.07043569684028625, + "step": 119950 + }, + { + "epoch": 0.5150133518799962, + "grad_norm": 0.12949281930923462, + "learning_rate": 4.8708208652759935e-05, + "loss": 0.19191750288009643, + "step": 119960 + }, + { + "epoch": 0.5150562839700162, + "grad_norm": 0.09126322716474533, + "learning_rate": 4.870389693264231e-05, + "loss": 0.19513965845108033, + "step": 119970 + }, + { + "epoch": 0.5150992160600363, + "grad_norm": 0.8619544506072998, + "learning_rate": 4.869958521252468e-05, + "loss": 0.36798958778381347, + "step": 119980 + }, + { + "epoch": 0.5151421481500562, + "grad_norm": 0.046924423426389694, + "learning_rate": 4.869527349240706e-05, + "loss": 0.2555686473846436, + "step": 119990 + }, + { + "epoch": 0.5151850802400763, + "grad_norm": 0.0034495419822633266, + "learning_rate": 4.869096177228944e-05, + "loss": 0.278632378578186, + "step": 120000 + }, + { + "epoch": 0.5151850802400763, + "eval_loss": 0.4063738286495209, + "eval_runtime": 27.1171, + "eval_samples_per_second": 3.688, + "eval_steps_per_second": 3.688, + "step": 120000 + }, + { + "epoch": 0.5152280123300963, + "grad_norm": 0.049443356692790985, + "learning_rate": 4.8686650052171814e-05, + "loss": 0.4057520866394043, + "step": 120010 + }, + { + "epoch": 0.5152709444201162, + "grad_norm": 0.0015346826985478401, + "learning_rate": 4.868233833205419e-05, + "loss": 0.10722736120224, + "step": 120020 + }, + { + "epoch": 0.5153138765101363, + "grad_norm": 0.0473739318549633, + "learning_rate": 4.867802661193657e-05, + "loss": 0.14505916833877563, + "step": 120030 + }, + { + "epoch": 0.5153568086001563, + "grad_norm": 0.023228967562317848, + "learning_rate": 4.8673714891818946e-05, + "loss": 0.2812461853027344, + "step": 120040 + }, + { + "epoch": 0.5153997406901762, + "grad_norm": 0.006087969522923231, + "learning_rate": 4.8669403171701324e-05, + "loss": 0.2892911911010742, + "step": 120050 + }, + { + "epoch": 0.5154426727801963, + "grad_norm": 0.00686612306162715, + "learning_rate": 4.86650914515837e-05, + "loss": 0.2778724431991577, + "step": 120060 + }, + { + "epoch": 0.5154856048702163, + "grad_norm": 0.6555600166320801, + "learning_rate": 4.866077973146607e-05, + "loss": 0.051532381772994997, + "step": 120070 + }, + { + "epoch": 0.5155285369602363, + "grad_norm": 6.1181464195251465, + "learning_rate": 4.865646801134845e-05, + "loss": 0.46932668685913087, + "step": 120080 + }, + { + "epoch": 0.5155714690502563, + "grad_norm": 0.0027987684588879347, + "learning_rate": 4.8652156291230826e-05, + "loss": 0.09915638566017151, + "step": 120090 + }, + { + "epoch": 0.5156144011402763, + "grad_norm": 1.746132493019104, + "learning_rate": 4.8647844571113204e-05, + "loss": 0.19194701910018921, + "step": 120100 + }, + { + "epoch": 0.5156573332302963, + "grad_norm": 0.0009860859718173742, + "learning_rate": 4.8643532850995574e-05, + "loss": 0.03388981223106384, + "step": 120110 + }, + { + "epoch": 0.5157002653203163, + "grad_norm": 0.7049172520637512, + "learning_rate": 4.863922113087795e-05, + "loss": 0.278376030921936, + "step": 120120 + }, + { + "epoch": 0.5157431974103364, + "grad_norm": 0.0033822215627878904, + "learning_rate": 4.863490941076033e-05, + "loss": 0.134699809551239, + "step": 120130 + }, + { + "epoch": 0.5157861295003563, + "grad_norm": 0.8424586653709412, + "learning_rate": 4.8630597690642706e-05, + "loss": 0.33212087154388426, + "step": 120140 + }, + { + "epoch": 0.5158290615903763, + "grad_norm": 2.1320204734802246, + "learning_rate": 4.8626285970525084e-05, + "loss": 0.26734697818756104, + "step": 120150 + }, + { + "epoch": 0.5158719936803964, + "grad_norm": 1.3596432209014893, + "learning_rate": 4.862197425040746e-05, + "loss": 0.10061935186386109, + "step": 120160 + }, + { + "epoch": 0.5159149257704163, + "grad_norm": 1.0820353031158447, + "learning_rate": 4.861766253028984e-05, + "loss": 0.17431243658065795, + "step": 120170 + }, + { + "epoch": 0.5159578578604364, + "grad_norm": 0.0008929084287956357, + "learning_rate": 4.8613350810172216e-05, + "loss": 0.14325375556945802, + "step": 120180 + }, + { + "epoch": 0.5160007899504564, + "grad_norm": 2.565387010574341, + "learning_rate": 4.8609039090054586e-05, + "loss": 0.33260061740875246, + "step": 120190 + }, + { + "epoch": 0.5160437220404763, + "grad_norm": 0.011970599181950092, + "learning_rate": 4.8604727369936964e-05, + "loss": 0.18246814012527465, + "step": 120200 + }, + { + "epoch": 0.5160866541304964, + "grad_norm": 0.02352987229824066, + "learning_rate": 4.860041564981934e-05, + "loss": 0.21905741691589356, + "step": 120210 + }, + { + "epoch": 0.5161295862205164, + "grad_norm": 0.03168490156531334, + "learning_rate": 4.859610392970172e-05, + "loss": 0.2257610082626343, + "step": 120220 + }, + { + "epoch": 0.5161725183105363, + "grad_norm": 0.12218201160430908, + "learning_rate": 4.859179220958409e-05, + "loss": 0.32497642040252683, + "step": 120230 + }, + { + "epoch": 0.5162154504005564, + "grad_norm": 4.0117034912109375, + "learning_rate": 4.8587480489466466e-05, + "loss": 0.00710456520318985, + "step": 120240 + }, + { + "epoch": 0.5162583824905764, + "grad_norm": 0.032607097178697586, + "learning_rate": 4.8583168769348844e-05, + "loss": 0.28474650382995603, + "step": 120250 + }, + { + "epoch": 0.5163013145805964, + "grad_norm": 0.1141975000500679, + "learning_rate": 4.857885704923123e-05, + "loss": 0.18312805891036987, + "step": 120260 + }, + { + "epoch": 0.5163442466706164, + "grad_norm": 0.008268962614238262, + "learning_rate": 4.85745453291136e-05, + "loss": 0.18339786529541016, + "step": 120270 + }, + { + "epoch": 0.5163871787606364, + "grad_norm": 5.46415376663208, + "learning_rate": 4.8570233608995976e-05, + "loss": 0.37812774181365966, + "step": 120280 + }, + { + "epoch": 0.5164301108506564, + "grad_norm": 4.220892429351807, + "learning_rate": 4.856592188887835e-05, + "loss": 0.24817111492156982, + "step": 120290 + }, + { + "epoch": 0.5164730429406764, + "grad_norm": 2.257537841796875, + "learning_rate": 4.856161016876073e-05, + "loss": 0.31221191883087157, + "step": 120300 + }, + { + "epoch": 0.5165159750306965, + "grad_norm": 0.1420465111732483, + "learning_rate": 4.85572984486431e-05, + "loss": 0.08064374923706055, + "step": 120310 + }, + { + "epoch": 0.5165589071207165, + "grad_norm": 2.9821970462799072, + "learning_rate": 4.855298672852548e-05, + "loss": 0.33227217197418213, + "step": 120320 + }, + { + "epoch": 0.5166018392107364, + "grad_norm": 0.23036722838878632, + "learning_rate": 4.8548675008407856e-05, + "loss": 0.15705791711807252, + "step": 120330 + }, + { + "epoch": 0.5166447713007565, + "grad_norm": 0.01970132440328598, + "learning_rate": 4.854436328829023e-05, + "loss": 0.3197164058685303, + "step": 120340 + }, + { + "epoch": 0.5166877033907765, + "grad_norm": 1.0530132055282593, + "learning_rate": 4.8540051568172603e-05, + "loss": 0.10031181573867798, + "step": 120350 + }, + { + "epoch": 0.5167306354807965, + "grad_norm": 0.05745401605963707, + "learning_rate": 4.853573984805498e-05, + "loss": 0.15597434043884278, + "step": 120360 + }, + { + "epoch": 0.5167735675708165, + "grad_norm": 0.003141403431072831, + "learning_rate": 4.8531428127937365e-05, + "loss": 0.035469675064086915, + "step": 120370 + }, + { + "epoch": 0.5168164996608365, + "grad_norm": 2.398857831954956, + "learning_rate": 4.852711640781974e-05, + "loss": 0.32916827201843263, + "step": 120380 + }, + { + "epoch": 0.5168594317508565, + "grad_norm": 3.834961175918579, + "learning_rate": 4.852280468770212e-05, + "loss": 0.08573095798492432, + "step": 120390 + }, + { + "epoch": 0.5169023638408765, + "grad_norm": 0.13391876220703125, + "learning_rate": 4.851849296758449e-05, + "loss": 0.15842015743255616, + "step": 120400 + }, + { + "epoch": 0.5169452959308966, + "grad_norm": 0.6579734086990356, + "learning_rate": 4.851418124746687e-05, + "loss": 0.16875349283218383, + "step": 120410 + }, + { + "epoch": 0.5169882280209165, + "grad_norm": 7.994966506958008, + "learning_rate": 4.8509869527349245e-05, + "loss": 0.18427584171295167, + "step": 120420 + }, + { + "epoch": 0.5170311601109365, + "grad_norm": 0.0009446104522794485, + "learning_rate": 4.850555780723162e-05, + "loss": 0.18941471576690674, + "step": 120430 + }, + { + "epoch": 0.5170740922009566, + "grad_norm": 0.09993302077054977, + "learning_rate": 4.850124608711399e-05, + "loss": 0.14466171264648436, + "step": 120440 + }, + { + "epoch": 0.5171170242909765, + "grad_norm": 0.06350284069776535, + "learning_rate": 4.849693436699637e-05, + "loss": 0.3882965326309204, + "step": 120450 + }, + { + "epoch": 0.5171599563809965, + "grad_norm": 0.24465759098529816, + "learning_rate": 4.849262264687875e-05, + "loss": 0.2968221426010132, + "step": 120460 + }, + { + "epoch": 0.5172028884710166, + "grad_norm": 0.36420390009880066, + "learning_rate": 4.8488310926761125e-05, + "loss": 0.20713613033294678, + "step": 120470 + }, + { + "epoch": 0.5172458205610365, + "grad_norm": 1.466484546661377, + "learning_rate": 4.84839992066435e-05, + "loss": 0.1342691659927368, + "step": 120480 + }, + { + "epoch": 0.5172887526510566, + "grad_norm": 4.7978620529174805, + "learning_rate": 4.847968748652588e-05, + "loss": 0.3547311305999756, + "step": 120490 + }, + { + "epoch": 0.5173316847410766, + "grad_norm": 0.018700161948800087, + "learning_rate": 4.847537576640826e-05, + "loss": 0.12048419713973998, + "step": 120500 + }, + { + "epoch": 0.5173746168310965, + "grad_norm": 0.02476024068892002, + "learning_rate": 4.8471064046290634e-05, + "loss": 0.10956237316131592, + "step": 120510 + }, + { + "epoch": 0.5174175489211166, + "grad_norm": 0.3054960370063782, + "learning_rate": 4.8466752326173005e-05, + "loss": 0.014632061123847961, + "step": 120520 + }, + { + "epoch": 0.5174604810111366, + "grad_norm": 0.015139803290367126, + "learning_rate": 4.846244060605538e-05, + "loss": 0.05507909655570984, + "step": 120530 + }, + { + "epoch": 0.5175034131011566, + "grad_norm": 0.054174065589904785, + "learning_rate": 4.845812888593776e-05, + "loss": 0.1126629114151001, + "step": 120540 + }, + { + "epoch": 0.5175463451911766, + "grad_norm": 0.22714291512966156, + "learning_rate": 4.845381716582014e-05, + "loss": 0.40899171829223635, + "step": 120550 + }, + { + "epoch": 0.5175892772811966, + "grad_norm": 0.03834908828139305, + "learning_rate": 4.844950544570251e-05, + "loss": 0.2631853103637695, + "step": 120560 + }, + { + "epoch": 0.5176322093712166, + "grad_norm": 0.007033985573798418, + "learning_rate": 4.8445193725584885e-05, + "loss": 0.2556145429611206, + "step": 120570 + }, + { + "epoch": 0.5176751414612366, + "grad_norm": 2.763967990875244, + "learning_rate": 4.844088200546726e-05, + "loss": 0.37081136703491213, + "step": 120580 + }, + { + "epoch": 0.5177180735512567, + "grad_norm": 0.0014255971182137728, + "learning_rate": 4.843657028534964e-05, + "loss": 0.14834992885589598, + "step": 120590 + }, + { + "epoch": 0.5177610056412766, + "grad_norm": 0.9102689027786255, + "learning_rate": 4.843225856523202e-05, + "loss": 0.23219048976898193, + "step": 120600 + }, + { + "epoch": 0.5178039377312966, + "grad_norm": 0.015008285641670227, + "learning_rate": 4.8427946845114394e-05, + "loss": 0.1743320941925049, + "step": 120610 + }, + { + "epoch": 0.5178468698213167, + "grad_norm": 1.1973248720169067, + "learning_rate": 4.842363512499677e-05, + "loss": 0.27267420291900635, + "step": 120620 + }, + { + "epoch": 0.5178898019113366, + "grad_norm": 0.023041389882564545, + "learning_rate": 4.841932340487915e-05, + "loss": 0.18333592414855956, + "step": 120630 + }, + { + "epoch": 0.5179327340013566, + "grad_norm": 0.01211979053914547, + "learning_rate": 4.841501168476152e-05, + "loss": 0.10341588258743287, + "step": 120640 + }, + { + "epoch": 0.5179756660913767, + "grad_norm": 5.778181552886963, + "learning_rate": 4.84106999646439e-05, + "loss": 0.1535136342048645, + "step": 120650 + }, + { + "epoch": 0.5180185981813966, + "grad_norm": 0.2170112133026123, + "learning_rate": 4.8406388244526274e-05, + "loss": 0.2336575508117676, + "step": 120660 + }, + { + "epoch": 0.5180615302714167, + "grad_norm": 0.015231816098093987, + "learning_rate": 4.840207652440865e-05, + "loss": 0.2085479497909546, + "step": 120670 + }, + { + "epoch": 0.5181044623614367, + "grad_norm": 0.02248145081102848, + "learning_rate": 4.839776480429102e-05, + "loss": 0.06460311412811279, + "step": 120680 + }, + { + "epoch": 0.5181473944514566, + "grad_norm": 5.987329006195068, + "learning_rate": 4.83934530841734e-05, + "loss": 0.30335426330566406, + "step": 120690 + }, + { + "epoch": 0.5181903265414767, + "grad_norm": 3.8316006660461426, + "learning_rate": 4.8389141364055777e-05, + "loss": 0.2360456943511963, + "step": 120700 + }, + { + "epoch": 0.5182332586314967, + "grad_norm": 0.05581430345773697, + "learning_rate": 4.8384829643938154e-05, + "loss": 0.07034187912940978, + "step": 120710 + }, + { + "epoch": 0.5182761907215166, + "grad_norm": 1.4331754446029663, + "learning_rate": 4.838051792382053e-05, + "loss": 0.2972614288330078, + "step": 120720 + }, + { + "epoch": 0.5183191228115367, + "grad_norm": 0.013728511519730091, + "learning_rate": 4.837620620370291e-05, + "loss": 0.06504011750221253, + "step": 120730 + }, + { + "epoch": 0.5183620549015567, + "grad_norm": 0.8004436492919922, + "learning_rate": 4.8371894483585286e-05, + "loss": 0.2647553443908691, + "step": 120740 + }, + { + "epoch": 0.5184049869915768, + "grad_norm": 0.23927471041679382, + "learning_rate": 4.836758276346766e-05, + "loss": 0.17371283769607543, + "step": 120750 + }, + { + "epoch": 0.5184479190815967, + "grad_norm": 1.3092291355133057, + "learning_rate": 4.836327104335004e-05, + "loss": 0.20586049556732178, + "step": 120760 + }, + { + "epoch": 0.5184908511716168, + "grad_norm": 0.6740944981575012, + "learning_rate": 4.835895932323241e-05, + "loss": 0.2501313924789429, + "step": 120770 + }, + { + "epoch": 0.5185337832616368, + "grad_norm": 0.01920218952000141, + "learning_rate": 4.835464760311479e-05, + "loss": 0.38378133773803713, + "step": 120780 + }, + { + "epoch": 0.5185767153516567, + "grad_norm": 1.44028639793396, + "learning_rate": 4.8350335882997166e-05, + "loss": 0.3875690698623657, + "step": 120790 + }, + { + "epoch": 0.5186196474416768, + "grad_norm": 1.6943719387054443, + "learning_rate": 4.834602416287954e-05, + "loss": 0.21529688835144042, + "step": 120800 + }, + { + "epoch": 0.5186625795316968, + "grad_norm": 1.4529924392700195, + "learning_rate": 4.8341712442761914e-05, + "loss": 0.13335733413696288, + "step": 120810 + }, + { + "epoch": 0.5187055116217167, + "grad_norm": 0.08313264697790146, + "learning_rate": 4.833740072264429e-05, + "loss": 0.2530683517456055, + "step": 120820 + }, + { + "epoch": 0.5187484437117368, + "grad_norm": 0.0469275526702404, + "learning_rate": 4.833308900252667e-05, + "loss": 0.09530457854270935, + "step": 120830 + }, + { + "epoch": 0.5187913758017568, + "grad_norm": 0.0735456719994545, + "learning_rate": 4.8328777282409046e-05, + "loss": 0.28104963302612307, + "step": 120840 + }, + { + "epoch": 0.5188343078917768, + "grad_norm": 8.66446590423584, + "learning_rate": 4.832446556229142e-05, + "loss": 0.2639842748641968, + "step": 120850 + }, + { + "epoch": 0.5188772399817968, + "grad_norm": 1.215241551399231, + "learning_rate": 4.83201538421738e-05, + "loss": 0.15650783777236937, + "step": 120860 + }, + { + "epoch": 0.5189201720718168, + "grad_norm": 1.2033604383468628, + "learning_rate": 4.831584212205618e-05, + "loss": 0.2996173620223999, + "step": 120870 + }, + { + "epoch": 0.5189631041618368, + "grad_norm": 0.020583370700478554, + "learning_rate": 4.8311530401938555e-05, + "loss": 0.05468282699584961, + "step": 120880 + }, + { + "epoch": 0.5190060362518568, + "grad_norm": 0.12773597240447998, + "learning_rate": 4.8307218681820926e-05, + "loss": 0.2769580602645874, + "step": 120890 + }, + { + "epoch": 0.5190489683418769, + "grad_norm": 0.0935334786772728, + "learning_rate": 4.83029069617033e-05, + "loss": 0.03761700987815857, + "step": 120900 + }, + { + "epoch": 0.5190919004318968, + "grad_norm": 0.11925974488258362, + "learning_rate": 4.829859524158568e-05, + "loss": 0.26186795234680177, + "step": 120910 + }, + { + "epoch": 0.5191348325219168, + "grad_norm": 7.694597244262695, + "learning_rate": 4.829428352146806e-05, + "loss": 0.32566494941711427, + "step": 120920 + }, + { + "epoch": 0.5191777646119369, + "grad_norm": 0.3018726408481598, + "learning_rate": 4.828997180135043e-05, + "loss": 0.1309618830680847, + "step": 120930 + }, + { + "epoch": 0.5192206967019568, + "grad_norm": 0.0553029403090477, + "learning_rate": 4.8285660081232806e-05, + "loss": 0.09666863679885865, + "step": 120940 + }, + { + "epoch": 0.5192636287919769, + "grad_norm": 3.9531891345977783, + "learning_rate": 4.828134836111518e-05, + "loss": 0.12105257511138916, + "step": 120950 + }, + { + "epoch": 0.5193065608819969, + "grad_norm": 0.8712236285209656, + "learning_rate": 4.827703664099757e-05, + "loss": 0.1712932825088501, + "step": 120960 + }, + { + "epoch": 0.5193494929720168, + "grad_norm": 0.032162413001060486, + "learning_rate": 4.827272492087994e-05, + "loss": 0.1988675117492676, + "step": 120970 + }, + { + "epoch": 0.5193924250620369, + "grad_norm": 5.646240234375, + "learning_rate": 4.8268413200762315e-05, + "loss": 0.21210241317749023, + "step": 120980 + }, + { + "epoch": 0.5194353571520569, + "grad_norm": 0.017885640263557434, + "learning_rate": 4.826410148064469e-05, + "loss": 0.0828397035598755, + "step": 120990 + }, + { + "epoch": 0.5194782892420768, + "grad_norm": 0.05320185795426369, + "learning_rate": 4.825978976052707e-05, + "loss": 0.22818078994750976, + "step": 121000 + }, + { + "epoch": 0.5194782892420768, + "eval_loss": 0.4079383909702301, + "eval_runtime": 27.187, + "eval_samples_per_second": 3.678, + "eval_steps_per_second": 3.678, + "step": 121000 + }, + { + "epoch": 0.5195212213320969, + "grad_norm": 0.0048073879443109035, + "learning_rate": 4.825547804040944e-05, + "loss": 0.33350112438201907, + "step": 121010 + }, + { + "epoch": 0.5195641534221169, + "grad_norm": 2.622666120529175, + "learning_rate": 4.825116632029182e-05, + "loss": 0.2112447738647461, + "step": 121020 + }, + { + "epoch": 0.5196070855121369, + "grad_norm": 0.03589218854904175, + "learning_rate": 4.8246854600174195e-05, + "loss": 0.2726699113845825, + "step": 121030 + }, + { + "epoch": 0.5196500176021569, + "grad_norm": 0.6701458692550659, + "learning_rate": 4.824254288005657e-05, + "loss": 0.09031559228897094, + "step": 121040 + }, + { + "epoch": 0.5196929496921769, + "grad_norm": 0.08984647691249847, + "learning_rate": 4.823823115993894e-05, + "loss": 0.043091869354248045, + "step": 121050 + }, + { + "epoch": 0.5197358817821969, + "grad_norm": 0.08775527030229568, + "learning_rate": 4.823391943982132e-05, + "loss": 0.32499117851257325, + "step": 121060 + }, + { + "epoch": 0.5197788138722169, + "grad_norm": 0.07303040474653244, + "learning_rate": 4.8229607719703704e-05, + "loss": 0.24889826774597168, + "step": 121070 + }, + { + "epoch": 0.519821745962237, + "grad_norm": 0.01556952204555273, + "learning_rate": 4.822529599958608e-05, + "loss": 0.30057711601257325, + "step": 121080 + }, + { + "epoch": 0.5198646780522569, + "grad_norm": 1.1740511655807495, + "learning_rate": 4.822098427946845e-05, + "loss": 0.2835972309112549, + "step": 121090 + }, + { + "epoch": 0.5199076101422769, + "grad_norm": 3.6784210205078125, + "learning_rate": 4.821667255935083e-05, + "loss": 0.15858445167541504, + "step": 121100 + }, + { + "epoch": 0.519950542232297, + "grad_norm": 0.025471484288573265, + "learning_rate": 4.821236083923321e-05, + "loss": 0.09517346024513244, + "step": 121110 + }, + { + "epoch": 0.5199934743223169, + "grad_norm": 0.07156090438365936, + "learning_rate": 4.8208049119115584e-05, + "loss": 0.1761907935142517, + "step": 121120 + }, + { + "epoch": 0.520036406412337, + "grad_norm": 3.4601879119873047, + "learning_rate": 4.820373739899796e-05, + "loss": 0.15483657121658326, + "step": 121130 + }, + { + "epoch": 0.520079338502357, + "grad_norm": 1.6287287473678589, + "learning_rate": 4.819942567888033e-05, + "loss": 0.3838613271713257, + "step": 121140 + }, + { + "epoch": 0.5201222705923769, + "grad_norm": 2.1926848888397217, + "learning_rate": 4.819511395876271e-05, + "loss": 0.3708859920501709, + "step": 121150 + }, + { + "epoch": 0.520165202682397, + "grad_norm": 0.10348998755216599, + "learning_rate": 4.819080223864509e-05, + "loss": 0.13804128170013427, + "step": 121160 + }, + { + "epoch": 0.520208134772417, + "grad_norm": 0.029189372435212135, + "learning_rate": 4.8186490518527464e-05, + "loss": 0.1679968476295471, + "step": 121170 + }, + { + "epoch": 0.520251066862437, + "grad_norm": 0.614188551902771, + "learning_rate": 4.818217879840984e-05, + "loss": 0.17322077751159667, + "step": 121180 + }, + { + "epoch": 0.520293998952457, + "grad_norm": 1.1057333946228027, + "learning_rate": 4.817786707829222e-05, + "loss": 0.20551702976226807, + "step": 121190 + }, + { + "epoch": 0.520336931042477, + "grad_norm": 0.011034536175429821, + "learning_rate": 4.8173555358174596e-05, + "loss": 0.08477430939674377, + "step": 121200 + }, + { + "epoch": 0.5203798631324971, + "grad_norm": 0.0020858512725681067, + "learning_rate": 4.8169243638056974e-05, + "loss": 0.19864094257354736, + "step": 121210 + }, + { + "epoch": 0.520422795222517, + "grad_norm": 0.018683254718780518, + "learning_rate": 4.8164931917939344e-05, + "loss": 0.2104574680328369, + "step": 121220 + }, + { + "epoch": 0.520465727312537, + "grad_norm": 1.2795490026474, + "learning_rate": 4.816062019782172e-05, + "loss": 0.19712650775909424, + "step": 121230 + }, + { + "epoch": 0.5205086594025571, + "grad_norm": 6.894408226013184, + "learning_rate": 4.81563084777041e-05, + "loss": 0.28277971744537356, + "step": 121240 + }, + { + "epoch": 0.520551591492577, + "grad_norm": 1.9820345640182495, + "learning_rate": 4.8151996757586476e-05, + "loss": 0.15565208196640015, + "step": 121250 + }, + { + "epoch": 0.5205945235825971, + "grad_norm": 3.0856571197509766, + "learning_rate": 4.814768503746885e-05, + "loss": 0.17781033515930175, + "step": 121260 + }, + { + "epoch": 0.5206374556726171, + "grad_norm": 0.05069053918123245, + "learning_rate": 4.8143373317351224e-05, + "loss": 0.195988667011261, + "step": 121270 + }, + { + "epoch": 0.520680387762637, + "grad_norm": 0.011564699932932854, + "learning_rate": 4.81390615972336e-05, + "loss": 0.33460540771484376, + "step": 121280 + }, + { + "epoch": 0.5207233198526571, + "grad_norm": 0.003371889004483819, + "learning_rate": 4.813474987711598e-05, + "loss": 0.1601473569869995, + "step": 121290 + }, + { + "epoch": 0.5207662519426771, + "grad_norm": 0.013386544771492481, + "learning_rate": 4.8130438156998356e-05, + "loss": 0.2145592212677002, + "step": 121300 + }, + { + "epoch": 0.520809184032697, + "grad_norm": 4.002100944519043, + "learning_rate": 4.8126126436880733e-05, + "loss": 0.3833177089691162, + "step": 121310 + }, + { + "epoch": 0.5208521161227171, + "grad_norm": 5.617799758911133, + "learning_rate": 4.812181471676311e-05, + "loss": 0.32094109058380127, + "step": 121320 + }, + { + "epoch": 0.5208950482127371, + "grad_norm": 3.256422519683838, + "learning_rate": 4.811750299664549e-05, + "loss": 0.16867411136627197, + "step": 121330 + }, + { + "epoch": 0.5209379803027571, + "grad_norm": 1.2565315961837769, + "learning_rate": 4.811319127652786e-05, + "loss": 0.3385310649871826, + "step": 121340 + }, + { + "epoch": 0.5209809123927771, + "grad_norm": 1.6213438510894775, + "learning_rate": 4.8108879556410236e-05, + "loss": 0.3337943315505981, + "step": 121350 + }, + { + "epoch": 0.5210238444827971, + "grad_norm": 5.023358345031738, + "learning_rate": 4.8104567836292613e-05, + "loss": 0.09535663723945617, + "step": 121360 + }, + { + "epoch": 0.5210667765728171, + "grad_norm": 1.1963624954223633, + "learning_rate": 4.810025611617499e-05, + "loss": 0.32074360847473143, + "step": 121370 + }, + { + "epoch": 0.5211097086628371, + "grad_norm": 2.154831647872925, + "learning_rate": 4.809594439605736e-05, + "loss": 0.42052149772644043, + "step": 121380 + }, + { + "epoch": 0.5211526407528572, + "grad_norm": 0.126048281788826, + "learning_rate": 4.809163267593974e-05, + "loss": 0.26276843547821044, + "step": 121390 + }, + { + "epoch": 0.5211955728428771, + "grad_norm": 0.3626399338245392, + "learning_rate": 4.8087320955822116e-05, + "loss": 0.24476270675659179, + "step": 121400 + }, + { + "epoch": 0.5212385049328971, + "grad_norm": 0.47341856360435486, + "learning_rate": 4.808300923570449e-05, + "loss": 0.10128217935562134, + "step": 121410 + }, + { + "epoch": 0.5212814370229172, + "grad_norm": 0.6205283999443054, + "learning_rate": 4.807869751558687e-05, + "loss": 0.31277463436126707, + "step": 121420 + }, + { + "epoch": 0.5213243691129371, + "grad_norm": 0.01310188602656126, + "learning_rate": 4.807438579546925e-05, + "loss": 0.2594514608383179, + "step": 121430 + }, + { + "epoch": 0.5213673012029572, + "grad_norm": 0.26950836181640625, + "learning_rate": 4.8070074075351625e-05, + "loss": 0.2451323986053467, + "step": 121440 + }, + { + "epoch": 0.5214102332929772, + "grad_norm": 0.004093456082046032, + "learning_rate": 4.8065762355234e-05, + "loss": 0.2703315496444702, + "step": 121450 + }, + { + "epoch": 0.5214531653829971, + "grad_norm": 1.2792869806289673, + "learning_rate": 4.806145063511637e-05, + "loss": 0.2487691879272461, + "step": 121460 + }, + { + "epoch": 0.5214960974730172, + "grad_norm": 1.077581524848938, + "learning_rate": 4.805713891499875e-05, + "loss": 0.2256721019744873, + "step": 121470 + }, + { + "epoch": 0.5215390295630372, + "grad_norm": 0.6290398240089417, + "learning_rate": 4.805282719488113e-05, + "loss": 0.11926252841949463, + "step": 121480 + }, + { + "epoch": 0.5215819616530571, + "grad_norm": 0.009173483587801456, + "learning_rate": 4.8048515474763505e-05, + "loss": 0.17269976139068605, + "step": 121490 + }, + { + "epoch": 0.5216248937430772, + "grad_norm": 0.09031513333320618, + "learning_rate": 4.804420375464588e-05, + "loss": 0.22257568836212158, + "step": 121500 + }, + { + "epoch": 0.5216678258330972, + "grad_norm": 0.06083008274435997, + "learning_rate": 4.803989203452825e-05, + "loss": 0.15785495042800904, + "step": 121510 + }, + { + "epoch": 0.5217107579231172, + "grad_norm": 0.10042346268892288, + "learning_rate": 4.803558031441063e-05, + "loss": 0.16853535175323486, + "step": 121520 + }, + { + "epoch": 0.5217536900131372, + "grad_norm": 0.5119935274124146, + "learning_rate": 4.803126859429301e-05, + "loss": 0.22263917922973633, + "step": 121530 + }, + { + "epoch": 0.5217966221031572, + "grad_norm": 0.029625194147229195, + "learning_rate": 4.8026956874175385e-05, + "loss": 0.29853482246398927, + "step": 121540 + }, + { + "epoch": 0.5218395541931772, + "grad_norm": 1.4290279150009155, + "learning_rate": 4.802264515405776e-05, + "loss": 0.17295366525650024, + "step": 121550 + }, + { + "epoch": 0.5218824862831972, + "grad_norm": 0.2421247512102127, + "learning_rate": 4.801833343394014e-05, + "loss": 0.24902019500732422, + "step": 121560 + }, + { + "epoch": 0.5219254183732173, + "grad_norm": 0.017065497115254402, + "learning_rate": 4.801402171382252e-05, + "loss": 0.17499693632125854, + "step": 121570 + }, + { + "epoch": 0.5219683504632372, + "grad_norm": 1.5281258821487427, + "learning_rate": 4.8009709993704895e-05, + "loss": 0.2719564914703369, + "step": 121580 + }, + { + "epoch": 0.5220112825532572, + "grad_norm": 0.008518542163074017, + "learning_rate": 4.8005398273587265e-05, + "loss": 0.23640027046203613, + "step": 121590 + }, + { + "epoch": 0.5220542146432773, + "grad_norm": 0.04925156384706497, + "learning_rate": 4.800108655346964e-05, + "loss": 0.13170560598373413, + "step": 121600 + }, + { + "epoch": 0.5220971467332973, + "grad_norm": 0.30054551362991333, + "learning_rate": 4.799677483335202e-05, + "loss": 0.3642754554748535, + "step": 121610 + }, + { + "epoch": 0.5221400788233173, + "grad_norm": 0.5951288938522339, + "learning_rate": 4.79924631132344e-05, + "loss": 0.042504727840423584, + "step": 121620 + }, + { + "epoch": 0.5221830109133373, + "grad_norm": 0.07040644437074661, + "learning_rate": 4.798815139311677e-05, + "loss": 0.29732842445373536, + "step": 121630 + }, + { + "epoch": 0.5222259430033573, + "grad_norm": 0.005738586187362671, + "learning_rate": 4.7983839672999145e-05, + "loss": 0.15685839653015138, + "step": 121640 + }, + { + "epoch": 0.5222688750933773, + "grad_norm": 0.9687967896461487, + "learning_rate": 4.797952795288152e-05, + "loss": 0.05645252466201782, + "step": 121650 + }, + { + "epoch": 0.5223118071833973, + "grad_norm": 0.8229928612709045, + "learning_rate": 4.79752162327639e-05, + "loss": 0.11232262849807739, + "step": 121660 + }, + { + "epoch": 0.5223547392734174, + "grad_norm": 0.00725303590297699, + "learning_rate": 4.797090451264628e-05, + "loss": 0.25438005924224855, + "step": 121670 + }, + { + "epoch": 0.5223976713634373, + "grad_norm": 0.046411290764808655, + "learning_rate": 4.7966592792528655e-05, + "loss": 0.12618789672851563, + "step": 121680 + }, + { + "epoch": 0.5224406034534573, + "grad_norm": 0.37380075454711914, + "learning_rate": 4.796228107241103e-05, + "loss": 0.2813948392868042, + "step": 121690 + }, + { + "epoch": 0.5224835355434774, + "grad_norm": 1.166673183441162, + "learning_rate": 4.795796935229341e-05, + "loss": 0.5213551998138428, + "step": 121700 + }, + { + "epoch": 0.5225264676334973, + "grad_norm": 0.4081331491470337, + "learning_rate": 4.795365763217578e-05, + "loss": 0.1555892825126648, + "step": 121710 + }, + { + "epoch": 0.5225693997235173, + "grad_norm": 3.880535840988159, + "learning_rate": 4.794934591205816e-05, + "loss": 0.3266066789627075, + "step": 121720 + }, + { + "epoch": 0.5226123318135374, + "grad_norm": 0.028656592592597008, + "learning_rate": 4.7945034191940534e-05, + "loss": 0.12044739723205566, + "step": 121730 + }, + { + "epoch": 0.5226552639035573, + "grad_norm": 3.4097096920013428, + "learning_rate": 4.794072247182291e-05, + "loss": 0.26642622947692873, + "step": 121740 + }, + { + "epoch": 0.5226981959935774, + "grad_norm": 0.35419461131095886, + "learning_rate": 4.793641075170528e-05, + "loss": 0.24461143016815184, + "step": 121750 + }, + { + "epoch": 0.5227411280835974, + "grad_norm": 4.800034046173096, + "learning_rate": 4.793209903158766e-05, + "loss": 0.15310922861099244, + "step": 121760 + }, + { + "epoch": 0.5227840601736173, + "grad_norm": 0.08995475620031357, + "learning_rate": 4.792778731147004e-05, + "loss": 0.40237984657287595, + "step": 121770 + }, + { + "epoch": 0.5228269922636374, + "grad_norm": 0.35849398374557495, + "learning_rate": 4.792347559135242e-05, + "loss": 0.17247823476791382, + "step": 121780 + }, + { + "epoch": 0.5228699243536574, + "grad_norm": 0.0034624820109456778, + "learning_rate": 4.791916387123479e-05, + "loss": 0.048566815257072446, + "step": 121790 + }, + { + "epoch": 0.5229128564436774, + "grad_norm": 0.023735811933875084, + "learning_rate": 4.791485215111717e-05, + "loss": 0.21525065898895263, + "step": 121800 + }, + { + "epoch": 0.5229557885336974, + "grad_norm": 2.3782215118408203, + "learning_rate": 4.7910540430999546e-05, + "loss": 0.21360011100769044, + "step": 121810 + }, + { + "epoch": 0.5229987206237174, + "grad_norm": 2.4215610027313232, + "learning_rate": 4.7906228710881924e-05, + "loss": 0.24774274826049805, + "step": 121820 + }, + { + "epoch": 0.5230416527137374, + "grad_norm": 0.03247027471661568, + "learning_rate": 4.7901916990764294e-05, + "loss": 0.2859419107437134, + "step": 121830 + }, + { + "epoch": 0.5230845848037574, + "grad_norm": 0.0032771273981779814, + "learning_rate": 4.789760527064667e-05, + "loss": 0.08464333415031433, + "step": 121840 + }, + { + "epoch": 0.5231275168937775, + "grad_norm": 0.9314298033714294, + "learning_rate": 4.789329355052905e-05, + "loss": 0.3259695529937744, + "step": 121850 + }, + { + "epoch": 0.5231704489837974, + "grad_norm": 0.08188939094543457, + "learning_rate": 4.7888981830411426e-05, + "loss": 0.07389405965805054, + "step": 121860 + }, + { + "epoch": 0.5232133810738174, + "grad_norm": 0.027831045910716057, + "learning_rate": 4.7884670110293804e-05, + "loss": 0.18900372982025146, + "step": 121870 + }, + { + "epoch": 0.5232563131638375, + "grad_norm": 0.04473964497447014, + "learning_rate": 4.7880358390176174e-05, + "loss": 0.15805684328079223, + "step": 121880 + }, + { + "epoch": 0.5232992452538574, + "grad_norm": 0.005574927665293217, + "learning_rate": 4.787604667005856e-05, + "loss": 0.1679774761199951, + "step": 121890 + }, + { + "epoch": 0.5233421773438774, + "grad_norm": 1.2569983005523682, + "learning_rate": 4.7871734949940936e-05, + "loss": 0.20932137966156006, + "step": 121900 + }, + { + "epoch": 0.5233851094338975, + "grad_norm": 0.020131045952439308, + "learning_rate": 4.786742322982331e-05, + "loss": 0.19063591957092285, + "step": 121910 + }, + { + "epoch": 0.5234280415239174, + "grad_norm": 0.026454292237758636, + "learning_rate": 4.7863111509705684e-05, + "loss": 0.1315123200416565, + "step": 121920 + }, + { + "epoch": 0.5234709736139375, + "grad_norm": 0.20026637613773346, + "learning_rate": 4.785879978958806e-05, + "loss": 0.14659813642501832, + "step": 121930 + }, + { + "epoch": 0.5235139057039575, + "grad_norm": 0.018991755321621895, + "learning_rate": 4.785448806947044e-05, + "loss": 0.28632752895355223, + "step": 121940 + }, + { + "epoch": 0.5235568377939774, + "grad_norm": 0.01908653788268566, + "learning_rate": 4.7850176349352816e-05, + "loss": 0.3429946184158325, + "step": 121950 + }, + { + "epoch": 0.5235997698839975, + "grad_norm": 3.118744373321533, + "learning_rate": 4.7845864629235186e-05, + "loss": 0.20299386978149414, + "step": 121960 + }, + { + "epoch": 0.5236427019740175, + "grad_norm": 0.13232176005840302, + "learning_rate": 4.7841552909117564e-05, + "loss": 0.2375958204269409, + "step": 121970 + }, + { + "epoch": 0.5236856340640375, + "grad_norm": 1.2320634126663208, + "learning_rate": 4.783724118899994e-05, + "loss": 0.2093639373779297, + "step": 121980 + }, + { + "epoch": 0.5237285661540575, + "grad_norm": 1.9221858978271484, + "learning_rate": 4.783292946888232e-05, + "loss": 0.1392124056816101, + "step": 121990 + }, + { + "epoch": 0.5237714982440775, + "grad_norm": 0.579514741897583, + "learning_rate": 4.7828617748764696e-05, + "loss": 0.3182239532470703, + "step": 122000 + }, + { + "epoch": 0.5237714982440775, + "eval_loss": 0.4002816081047058, + "eval_runtime": 27.3332, + "eval_samples_per_second": 3.659, + "eval_steps_per_second": 3.659, + "step": 122000 + }, + { + "epoch": 0.5238144303340975, + "grad_norm": 2.985659599304199, + "learning_rate": 4.782430602864707e-05, + "loss": 0.10451849699020385, + "step": 122010 + }, + { + "epoch": 0.5238573624241175, + "grad_norm": 0.028085991740226746, + "learning_rate": 4.781999430852945e-05, + "loss": 0.2830367565155029, + "step": 122020 + }, + { + "epoch": 0.5239002945141376, + "grad_norm": 0.0021224322263151407, + "learning_rate": 4.781568258841183e-05, + "loss": 0.37227163314819334, + "step": 122030 + }, + { + "epoch": 0.5239432266041576, + "grad_norm": 0.025949900969862938, + "learning_rate": 4.78113708682942e-05, + "loss": 0.19543081521987915, + "step": 122040 + }, + { + "epoch": 0.5239861586941775, + "grad_norm": 0.020989255979657173, + "learning_rate": 4.7807059148176576e-05, + "loss": 0.27584066390991213, + "step": 122050 + }, + { + "epoch": 0.5240290907841976, + "grad_norm": 1.4416751861572266, + "learning_rate": 4.780274742805895e-05, + "loss": 0.15617939233779907, + "step": 122060 + }, + { + "epoch": 0.5240720228742176, + "grad_norm": 3.7463674545288086, + "learning_rate": 4.779843570794133e-05, + "loss": 0.4121096611022949, + "step": 122070 + }, + { + "epoch": 0.5241149549642375, + "grad_norm": 0.019098132848739624, + "learning_rate": 4.77941239878237e-05, + "loss": 0.20457956790924073, + "step": 122080 + }, + { + "epoch": 0.5241578870542576, + "grad_norm": 0.05528656765818596, + "learning_rate": 4.778981226770608e-05, + "loss": 0.21979448795318604, + "step": 122090 + }, + { + "epoch": 0.5242008191442776, + "grad_norm": 0.5462521910667419, + "learning_rate": 4.7785500547588455e-05, + "loss": 0.14519410133361815, + "step": 122100 + }, + { + "epoch": 0.5242437512342976, + "grad_norm": 4.878800392150879, + "learning_rate": 4.778118882747083e-05, + "loss": 0.4063398361206055, + "step": 122110 + }, + { + "epoch": 0.5242866833243176, + "grad_norm": 0.035689231008291245, + "learning_rate": 4.777687710735321e-05, + "loss": 0.06844155192375183, + "step": 122120 + }, + { + "epoch": 0.5243296154143376, + "grad_norm": 1.3210026025772095, + "learning_rate": 4.777256538723559e-05, + "loss": 0.11408164501190185, + "step": 122130 + }, + { + "epoch": 0.5243725475043576, + "grad_norm": 0.03476414084434509, + "learning_rate": 4.7768253667117965e-05, + "loss": 0.1776628613471985, + "step": 122140 + }, + { + "epoch": 0.5244154795943776, + "grad_norm": 0.0011379508068785071, + "learning_rate": 4.776394194700034e-05, + "loss": 0.16172831058502196, + "step": 122150 + }, + { + "epoch": 0.5244584116843977, + "grad_norm": 0.4380556344985962, + "learning_rate": 4.775963022688271e-05, + "loss": 0.23161208629608154, + "step": 122160 + }, + { + "epoch": 0.5245013437744176, + "grad_norm": 1.6195735931396484, + "learning_rate": 4.775531850676509e-05, + "loss": 0.250444221496582, + "step": 122170 + }, + { + "epoch": 0.5245442758644376, + "grad_norm": 3.016010046005249, + "learning_rate": 4.775100678664747e-05, + "loss": 0.34918644428253176, + "step": 122180 + }, + { + "epoch": 0.5245872079544577, + "grad_norm": 0.0846981480717659, + "learning_rate": 4.7746695066529845e-05, + "loss": 0.32720112800598145, + "step": 122190 + }, + { + "epoch": 0.5246301400444776, + "grad_norm": 0.0056806099601089954, + "learning_rate": 4.7742383346412215e-05, + "loss": 0.015474987030029298, + "step": 122200 + }, + { + "epoch": 0.5246730721344977, + "grad_norm": 0.9824272990226746, + "learning_rate": 4.773807162629459e-05, + "loss": 0.32699992656707766, + "step": 122210 + }, + { + "epoch": 0.5247160042245177, + "grad_norm": 1.053566336631775, + "learning_rate": 4.773375990617697e-05, + "loss": 0.4157695293426514, + "step": 122220 + }, + { + "epoch": 0.5247589363145376, + "grad_norm": 0.001544310594908893, + "learning_rate": 4.772944818605935e-05, + "loss": 0.1750641345977783, + "step": 122230 + }, + { + "epoch": 0.5248018684045577, + "grad_norm": 0.019139086827635765, + "learning_rate": 4.7725136465941725e-05, + "loss": 0.31319055557250974, + "step": 122240 + }, + { + "epoch": 0.5248448004945777, + "grad_norm": 0.5469844937324524, + "learning_rate": 4.77208247458241e-05, + "loss": 0.19357837438583375, + "step": 122250 + }, + { + "epoch": 0.5248877325845976, + "grad_norm": 0.00616453168913722, + "learning_rate": 4.771651302570648e-05, + "loss": 0.11588430404663086, + "step": 122260 + }, + { + "epoch": 0.5249306646746177, + "grad_norm": 2.7403957843780518, + "learning_rate": 4.771220130558886e-05, + "loss": 0.21277878284454346, + "step": 122270 + }, + { + "epoch": 0.5249735967646377, + "grad_norm": 0.18290357291698456, + "learning_rate": 4.7707889585471234e-05, + "loss": 0.28863661289215087, + "step": 122280 + }, + { + "epoch": 0.5250165288546577, + "grad_norm": 0.15102115273475647, + "learning_rate": 4.7703577865353605e-05, + "loss": 0.21934478282928466, + "step": 122290 + }, + { + "epoch": 0.5250594609446777, + "grad_norm": 0.009661280550062656, + "learning_rate": 4.769926614523598e-05, + "loss": 0.22365834712982177, + "step": 122300 + }, + { + "epoch": 0.5251023930346977, + "grad_norm": 0.04367806389927864, + "learning_rate": 4.769495442511836e-05, + "loss": 0.1199500560760498, + "step": 122310 + }, + { + "epoch": 0.5251453251247177, + "grad_norm": 0.009130306541919708, + "learning_rate": 4.769064270500074e-05, + "loss": 0.12299952507019044, + "step": 122320 + }, + { + "epoch": 0.5251882572147377, + "grad_norm": 0.0018692787270992994, + "learning_rate": 4.768633098488311e-05, + "loss": 0.19006558656692504, + "step": 122330 + }, + { + "epoch": 0.5252311893047578, + "grad_norm": 0.5307639837265015, + "learning_rate": 4.7682019264765485e-05, + "loss": 0.35393648147583007, + "step": 122340 + }, + { + "epoch": 0.5252741213947777, + "grad_norm": 2.1476428508758545, + "learning_rate": 4.767770754464786e-05, + "loss": 0.32211828231811523, + "step": 122350 + }, + { + "epoch": 0.5253170534847977, + "grad_norm": 0.0038509471341967583, + "learning_rate": 4.767339582453024e-05, + "loss": 0.167023229598999, + "step": 122360 + }, + { + "epoch": 0.5253599855748178, + "grad_norm": 0.22803162038326263, + "learning_rate": 4.766908410441262e-05, + "loss": 0.3219106435775757, + "step": 122370 + }, + { + "epoch": 0.5254029176648377, + "grad_norm": 0.9811261892318726, + "learning_rate": 4.7664772384294994e-05, + "loss": 0.05802839994430542, + "step": 122380 + }, + { + "epoch": 0.5254458497548578, + "grad_norm": 0.08851549029350281, + "learning_rate": 4.766046066417737e-05, + "loss": 0.0848920226097107, + "step": 122390 + }, + { + "epoch": 0.5254887818448778, + "grad_norm": 4.3986124992370605, + "learning_rate": 4.765614894405975e-05, + "loss": 0.07044092416763306, + "step": 122400 + }, + { + "epoch": 0.5255317139348977, + "grad_norm": 0.05002744123339653, + "learning_rate": 4.765183722394212e-05, + "loss": 0.1601981997489929, + "step": 122410 + }, + { + "epoch": 0.5255746460249178, + "grad_norm": 0.029643887653946877, + "learning_rate": 4.7647525503824497e-05, + "loss": 0.2490626811981201, + "step": 122420 + }, + { + "epoch": 0.5256175781149378, + "grad_norm": 0.0011830313596874475, + "learning_rate": 4.7643213783706874e-05, + "loss": 0.10901587009429932, + "step": 122430 + }, + { + "epoch": 0.5256605102049577, + "grad_norm": 0.0028918858151882887, + "learning_rate": 4.763890206358925e-05, + "loss": 0.4196751594543457, + "step": 122440 + }, + { + "epoch": 0.5257034422949778, + "grad_norm": 0.028822220861911774, + "learning_rate": 4.763459034347162e-05, + "loss": 0.15838117599487306, + "step": 122450 + }, + { + "epoch": 0.5257463743849978, + "grad_norm": 2.5338051319122314, + "learning_rate": 4.7630278623354e-05, + "loss": 0.3437318801879883, + "step": 122460 + }, + { + "epoch": 0.5257893064750179, + "grad_norm": 0.00381831219419837, + "learning_rate": 4.7625966903236377e-05, + "loss": 0.1670363187789917, + "step": 122470 + }, + { + "epoch": 0.5258322385650378, + "grad_norm": 0.0786067470908165, + "learning_rate": 4.762165518311876e-05, + "loss": 0.2615427732467651, + "step": 122480 + }, + { + "epoch": 0.5258751706550578, + "grad_norm": 8.087389945983887, + "learning_rate": 4.761734346300113e-05, + "loss": 0.4246623992919922, + "step": 122490 + }, + { + "epoch": 0.5259181027450779, + "grad_norm": 0.005552296061068773, + "learning_rate": 4.761303174288351e-05, + "loss": 0.1938941717147827, + "step": 122500 + }, + { + "epoch": 0.5259610348350978, + "grad_norm": 0.047699443995952606, + "learning_rate": 4.7608720022765886e-05, + "loss": 0.32389297485351565, + "step": 122510 + }, + { + "epoch": 0.5260039669251179, + "grad_norm": 0.04659678041934967, + "learning_rate": 4.760440830264826e-05, + "loss": 0.10534238815307617, + "step": 122520 + }, + { + "epoch": 0.5260468990151379, + "grad_norm": 0.01996210776269436, + "learning_rate": 4.7600096582530634e-05, + "loss": 0.2690871238708496, + "step": 122530 + }, + { + "epoch": 0.5260898311051578, + "grad_norm": 0.12138628214597702, + "learning_rate": 4.759578486241301e-05, + "loss": 0.15548104047775269, + "step": 122540 + }, + { + "epoch": 0.5261327631951779, + "grad_norm": 0.45005640387535095, + "learning_rate": 4.759147314229539e-05, + "loss": 0.2214979648590088, + "step": 122550 + }, + { + "epoch": 0.5261756952851979, + "grad_norm": 0.0013523201923817396, + "learning_rate": 4.7587161422177766e-05, + "loss": 0.3759445667266846, + "step": 122560 + }, + { + "epoch": 0.5262186273752179, + "grad_norm": 0.012963851913809776, + "learning_rate": 4.758284970206014e-05, + "loss": 0.3076311111450195, + "step": 122570 + }, + { + "epoch": 0.5262615594652379, + "grad_norm": 0.36688700318336487, + "learning_rate": 4.7578537981942514e-05, + "loss": 0.2342298746109009, + "step": 122580 + }, + { + "epoch": 0.5263044915552579, + "grad_norm": 1.1012883186340332, + "learning_rate": 4.75742262618249e-05, + "loss": 0.13014146089553832, + "step": 122590 + }, + { + "epoch": 0.5263474236452779, + "grad_norm": 1.8125981092453003, + "learning_rate": 4.7569914541707275e-05, + "loss": 0.3951392412185669, + "step": 122600 + }, + { + "epoch": 0.5263903557352979, + "grad_norm": 0.04614179953932762, + "learning_rate": 4.756560282158965e-05, + "loss": 0.1506880760192871, + "step": 122610 + }, + { + "epoch": 0.526433287825318, + "grad_norm": 0.05191076546907425, + "learning_rate": 4.756129110147202e-05, + "loss": 0.0537009060382843, + "step": 122620 + }, + { + "epoch": 0.5264762199153379, + "grad_norm": 2.3037290573120117, + "learning_rate": 4.75569793813544e-05, + "loss": 0.23060684204101561, + "step": 122630 + }, + { + "epoch": 0.5265191520053579, + "grad_norm": 0.06191762164235115, + "learning_rate": 4.755266766123678e-05, + "loss": 0.19760955572128297, + "step": 122640 + }, + { + "epoch": 0.526562084095378, + "grad_norm": 0.018758106976747513, + "learning_rate": 4.7548355941119155e-05, + "loss": 0.1421829104423523, + "step": 122650 + }, + { + "epoch": 0.5266050161853979, + "grad_norm": 3.5670557022094727, + "learning_rate": 4.7544044221001526e-05, + "loss": 0.1848344922065735, + "step": 122660 + }, + { + "epoch": 0.5266479482754179, + "grad_norm": 0.0048149810172617435, + "learning_rate": 4.75397325008839e-05, + "loss": 0.1379055380821228, + "step": 122670 + }, + { + "epoch": 0.526690880365438, + "grad_norm": 4.6595234870910645, + "learning_rate": 4.753542078076628e-05, + "loss": 0.24073870182037355, + "step": 122680 + }, + { + "epoch": 0.5267338124554579, + "grad_norm": 0.7435564994812012, + "learning_rate": 4.753110906064866e-05, + "loss": 0.2687734603881836, + "step": 122690 + }, + { + "epoch": 0.526776744545478, + "grad_norm": 0.32143405079841614, + "learning_rate": 4.7526797340531035e-05, + "loss": 0.13571771383285522, + "step": 122700 + }, + { + "epoch": 0.526819676635498, + "grad_norm": 0.2254776656627655, + "learning_rate": 4.752248562041341e-05, + "loss": 0.24260897636413575, + "step": 122710 + }, + { + "epoch": 0.5268626087255179, + "grad_norm": 0.33794358372688293, + "learning_rate": 4.751817390029579e-05, + "loss": 0.1821001172065735, + "step": 122720 + }, + { + "epoch": 0.526905540815538, + "grad_norm": 0.010684690438210964, + "learning_rate": 4.751386218017817e-05, + "loss": 0.14596030712127686, + "step": 122730 + }, + { + "epoch": 0.526948472905558, + "grad_norm": 0.005989206023514271, + "learning_rate": 4.750955046006054e-05, + "loss": 0.03930140435695648, + "step": 122740 + }, + { + "epoch": 0.526991404995578, + "grad_norm": 0.2036050409078598, + "learning_rate": 4.7505238739942915e-05, + "loss": 0.22046027183532715, + "step": 122750 + }, + { + "epoch": 0.527034337085598, + "grad_norm": 0.004267920274287462, + "learning_rate": 4.750092701982529e-05, + "loss": 0.559827184677124, + "step": 122760 + }, + { + "epoch": 0.527077269175618, + "grad_norm": 0.9968888163566589, + "learning_rate": 4.749661529970767e-05, + "loss": 0.2507550954818726, + "step": 122770 + }, + { + "epoch": 0.527120201265638, + "grad_norm": 0.015453596599400043, + "learning_rate": 4.749230357959004e-05, + "loss": 0.1077796459197998, + "step": 122780 + }, + { + "epoch": 0.527163133355658, + "grad_norm": 0.10399241745471954, + "learning_rate": 4.748799185947242e-05, + "loss": 0.21866052150726317, + "step": 122790 + }, + { + "epoch": 0.527206065445678, + "grad_norm": 0.0058853523805737495, + "learning_rate": 4.7483680139354795e-05, + "loss": 0.2769232034683228, + "step": 122800 + }, + { + "epoch": 0.527248997535698, + "grad_norm": 0.6753274202346802, + "learning_rate": 4.747936841923717e-05, + "loss": 0.07704102993011475, + "step": 122810 + }, + { + "epoch": 0.527291929625718, + "grad_norm": 0.010032746009528637, + "learning_rate": 4.747505669911955e-05, + "loss": 0.16229265928268433, + "step": 122820 + }, + { + "epoch": 0.5273348617157381, + "grad_norm": 0.12110492587089539, + "learning_rate": 4.747074497900193e-05, + "loss": 0.26483950614929197, + "step": 122830 + }, + { + "epoch": 0.527377793805758, + "grad_norm": 2.6830825805664062, + "learning_rate": 4.7466433258884304e-05, + "loss": 0.3436319351196289, + "step": 122840 + }, + { + "epoch": 0.527420725895778, + "grad_norm": 0.017832154408097267, + "learning_rate": 4.746212153876668e-05, + "loss": 0.12279645204544068, + "step": 122850 + }, + { + "epoch": 0.5274636579857981, + "grad_norm": 6.973689556121826, + "learning_rate": 4.745780981864905e-05, + "loss": 0.2290245532989502, + "step": 122860 + }, + { + "epoch": 0.527506590075818, + "grad_norm": 3.703192710876465, + "learning_rate": 4.745349809853143e-05, + "loss": 0.20900051593780516, + "step": 122870 + }, + { + "epoch": 0.5275495221658381, + "grad_norm": 1.3952735662460327, + "learning_rate": 4.744918637841381e-05, + "loss": 0.18111791610717773, + "step": 122880 + }, + { + "epoch": 0.5275924542558581, + "grad_norm": 0.038036517798900604, + "learning_rate": 4.7444874658296184e-05, + "loss": 0.11206220388412476, + "step": 122890 + }, + { + "epoch": 0.5276353863458781, + "grad_norm": 1.2479053735733032, + "learning_rate": 4.7440562938178555e-05, + "loss": 0.27330703735351564, + "step": 122900 + }, + { + "epoch": 0.5276783184358981, + "grad_norm": 0.05419410392642021, + "learning_rate": 4.743625121806093e-05, + "loss": 0.10534077882766724, + "step": 122910 + }, + { + "epoch": 0.5277212505259181, + "grad_norm": 2.2733733654022217, + "learning_rate": 4.743193949794331e-05, + "loss": 0.35091605186462405, + "step": 122920 + }, + { + "epoch": 0.5277641826159382, + "grad_norm": 0.20193079113960266, + "learning_rate": 4.742762777782569e-05, + "loss": 0.21767075061798097, + "step": 122930 + }, + { + "epoch": 0.5278071147059581, + "grad_norm": 0.024202005937695503, + "learning_rate": 4.7423316057708064e-05, + "loss": 0.08835142850875854, + "step": 122940 + }, + { + "epoch": 0.5278500467959781, + "grad_norm": 0.003215222619473934, + "learning_rate": 4.741900433759044e-05, + "loss": 0.2013381004333496, + "step": 122950 + }, + { + "epoch": 0.5278929788859982, + "grad_norm": 0.051946718245744705, + "learning_rate": 4.741469261747282e-05, + "loss": 0.4013204097747803, + "step": 122960 + }, + { + "epoch": 0.5279359109760181, + "grad_norm": 0.05380121245980263, + "learning_rate": 4.7410380897355196e-05, + "loss": 0.1067008137702942, + "step": 122970 + }, + { + "epoch": 0.5279788430660382, + "grad_norm": 1.3877129554748535, + "learning_rate": 4.7406069177237574e-05, + "loss": 0.30157127380371096, + "step": 122980 + }, + { + "epoch": 0.5280217751560582, + "grad_norm": 1.0387351512908936, + "learning_rate": 4.7401757457119944e-05, + "loss": 0.23853034973144532, + "step": 122990 + }, + { + "epoch": 0.5280647072460781, + "grad_norm": 0.2567809224128723, + "learning_rate": 4.739744573700232e-05, + "loss": 0.2381465196609497, + "step": 123000 + }, + { + "epoch": 0.5280647072460781, + "eval_loss": 0.40844210982322693, + "eval_runtime": 27.1299, + "eval_samples_per_second": 3.686, + "eval_steps_per_second": 3.686, + "step": 123000 + }, + { + "epoch": 0.5281076393360982, + "grad_norm": 0.0031749005429446697, + "learning_rate": 4.73931340168847e-05, + "loss": 0.08643372058868408, + "step": 123010 + }, + { + "epoch": 0.5281505714261182, + "grad_norm": 0.02016310580074787, + "learning_rate": 4.7388822296767076e-05, + "loss": 0.11112987995147705, + "step": 123020 + }, + { + "epoch": 0.5281935035161381, + "grad_norm": 0.9397972822189331, + "learning_rate": 4.738451057664945e-05, + "loss": 0.2053551197052002, + "step": 123030 + }, + { + "epoch": 0.5282364356061582, + "grad_norm": 0.024424588307738304, + "learning_rate": 4.7380198856531824e-05, + "loss": 0.18155556917190552, + "step": 123040 + }, + { + "epoch": 0.5282793676961782, + "grad_norm": 0.015114293433725834, + "learning_rate": 4.73758871364142e-05, + "loss": 0.10795472860336304, + "step": 123050 + }, + { + "epoch": 0.5283222997861982, + "grad_norm": 0.02613051049411297, + "learning_rate": 4.737157541629658e-05, + "loss": 0.1803101658821106, + "step": 123060 + }, + { + "epoch": 0.5283652318762182, + "grad_norm": 2.2235827445983887, + "learning_rate": 4.7367263696178956e-05, + "loss": 0.2081122636795044, + "step": 123070 + }, + { + "epoch": 0.5284081639662382, + "grad_norm": 0.23082391917705536, + "learning_rate": 4.7362951976061333e-05, + "loss": 0.16542841196060182, + "step": 123080 + }, + { + "epoch": 0.5284510960562582, + "grad_norm": 0.2546762526035309, + "learning_rate": 4.735864025594371e-05, + "loss": 0.17060707807540892, + "step": 123090 + }, + { + "epoch": 0.5284940281462782, + "grad_norm": 8.392339706420898, + "learning_rate": 4.735432853582609e-05, + "loss": 0.24863219261169434, + "step": 123100 + }, + { + "epoch": 0.5285369602362983, + "grad_norm": 0.11422450095415115, + "learning_rate": 4.735001681570846e-05, + "loss": 0.19755786657333374, + "step": 123110 + }, + { + "epoch": 0.5285798923263182, + "grad_norm": 0.0031740881968289614, + "learning_rate": 4.7345705095590836e-05, + "loss": 0.18587934970855713, + "step": 123120 + }, + { + "epoch": 0.5286228244163382, + "grad_norm": 0.0874519944190979, + "learning_rate": 4.7341393375473213e-05, + "loss": 0.21035304069519042, + "step": 123130 + }, + { + "epoch": 0.5286657565063583, + "grad_norm": 0.6707064509391785, + "learning_rate": 4.733708165535559e-05, + "loss": 0.09855753779411316, + "step": 123140 + }, + { + "epoch": 0.5287086885963782, + "grad_norm": 1.0456721782684326, + "learning_rate": 4.733276993523796e-05, + "loss": 0.34525909423828127, + "step": 123150 + }, + { + "epoch": 0.5287516206863982, + "grad_norm": 0.0036562460009008646, + "learning_rate": 4.732845821512034e-05, + "loss": 0.15217121839523315, + "step": 123160 + }, + { + "epoch": 0.5287945527764183, + "grad_norm": 0.022391438484191895, + "learning_rate": 4.7324146495002716e-05, + "loss": 0.1648841142654419, + "step": 123170 + }, + { + "epoch": 0.5288374848664382, + "grad_norm": 63.53861999511719, + "learning_rate": 4.73198347748851e-05, + "loss": 0.273171591758728, + "step": 123180 + }, + { + "epoch": 0.5288804169564583, + "grad_norm": 30.67633819580078, + "learning_rate": 4.731552305476747e-05, + "loss": 0.2636192798614502, + "step": 123190 + }, + { + "epoch": 0.5289233490464783, + "grad_norm": 2.2658443450927734, + "learning_rate": 4.731121133464985e-05, + "loss": 0.2464972734451294, + "step": 123200 + }, + { + "epoch": 0.5289662811364982, + "grad_norm": 4.664850234985352, + "learning_rate": 4.7306899614532225e-05, + "loss": 0.37572057247161866, + "step": 123210 + }, + { + "epoch": 0.5290092132265183, + "grad_norm": 1.0588277578353882, + "learning_rate": 4.73025878944146e-05, + "loss": 0.521503496170044, + "step": 123220 + }, + { + "epoch": 0.5290521453165383, + "grad_norm": 0.036139827221632004, + "learning_rate": 4.729827617429697e-05, + "loss": 0.2847903728485107, + "step": 123230 + }, + { + "epoch": 0.5290950774065583, + "grad_norm": 0.06087147817015648, + "learning_rate": 4.729396445417935e-05, + "loss": 0.21260426044464112, + "step": 123240 + }, + { + "epoch": 0.5291380094965783, + "grad_norm": 2.3397505283355713, + "learning_rate": 4.728965273406173e-05, + "loss": 0.1197127103805542, + "step": 123250 + }, + { + "epoch": 0.5291809415865983, + "grad_norm": 1.8557378053665161, + "learning_rate": 4.7285341013944105e-05, + "loss": 0.43969006538391114, + "step": 123260 + }, + { + "epoch": 0.5292238736766183, + "grad_norm": 14.346938133239746, + "learning_rate": 4.7281029293826476e-05, + "loss": 0.19784982204437257, + "step": 123270 + }, + { + "epoch": 0.5292668057666383, + "grad_norm": 0.6737160682678223, + "learning_rate": 4.727671757370885e-05, + "loss": 0.2786275863647461, + "step": 123280 + }, + { + "epoch": 0.5293097378566584, + "grad_norm": 1.9091942310333252, + "learning_rate": 4.727240585359124e-05, + "loss": 0.13269058465957642, + "step": 123290 + }, + { + "epoch": 0.5293526699466783, + "grad_norm": 0.02550147846341133, + "learning_rate": 4.7268094133473615e-05, + "loss": 0.20986504554748536, + "step": 123300 + }, + { + "epoch": 0.5293956020366983, + "grad_norm": 0.08701743930578232, + "learning_rate": 4.726378241335599e-05, + "loss": 0.2978894948959351, + "step": 123310 + }, + { + "epoch": 0.5294385341267184, + "grad_norm": 15.151698112487793, + "learning_rate": 4.725947069323836e-05, + "loss": 0.39813718795776365, + "step": 123320 + }, + { + "epoch": 0.5294814662167384, + "grad_norm": 0.26273056864738464, + "learning_rate": 4.725515897312074e-05, + "loss": 0.20274786949157714, + "step": 123330 + }, + { + "epoch": 0.5295243983067583, + "grad_norm": 1.5247877836227417, + "learning_rate": 4.725084725300312e-05, + "loss": 0.28449985980987547, + "step": 123340 + }, + { + "epoch": 0.5295673303967784, + "grad_norm": 0.25654134154319763, + "learning_rate": 4.7246535532885495e-05, + "loss": 0.27149922847747804, + "step": 123350 + }, + { + "epoch": 0.5296102624867984, + "grad_norm": 0.0787576287984848, + "learning_rate": 4.7242223812767865e-05, + "loss": 0.2487567663192749, + "step": 123360 + }, + { + "epoch": 0.5296531945768184, + "grad_norm": 0.6149705648422241, + "learning_rate": 4.723791209265024e-05, + "loss": 0.10222749710083008, + "step": 123370 + }, + { + "epoch": 0.5296961266668384, + "grad_norm": 0.02223723568022251, + "learning_rate": 4.723360037253262e-05, + "loss": 0.2543445348739624, + "step": 123380 + }, + { + "epoch": 0.5297390587568584, + "grad_norm": 0.005376000422984362, + "learning_rate": 4.7229288652415e-05, + "loss": 0.34765603542327883, + "step": 123390 + }, + { + "epoch": 0.5297819908468784, + "grad_norm": 1.3201158046722412, + "learning_rate": 4.7224976932297375e-05, + "loss": 0.13802237510681153, + "step": 123400 + }, + { + "epoch": 0.5298249229368984, + "grad_norm": 0.019846005365252495, + "learning_rate": 4.722066521217975e-05, + "loss": 0.09360415935516357, + "step": 123410 + }, + { + "epoch": 0.5298678550269185, + "grad_norm": 0.03975502401590347, + "learning_rate": 4.721635349206213e-05, + "loss": 0.17938061952590942, + "step": 123420 + }, + { + "epoch": 0.5299107871169384, + "grad_norm": 0.024910472333431244, + "learning_rate": 4.7212041771944507e-05, + "loss": 0.30192625522613525, + "step": 123430 + }, + { + "epoch": 0.5299537192069584, + "grad_norm": 0.2583577334880829, + "learning_rate": 4.720773005182688e-05, + "loss": 0.20642051696777344, + "step": 123440 + }, + { + "epoch": 0.5299966512969785, + "grad_norm": 10.739752769470215, + "learning_rate": 4.7203418331709254e-05, + "loss": 0.35497708320617677, + "step": 123450 + }, + { + "epoch": 0.5300395833869984, + "grad_norm": 19.12002182006836, + "learning_rate": 4.719910661159163e-05, + "loss": 0.41153979301452637, + "step": 123460 + }, + { + "epoch": 0.5300825154770185, + "grad_norm": 0.04216668754816055, + "learning_rate": 4.719479489147401e-05, + "loss": 0.2261986255645752, + "step": 123470 + }, + { + "epoch": 0.5301254475670385, + "grad_norm": 0.1999616026878357, + "learning_rate": 4.719048317135638e-05, + "loss": 0.23764095306396485, + "step": 123480 + }, + { + "epoch": 0.5301683796570584, + "grad_norm": 4.300946235656738, + "learning_rate": 4.718617145123876e-05, + "loss": 0.17631118297576903, + "step": 123490 + }, + { + "epoch": 0.5302113117470785, + "grad_norm": 2.1016876697540283, + "learning_rate": 4.7181859731121134e-05, + "loss": 0.366302490234375, + "step": 123500 + }, + { + "epoch": 0.5302542438370985, + "grad_norm": 0.1305193454027176, + "learning_rate": 4.717754801100351e-05, + "loss": 0.1007119059562683, + "step": 123510 + }, + { + "epoch": 0.5302971759271184, + "grad_norm": 0.10050223022699356, + "learning_rate": 4.717323629088589e-05, + "loss": 0.13999812602996825, + "step": 123520 + }, + { + "epoch": 0.5303401080171385, + "grad_norm": 0.003933720290660858, + "learning_rate": 4.7168924570768266e-05, + "loss": 0.05451310276985168, + "step": 123530 + }, + { + "epoch": 0.5303830401071585, + "grad_norm": 1.8059325218200684, + "learning_rate": 4.7164612850650644e-05, + "loss": 0.23415093421936034, + "step": 123540 + }, + { + "epoch": 0.5304259721971785, + "grad_norm": 0.011281573213636875, + "learning_rate": 4.716030113053302e-05, + "loss": 0.15493345260620117, + "step": 123550 + }, + { + "epoch": 0.5304689042871985, + "grad_norm": 0.015826251357793808, + "learning_rate": 4.715598941041539e-05, + "loss": 0.0832473635673523, + "step": 123560 + }, + { + "epoch": 0.5305118363772185, + "grad_norm": 0.03475892171263695, + "learning_rate": 4.715167769029777e-05, + "loss": 0.19375609159469603, + "step": 123570 + }, + { + "epoch": 0.5305547684672385, + "grad_norm": 0.022834021598100662, + "learning_rate": 4.7147365970180146e-05, + "loss": 0.47736759185791017, + "step": 123580 + }, + { + "epoch": 0.5305977005572585, + "grad_norm": 14.527593612670898, + "learning_rate": 4.7143054250062524e-05, + "loss": 0.11089812517166138, + "step": 123590 + }, + { + "epoch": 0.5306406326472786, + "grad_norm": 0.8763924837112427, + "learning_rate": 4.7138742529944894e-05, + "loss": 0.2431772232055664, + "step": 123600 + }, + { + "epoch": 0.5306835647372985, + "grad_norm": 0.01501018088310957, + "learning_rate": 4.713443080982727e-05, + "loss": 0.1265444278717041, + "step": 123610 + }, + { + "epoch": 0.5307264968273185, + "grad_norm": 0.02495141699910164, + "learning_rate": 4.713011908970965e-05, + "loss": 0.11369569301605224, + "step": 123620 + }, + { + "epoch": 0.5307694289173386, + "grad_norm": 0.03437520191073418, + "learning_rate": 4.7125807369592026e-05, + "loss": 0.3247214317321777, + "step": 123630 + }, + { + "epoch": 0.5308123610073585, + "grad_norm": 0.08870543539524078, + "learning_rate": 4.7121495649474404e-05, + "loss": 0.1731701374053955, + "step": 123640 + }, + { + "epoch": 0.5308552930973786, + "grad_norm": 3.2430336475372314, + "learning_rate": 4.711718392935678e-05, + "loss": 0.25606422424316405, + "step": 123650 + }, + { + "epoch": 0.5308982251873986, + "grad_norm": 23.563745498657227, + "learning_rate": 4.711287220923916e-05, + "loss": 0.16244746446609498, + "step": 123660 + }, + { + "epoch": 0.5309411572774185, + "grad_norm": 3.8496384620666504, + "learning_rate": 4.7108560489121536e-05, + "loss": 0.22676796913146974, + "step": 123670 + }, + { + "epoch": 0.5309840893674386, + "grad_norm": 0.01404566876590252, + "learning_rate": 4.710424876900391e-05, + "loss": 0.08764925003051757, + "step": 123680 + }, + { + "epoch": 0.5310270214574586, + "grad_norm": 0.02645733766257763, + "learning_rate": 4.7099937048886284e-05, + "loss": 0.24724922180175782, + "step": 123690 + }, + { + "epoch": 0.5310699535474785, + "grad_norm": 0.026252178475260735, + "learning_rate": 4.709562532876866e-05, + "loss": 0.055978184938430785, + "step": 123700 + }, + { + "epoch": 0.5311128856374986, + "grad_norm": 3.005872964859009, + "learning_rate": 4.709131360865104e-05, + "loss": 0.27054011821746826, + "step": 123710 + }, + { + "epoch": 0.5311558177275186, + "grad_norm": 0.014248663559556007, + "learning_rate": 4.7087001888533416e-05, + "loss": 0.20635383129119872, + "step": 123720 + }, + { + "epoch": 0.5311987498175386, + "grad_norm": 0.1993233859539032, + "learning_rate": 4.7082690168415786e-05, + "loss": 0.03018401861190796, + "step": 123730 + }, + { + "epoch": 0.5312416819075586, + "grad_norm": 0.053939998149871826, + "learning_rate": 4.7078378448298164e-05, + "loss": 0.15145121812820433, + "step": 123740 + }, + { + "epoch": 0.5312846139975786, + "grad_norm": 0.07720816135406494, + "learning_rate": 4.707406672818054e-05, + "loss": 0.24113116264343262, + "step": 123750 + }, + { + "epoch": 0.5313275460875987, + "grad_norm": 0.032163117080926895, + "learning_rate": 4.706975500806292e-05, + "loss": 0.08998279571533203, + "step": 123760 + }, + { + "epoch": 0.5313704781776186, + "grad_norm": 0.05457861348986626, + "learning_rate": 4.7065443287945296e-05, + "loss": 0.18543795347213746, + "step": 123770 + }, + { + "epoch": 0.5314134102676387, + "grad_norm": 0.015958484262228012, + "learning_rate": 4.706113156782767e-05, + "loss": 0.14293922185897828, + "step": 123780 + }, + { + "epoch": 0.5314563423576587, + "grad_norm": 0.0022434417624026537, + "learning_rate": 4.705681984771005e-05, + "loss": 0.3324514150619507, + "step": 123790 + }, + { + "epoch": 0.5314992744476786, + "grad_norm": 0.02598206140100956, + "learning_rate": 4.705250812759243e-05, + "loss": 0.3836965560913086, + "step": 123800 + }, + { + "epoch": 0.5315422065376987, + "grad_norm": 0.031823523342609406, + "learning_rate": 4.70481964074748e-05, + "loss": 0.37702322006225586, + "step": 123810 + }, + { + "epoch": 0.5315851386277187, + "grad_norm": 2.530062675476074, + "learning_rate": 4.7043884687357176e-05, + "loss": 0.2851561546325684, + "step": 123820 + }, + { + "epoch": 0.5316280707177387, + "grad_norm": 0.5450431108474731, + "learning_rate": 4.703957296723955e-05, + "loss": 0.2075505018234253, + "step": 123830 + }, + { + "epoch": 0.5316710028077587, + "grad_norm": 2.2344067096710205, + "learning_rate": 4.703526124712193e-05, + "loss": 0.3167113780975342, + "step": 123840 + }, + { + "epoch": 0.5317139348977787, + "grad_norm": 0.0959930494427681, + "learning_rate": 4.70309495270043e-05, + "loss": 0.09773850440979004, + "step": 123850 + }, + { + "epoch": 0.5317568669877987, + "grad_norm": 0.0059434291906654835, + "learning_rate": 4.702663780688668e-05, + "loss": 0.09885787367820739, + "step": 123860 + }, + { + "epoch": 0.5317997990778187, + "grad_norm": 3.9031686782836914, + "learning_rate": 4.7022326086769055e-05, + "loss": 0.34107317924499514, + "step": 123870 + }, + { + "epoch": 0.5318427311678388, + "grad_norm": 0.011664203368127346, + "learning_rate": 4.701801436665143e-05, + "loss": 0.20996668338775634, + "step": 123880 + }, + { + "epoch": 0.5318856632578587, + "grad_norm": 4.821422576904297, + "learning_rate": 4.701370264653381e-05, + "loss": 0.1527780294418335, + "step": 123890 + }, + { + "epoch": 0.5319285953478787, + "grad_norm": 0.02439550682902336, + "learning_rate": 4.700939092641619e-05, + "loss": 0.10840038061141968, + "step": 123900 + }, + { + "epoch": 0.5319715274378988, + "grad_norm": 2.292161226272583, + "learning_rate": 4.7005079206298565e-05, + "loss": 0.32342574596405027, + "step": 123910 + }, + { + "epoch": 0.5320144595279187, + "grad_norm": 0.783797562122345, + "learning_rate": 4.700076748618094e-05, + "loss": 0.11824526786804199, + "step": 123920 + }, + { + "epoch": 0.5320573916179387, + "grad_norm": 0.008362102322280407, + "learning_rate": 4.699645576606331e-05, + "loss": 0.19398202896118164, + "step": 123930 + }, + { + "epoch": 0.5321003237079588, + "grad_norm": 0.007039368152618408, + "learning_rate": 4.699214404594569e-05, + "loss": 0.2674627065658569, + "step": 123940 + }, + { + "epoch": 0.5321432557979787, + "grad_norm": 1.1618794202804565, + "learning_rate": 4.698783232582807e-05, + "loss": 0.17610886096954345, + "step": 123950 + }, + { + "epoch": 0.5321861878879988, + "grad_norm": 0.01650865189731121, + "learning_rate": 4.6983520605710445e-05, + "loss": 0.09232497215270996, + "step": 123960 + }, + { + "epoch": 0.5322291199780188, + "grad_norm": 2.4566853046417236, + "learning_rate": 4.6979208885592815e-05, + "loss": 0.13579462766647338, + "step": 123970 + }, + { + "epoch": 0.5322720520680387, + "grad_norm": 1.1726164817810059, + "learning_rate": 4.697489716547519e-05, + "loss": 0.31279146671295166, + "step": 123980 + }, + { + "epoch": 0.5323149841580588, + "grad_norm": 0.13944868743419647, + "learning_rate": 4.697058544535757e-05, + "loss": 0.38595051765441896, + "step": 123990 + }, + { + "epoch": 0.5323579162480788, + "grad_norm": 0.43487343192100525, + "learning_rate": 4.6966273725239954e-05, + "loss": 0.2199528455734253, + "step": 124000 + }, + { + "epoch": 0.5323579162480788, + "eval_loss": 0.4122171103954315, + "eval_runtime": 27.083, + "eval_samples_per_second": 3.692, + "eval_steps_per_second": 3.692, + "step": 124000 + }, + { + "epoch": 0.5324008483380988, + "grad_norm": 3.2876408100128174, + "learning_rate": 4.6961962005122325e-05, + "loss": 0.2153330087661743, + "step": 124010 + }, + { + "epoch": 0.5324437804281188, + "grad_norm": 0.03849121183156967, + "learning_rate": 4.69576502850047e-05, + "loss": 0.33109848499298095, + "step": 124020 + }, + { + "epoch": 0.5324867125181388, + "grad_norm": 1.7557862997055054, + "learning_rate": 4.695333856488708e-05, + "loss": 0.17541335821151732, + "step": 124030 + }, + { + "epoch": 0.5325296446081588, + "grad_norm": 1.1375805139541626, + "learning_rate": 4.694902684476946e-05, + "loss": 0.1093302845954895, + "step": 124040 + }, + { + "epoch": 0.5325725766981788, + "grad_norm": 0.16217398643493652, + "learning_rate": 4.6944715124651834e-05, + "loss": 0.18123213052749634, + "step": 124050 + }, + { + "epoch": 0.5326155087881989, + "grad_norm": 2.6146907806396484, + "learning_rate": 4.6940403404534205e-05, + "loss": 0.1957655668258667, + "step": 124060 + }, + { + "epoch": 0.5326584408782188, + "grad_norm": 0.575221598148346, + "learning_rate": 4.693609168441658e-05, + "loss": 0.1171625018119812, + "step": 124070 + }, + { + "epoch": 0.5327013729682388, + "grad_norm": 1.2705405950546265, + "learning_rate": 4.693177996429896e-05, + "loss": 0.12601308822631835, + "step": 124080 + }, + { + "epoch": 0.5327443050582589, + "grad_norm": 0.022567199543118477, + "learning_rate": 4.692746824418134e-05, + "loss": 0.2454216480255127, + "step": 124090 + }, + { + "epoch": 0.5327872371482788, + "grad_norm": 0.0011852540774270892, + "learning_rate": 4.692315652406371e-05, + "loss": 0.17979726791381836, + "step": 124100 + }, + { + "epoch": 0.5328301692382988, + "grad_norm": 0.01909797079861164, + "learning_rate": 4.691884480394609e-05, + "loss": 0.42721829414367674, + "step": 124110 + }, + { + "epoch": 0.5328731013283189, + "grad_norm": 0.006946041248738766, + "learning_rate": 4.691453308382847e-05, + "loss": 0.06694617271423339, + "step": 124120 + }, + { + "epoch": 0.5329160334183388, + "grad_norm": 5.16895866394043, + "learning_rate": 4.6910221363710846e-05, + "loss": 0.49288372993469237, + "step": 124130 + }, + { + "epoch": 0.5329589655083589, + "grad_norm": 1.4857653379440308, + "learning_rate": 4.690590964359322e-05, + "loss": 0.20648424625396727, + "step": 124140 + }, + { + "epoch": 0.5330018975983789, + "grad_norm": 0.9698376655578613, + "learning_rate": 4.6901597923475594e-05, + "loss": 0.26796138286590576, + "step": 124150 + }, + { + "epoch": 0.5330448296883988, + "grad_norm": 2.552114486694336, + "learning_rate": 4.689728620335797e-05, + "loss": 0.5469475746154785, + "step": 124160 + }, + { + "epoch": 0.5330877617784189, + "grad_norm": 3.912579298019409, + "learning_rate": 4.689297448324035e-05, + "loss": 0.2692500352859497, + "step": 124170 + }, + { + "epoch": 0.5331306938684389, + "grad_norm": 0.17444412410259247, + "learning_rate": 4.688866276312272e-05, + "loss": 0.23923828601837158, + "step": 124180 + }, + { + "epoch": 0.533173625958459, + "grad_norm": 1.8246395587921143, + "learning_rate": 4.6884351043005097e-05, + "loss": 0.2045379400253296, + "step": 124190 + }, + { + "epoch": 0.5332165580484789, + "grad_norm": 0.006754355505108833, + "learning_rate": 4.6880039322887474e-05, + "loss": 0.334868335723877, + "step": 124200 + }, + { + "epoch": 0.5332594901384989, + "grad_norm": 0.019909320399165154, + "learning_rate": 4.687572760276985e-05, + "loss": 0.32880401611328125, + "step": 124210 + }, + { + "epoch": 0.533302422228519, + "grad_norm": 0.026598718017339706, + "learning_rate": 4.687141588265223e-05, + "loss": 0.04149231910705566, + "step": 124220 + }, + { + "epoch": 0.5333453543185389, + "grad_norm": 0.2718566358089447, + "learning_rate": 4.6867104162534606e-05, + "loss": 0.13211541175842284, + "step": 124230 + }, + { + "epoch": 0.533388286408559, + "grad_norm": 0.05366518348455429, + "learning_rate": 4.686279244241698e-05, + "loss": 0.15838115215301513, + "step": 124240 + }, + { + "epoch": 0.533431218498579, + "grad_norm": 0.02057690918445587, + "learning_rate": 4.685848072229936e-05, + "loss": 0.19188928604125977, + "step": 124250 + }, + { + "epoch": 0.5334741505885989, + "grad_norm": 2.503087282180786, + "learning_rate": 4.685416900218173e-05, + "loss": 0.25132534503936765, + "step": 124260 + }, + { + "epoch": 0.533517082678619, + "grad_norm": 0.10337914526462555, + "learning_rate": 4.684985728206411e-05, + "loss": 0.3497090101242065, + "step": 124270 + }, + { + "epoch": 0.533560014768639, + "grad_norm": 5.80239200592041, + "learning_rate": 4.6845545561946486e-05, + "loss": 0.4027796745300293, + "step": 124280 + }, + { + "epoch": 0.5336029468586589, + "grad_norm": 0.9824345111846924, + "learning_rate": 4.684123384182886e-05, + "loss": 0.511225700378418, + "step": 124290 + }, + { + "epoch": 0.533645878948679, + "grad_norm": 0.02229215018451214, + "learning_rate": 4.6836922121711234e-05, + "loss": 0.05699042677879333, + "step": 124300 + }, + { + "epoch": 0.533688811038699, + "grad_norm": 0.01698029786348343, + "learning_rate": 4.683261040159361e-05, + "loss": 0.32143568992614746, + "step": 124310 + }, + { + "epoch": 0.533731743128719, + "grad_norm": 2.416086435317993, + "learning_rate": 4.682829868147599e-05, + "loss": 0.333212685585022, + "step": 124320 + }, + { + "epoch": 0.533774675218739, + "grad_norm": 0.6163731813430786, + "learning_rate": 4.6823986961358366e-05, + "loss": 0.396596884727478, + "step": 124330 + }, + { + "epoch": 0.533817607308759, + "grad_norm": 0.07051825523376465, + "learning_rate": 4.681967524124074e-05, + "loss": 0.27659101486206056, + "step": 124340 + }, + { + "epoch": 0.533860539398779, + "grad_norm": 0.10191439092159271, + "learning_rate": 4.681536352112312e-05, + "loss": 0.30634074211120604, + "step": 124350 + }, + { + "epoch": 0.533903471488799, + "grad_norm": 2.1454920768737793, + "learning_rate": 4.68110518010055e-05, + "loss": 0.169644033908844, + "step": 124360 + }, + { + "epoch": 0.5339464035788191, + "grad_norm": 0.3064160645008087, + "learning_rate": 4.6806740080887875e-05, + "loss": 0.19605166912078859, + "step": 124370 + }, + { + "epoch": 0.533989335668839, + "grad_norm": 0.005128131248056889, + "learning_rate": 4.6802428360770246e-05, + "loss": 0.16128937005996705, + "step": 124380 + }, + { + "epoch": 0.534032267758859, + "grad_norm": 3.605325222015381, + "learning_rate": 4.679811664065262e-05, + "loss": 0.15976146459579468, + "step": 124390 + }, + { + "epoch": 0.5340751998488791, + "grad_norm": 0.13232477009296417, + "learning_rate": 4.6793804920535e-05, + "loss": 0.1932427167892456, + "step": 124400 + }, + { + "epoch": 0.534118131938899, + "grad_norm": 0.008470027707517147, + "learning_rate": 4.678949320041738e-05, + "loss": 0.20527310371398927, + "step": 124410 + }, + { + "epoch": 0.534161064028919, + "grad_norm": 0.7935301661491394, + "learning_rate": 4.6785181480299755e-05, + "loss": 0.25196199417114257, + "step": 124420 + }, + { + "epoch": 0.5342039961189391, + "grad_norm": 0.017037170007824898, + "learning_rate": 4.6780869760182126e-05, + "loss": 0.20903384685516357, + "step": 124430 + }, + { + "epoch": 0.534246928208959, + "grad_norm": 5.581609725952148, + "learning_rate": 4.67765580400645e-05, + "loss": 0.06474516391754151, + "step": 124440 + }, + { + "epoch": 0.5342898602989791, + "grad_norm": 0.0051388125866651535, + "learning_rate": 4.677224631994688e-05, + "loss": 0.05504382252693176, + "step": 124450 + }, + { + "epoch": 0.5343327923889991, + "grad_norm": 4.44582986831665, + "learning_rate": 4.676793459982926e-05, + "loss": 0.3161288261413574, + "step": 124460 + }, + { + "epoch": 0.534375724479019, + "grad_norm": 0.9643357992172241, + "learning_rate": 4.6763622879711635e-05, + "loss": 0.27371511459350584, + "step": 124470 + }, + { + "epoch": 0.5344186565690391, + "grad_norm": 1.267791509628296, + "learning_rate": 4.675931115959401e-05, + "loss": 0.10485801696777344, + "step": 124480 + }, + { + "epoch": 0.5344615886590591, + "grad_norm": 0.0774456262588501, + "learning_rate": 4.675499943947639e-05, + "loss": 0.23535749912261963, + "step": 124490 + }, + { + "epoch": 0.5345045207490791, + "grad_norm": 0.023274218663573265, + "learning_rate": 4.675068771935877e-05, + "loss": 0.40593762397766114, + "step": 124500 + }, + { + "epoch": 0.5345474528390991, + "grad_norm": 0.30989450216293335, + "learning_rate": 4.674637599924114e-05, + "loss": 0.07787411212921143, + "step": 124510 + }, + { + "epoch": 0.5345903849291191, + "grad_norm": 0.14625364542007446, + "learning_rate": 4.6742064279123515e-05, + "loss": 0.13913246393203735, + "step": 124520 + }, + { + "epoch": 0.5346333170191391, + "grad_norm": 0.0016882647760212421, + "learning_rate": 4.673775255900589e-05, + "loss": 0.24402050971984862, + "step": 124530 + }, + { + "epoch": 0.5346762491091591, + "grad_norm": 0.2706347703933716, + "learning_rate": 4.673344083888827e-05, + "loss": 0.1284398317337036, + "step": 124540 + }, + { + "epoch": 0.5347191811991792, + "grad_norm": 14.831525802612305, + "learning_rate": 4.672912911877064e-05, + "loss": 0.3947265148162842, + "step": 124550 + }, + { + "epoch": 0.5347621132891991, + "grad_norm": 0.07119249552488327, + "learning_rate": 4.672481739865302e-05, + "loss": 0.1879923701286316, + "step": 124560 + }, + { + "epoch": 0.5348050453792191, + "grad_norm": 8.25128173828125, + "learning_rate": 4.6720505678535395e-05, + "loss": 0.16921440362930298, + "step": 124570 + }, + { + "epoch": 0.5348479774692392, + "grad_norm": 0.17425376176834106, + "learning_rate": 4.671619395841777e-05, + "loss": 0.20272107124328614, + "step": 124580 + }, + { + "epoch": 0.5348909095592591, + "grad_norm": 0.05234595015645027, + "learning_rate": 4.671188223830015e-05, + "loss": 0.3504507064819336, + "step": 124590 + }, + { + "epoch": 0.5349338416492792, + "grad_norm": 0.008333982899785042, + "learning_rate": 4.670757051818253e-05, + "loss": 0.11752091646194458, + "step": 124600 + }, + { + "epoch": 0.5349767737392992, + "grad_norm": 0.008150231093168259, + "learning_rate": 4.6703258798064904e-05, + "loss": 0.27742657661437986, + "step": 124610 + }, + { + "epoch": 0.5350197058293192, + "grad_norm": 0.011584865860641003, + "learning_rate": 4.669894707794728e-05, + "loss": 0.1437814712524414, + "step": 124620 + }, + { + "epoch": 0.5350626379193392, + "grad_norm": 0.4360119104385376, + "learning_rate": 4.669463535782965e-05, + "loss": 0.2068427562713623, + "step": 124630 + }, + { + "epoch": 0.5351055700093592, + "grad_norm": 1.7382227182388306, + "learning_rate": 4.669032363771203e-05, + "loss": 0.20213468074798585, + "step": 124640 + }, + { + "epoch": 0.5351485020993793, + "grad_norm": 0.0036807823926210403, + "learning_rate": 4.668601191759441e-05, + "loss": 0.2461169481277466, + "step": 124650 + }, + { + "epoch": 0.5351914341893992, + "grad_norm": 0.02192399837076664, + "learning_rate": 4.6681700197476784e-05, + "loss": 0.33381974697113037, + "step": 124660 + }, + { + "epoch": 0.5352343662794192, + "grad_norm": 0.022471271455287933, + "learning_rate": 4.6677388477359155e-05, + "loss": 0.08194748163223267, + "step": 124670 + }, + { + "epoch": 0.5352772983694393, + "grad_norm": 1.029158115386963, + "learning_rate": 4.667307675724153e-05, + "loss": 0.2929563045501709, + "step": 124680 + }, + { + "epoch": 0.5353202304594592, + "grad_norm": 0.5128310918807983, + "learning_rate": 4.666876503712391e-05, + "loss": 0.37208659648895265, + "step": 124690 + }, + { + "epoch": 0.5353631625494792, + "grad_norm": 0.00456015020608902, + "learning_rate": 4.6664453317006294e-05, + "loss": 0.013756263256072997, + "step": 124700 + }, + { + "epoch": 0.5354060946394993, + "grad_norm": 2.446556568145752, + "learning_rate": 4.6660141596888664e-05, + "loss": 0.13984161615371704, + "step": 124710 + }, + { + "epoch": 0.5354490267295192, + "grad_norm": 0.08037838339805603, + "learning_rate": 4.665582987677104e-05, + "loss": 0.17831218242645264, + "step": 124720 + }, + { + "epoch": 0.5354919588195393, + "grad_norm": 0.05052134767174721, + "learning_rate": 4.665151815665342e-05, + "loss": 0.11403819322586059, + "step": 124730 + }, + { + "epoch": 0.5355348909095593, + "grad_norm": 5.381241798400879, + "learning_rate": 4.6647206436535796e-05, + "loss": 0.2679091691970825, + "step": 124740 + }, + { + "epoch": 0.5355778229995792, + "grad_norm": 4.498979091644287, + "learning_rate": 4.664289471641817e-05, + "loss": 0.07987023591995239, + "step": 124750 + }, + { + "epoch": 0.5356207550895993, + "grad_norm": 0.004411382135003805, + "learning_rate": 4.6638582996300544e-05, + "loss": 0.07055896520614624, + "step": 124760 + }, + { + "epoch": 0.5356636871796193, + "grad_norm": 0.209492027759552, + "learning_rate": 4.663427127618292e-05, + "loss": 0.11898517608642578, + "step": 124770 + }, + { + "epoch": 0.5357066192696393, + "grad_norm": 0.750752866268158, + "learning_rate": 4.66299595560653e-05, + "loss": 0.12291603088378907, + "step": 124780 + }, + { + "epoch": 0.5357495513596593, + "grad_norm": 0.0013108792481943965, + "learning_rate": 4.6625647835947676e-05, + "loss": 0.10511652231216431, + "step": 124790 + }, + { + "epoch": 0.5357924834496793, + "grad_norm": 0.011161865666508675, + "learning_rate": 4.662133611583005e-05, + "loss": 0.2695204734802246, + "step": 124800 + }, + { + "epoch": 0.5358354155396993, + "grad_norm": 0.7787624001502991, + "learning_rate": 4.661702439571243e-05, + "loss": 0.16277107000350952, + "step": 124810 + }, + { + "epoch": 0.5358783476297193, + "grad_norm": 0.3624718487262726, + "learning_rate": 4.661271267559481e-05, + "loss": 0.33316256999969485, + "step": 124820 + }, + { + "epoch": 0.5359212797197394, + "grad_norm": 0.002750060521066189, + "learning_rate": 4.6608400955477186e-05, + "loss": 0.055188989639282225, + "step": 124830 + }, + { + "epoch": 0.5359642118097593, + "grad_norm": 0.0005346073885448277, + "learning_rate": 4.6604089235359556e-05, + "loss": 0.10296810865402221, + "step": 124840 + }, + { + "epoch": 0.5360071438997793, + "grad_norm": 2.2687649726867676, + "learning_rate": 4.6599777515241933e-05, + "loss": 0.4165465831756592, + "step": 124850 + }, + { + "epoch": 0.5360500759897994, + "grad_norm": 0.014060785993933678, + "learning_rate": 4.659546579512431e-05, + "loss": 0.23372213840484618, + "step": 124860 + }, + { + "epoch": 0.5360930080798193, + "grad_norm": 1.1599324941635132, + "learning_rate": 4.659115407500669e-05, + "loss": 0.23743939399719238, + "step": 124870 + }, + { + "epoch": 0.5361359401698393, + "grad_norm": 0.017570368945598602, + "learning_rate": 4.658684235488906e-05, + "loss": 0.1744380474090576, + "step": 124880 + }, + { + "epoch": 0.5361788722598594, + "grad_norm": 4.546342372894287, + "learning_rate": 4.6582530634771436e-05, + "loss": 0.3124069690704346, + "step": 124890 + }, + { + "epoch": 0.5362218043498793, + "grad_norm": 6.73213529586792, + "learning_rate": 4.657821891465381e-05, + "loss": 0.17486449480056762, + "step": 124900 + }, + { + "epoch": 0.5362647364398994, + "grad_norm": 1.787294864654541, + "learning_rate": 4.657390719453619e-05, + "loss": 0.44237399101257324, + "step": 124910 + }, + { + "epoch": 0.5363076685299194, + "grad_norm": 0.009232389740645885, + "learning_rate": 4.656959547441857e-05, + "loss": 0.389838719367981, + "step": 124920 + }, + { + "epoch": 0.5363506006199393, + "grad_norm": 0.020139772444963455, + "learning_rate": 4.6565283754300945e-05, + "loss": 0.3448354482650757, + "step": 124930 + }, + { + "epoch": 0.5363935327099594, + "grad_norm": 1.5293805599212646, + "learning_rate": 4.656097203418332e-05, + "loss": 0.3066279888153076, + "step": 124940 + }, + { + "epoch": 0.5364364647999794, + "grad_norm": 0.040268316864967346, + "learning_rate": 4.65566603140657e-05, + "loss": 0.16958212852478027, + "step": 124950 + }, + { + "epoch": 0.5364793968899993, + "grad_norm": 0.10245020687580109, + "learning_rate": 4.655234859394807e-05, + "loss": 0.12522271871566773, + "step": 124960 + }, + { + "epoch": 0.5365223289800194, + "grad_norm": 0.008734374307096004, + "learning_rate": 4.654803687383045e-05, + "loss": 0.051020973920822145, + "step": 124970 + }, + { + "epoch": 0.5365652610700394, + "grad_norm": 2.040437698364258, + "learning_rate": 4.6543725153712825e-05, + "loss": 0.4698540210723877, + "step": 124980 + }, + { + "epoch": 0.5366081931600594, + "grad_norm": 0.544279932975769, + "learning_rate": 4.65394134335952e-05, + "loss": 0.1698075771331787, + "step": 124990 + }, + { + "epoch": 0.5366511252500794, + "grad_norm": 0.049962181597948074, + "learning_rate": 4.653510171347757e-05, + "loss": 0.1048028826713562, + "step": 125000 + }, + { + "epoch": 0.5366511252500794, + "eval_loss": 0.41585448384284973, + "eval_runtime": 27.1534, + "eval_samples_per_second": 3.683, + "eval_steps_per_second": 3.683, + "step": 125000 + }, + { + "epoch": 0.5366940573400995, + "grad_norm": 4.228307723999023, + "learning_rate": 4.653078999335995e-05, + "loss": 0.3834389209747314, + "step": 125010 + }, + { + "epoch": 0.5367369894301194, + "grad_norm": 0.5724061727523804, + "learning_rate": 4.652647827324233e-05, + "loss": 0.28278398513793945, + "step": 125020 + }, + { + "epoch": 0.5367799215201394, + "grad_norm": 1.0082062482833862, + "learning_rate": 4.6522166553124705e-05, + "loss": 0.21530394554138182, + "step": 125030 + }, + { + "epoch": 0.5368228536101595, + "grad_norm": 0.10343755036592484, + "learning_rate": 4.651785483300708e-05, + "loss": 0.3212897777557373, + "step": 125040 + }, + { + "epoch": 0.5368657857001795, + "grad_norm": 0.04396173357963562, + "learning_rate": 4.651354311288946e-05, + "loss": 0.28393266201019285, + "step": 125050 + }, + { + "epoch": 0.5369087177901994, + "grad_norm": 0.003487182315438986, + "learning_rate": 4.650923139277184e-05, + "loss": 0.09013047218322753, + "step": 125060 + }, + { + "epoch": 0.5369516498802195, + "grad_norm": 2.7585346698760986, + "learning_rate": 4.6504919672654215e-05, + "loss": 0.09315488934516906, + "step": 125070 + }, + { + "epoch": 0.5369945819702395, + "grad_norm": 5.236676216125488, + "learning_rate": 4.6500607952536585e-05, + "loss": 0.18706350326538085, + "step": 125080 + }, + { + "epoch": 0.5370375140602595, + "grad_norm": 0.20571869611740112, + "learning_rate": 4.649629623241896e-05, + "loss": 0.09702978134155274, + "step": 125090 + }, + { + "epoch": 0.5370804461502795, + "grad_norm": 0.003286497900262475, + "learning_rate": 4.649198451230134e-05, + "loss": 0.12894362211227417, + "step": 125100 + }, + { + "epoch": 0.5371233782402995, + "grad_norm": 1.3346261978149414, + "learning_rate": 4.648767279218372e-05, + "loss": 0.4185777187347412, + "step": 125110 + }, + { + "epoch": 0.5371663103303195, + "grad_norm": 8.176697731018066, + "learning_rate": 4.6483361072066095e-05, + "loss": 0.2767629146575928, + "step": 125120 + }, + { + "epoch": 0.5372092424203395, + "grad_norm": 1.114349365234375, + "learning_rate": 4.6479049351948465e-05, + "loss": 0.20764529705047607, + "step": 125130 + }, + { + "epoch": 0.5372521745103596, + "grad_norm": 1.419968605041504, + "learning_rate": 4.647473763183084e-05, + "loss": 0.16337499618530274, + "step": 125140 + }, + { + "epoch": 0.5372951066003795, + "grad_norm": 0.01131153292953968, + "learning_rate": 4.647042591171322e-05, + "loss": 0.2339179515838623, + "step": 125150 + }, + { + "epoch": 0.5373380386903995, + "grad_norm": 0.006680184509605169, + "learning_rate": 4.64661141915956e-05, + "loss": 0.20181403160095215, + "step": 125160 + }, + { + "epoch": 0.5373809707804196, + "grad_norm": 0.006397879216820002, + "learning_rate": 4.6461802471477975e-05, + "loss": 0.26559085845947267, + "step": 125170 + }, + { + "epoch": 0.5374239028704395, + "grad_norm": 0.08209287375211716, + "learning_rate": 4.645749075136035e-05, + "loss": 0.11008179187774658, + "step": 125180 + }, + { + "epoch": 0.5374668349604595, + "grad_norm": 0.004642953164875507, + "learning_rate": 4.645317903124273e-05, + "loss": 0.08794822096824646, + "step": 125190 + }, + { + "epoch": 0.5375097670504796, + "grad_norm": 0.34100475907325745, + "learning_rate": 4.6448867311125107e-05, + "loss": 0.18877944946289063, + "step": 125200 + }, + { + "epoch": 0.5375526991404995, + "grad_norm": 3.0378482341766357, + "learning_rate": 4.644455559100748e-05, + "loss": 0.34736883640289307, + "step": 125210 + }, + { + "epoch": 0.5375956312305196, + "grad_norm": 0.012231471948325634, + "learning_rate": 4.6440243870889854e-05, + "loss": 0.13005506992340088, + "step": 125220 + }, + { + "epoch": 0.5376385633205396, + "grad_norm": 1.5531368255615234, + "learning_rate": 4.643593215077223e-05, + "loss": 0.28213071823120117, + "step": 125230 + }, + { + "epoch": 0.5376814954105595, + "grad_norm": 0.022608119994401932, + "learning_rate": 4.643162043065461e-05, + "loss": 0.09758681654930115, + "step": 125240 + }, + { + "epoch": 0.5377244275005796, + "grad_norm": 0.006008662283420563, + "learning_rate": 4.642730871053698e-05, + "loss": 0.32650175094604494, + "step": 125250 + }, + { + "epoch": 0.5377673595905996, + "grad_norm": 0.020104752853512764, + "learning_rate": 4.642299699041936e-05, + "loss": 0.024312908947467803, + "step": 125260 + }, + { + "epoch": 0.5378102916806196, + "grad_norm": 0.0024187087547034025, + "learning_rate": 4.6418685270301734e-05, + "loss": 0.1383475184440613, + "step": 125270 + }, + { + "epoch": 0.5378532237706396, + "grad_norm": 0.08186737447977066, + "learning_rate": 4.641437355018411e-05, + "loss": 0.18166197538375856, + "step": 125280 + }, + { + "epoch": 0.5378961558606596, + "grad_norm": 0.06082170829176903, + "learning_rate": 4.641006183006649e-05, + "loss": 0.1468608021736145, + "step": 125290 + }, + { + "epoch": 0.5379390879506796, + "grad_norm": 0.5789875984191895, + "learning_rate": 4.6405750109948866e-05, + "loss": 0.17679991722106933, + "step": 125300 + }, + { + "epoch": 0.5379820200406996, + "grad_norm": 5.591590881347656, + "learning_rate": 4.6401438389831244e-05, + "loss": 0.2792628049850464, + "step": 125310 + }, + { + "epoch": 0.5380249521307197, + "grad_norm": 0.12889571487903595, + "learning_rate": 4.639712666971362e-05, + "loss": 0.34988381862640383, + "step": 125320 + }, + { + "epoch": 0.5380678842207396, + "grad_norm": 0.001045150333084166, + "learning_rate": 4.639281494959599e-05, + "loss": 0.11754976511001587, + "step": 125330 + }, + { + "epoch": 0.5381108163107596, + "grad_norm": 0.4624530076980591, + "learning_rate": 4.638850322947837e-05, + "loss": 0.11660113334655761, + "step": 125340 + }, + { + "epoch": 0.5381537484007797, + "grad_norm": 0.006142620462924242, + "learning_rate": 4.6384191509360746e-05, + "loss": 0.3070712089538574, + "step": 125350 + }, + { + "epoch": 0.5381966804907996, + "grad_norm": 0.0023859955836087465, + "learning_rate": 4.6379879789243124e-05, + "loss": 0.33227355480194093, + "step": 125360 + }, + { + "epoch": 0.5382396125808196, + "grad_norm": 1.9227700233459473, + "learning_rate": 4.6375568069125494e-05, + "loss": 0.33113319873809816, + "step": 125370 + }, + { + "epoch": 0.5382825446708397, + "grad_norm": 0.9203783869743347, + "learning_rate": 4.637125634900787e-05, + "loss": 0.1220745325088501, + "step": 125380 + }, + { + "epoch": 0.5383254767608596, + "grad_norm": 0.005649822298437357, + "learning_rate": 4.636694462889025e-05, + "loss": 0.147062349319458, + "step": 125390 + }, + { + "epoch": 0.5383684088508797, + "grad_norm": 0.13790778815746307, + "learning_rate": 4.6362632908772626e-05, + "loss": 0.2776653289794922, + "step": 125400 + }, + { + "epoch": 0.5384113409408997, + "grad_norm": 0.009624933823943138, + "learning_rate": 4.6358321188655004e-05, + "loss": 0.1833699584007263, + "step": 125410 + }, + { + "epoch": 0.5384542730309196, + "grad_norm": 0.9451597929000854, + "learning_rate": 4.635400946853738e-05, + "loss": 0.14015096426010132, + "step": 125420 + }, + { + "epoch": 0.5384972051209397, + "grad_norm": 0.1229841560125351, + "learning_rate": 4.634969774841976e-05, + "loss": 0.21246821880340577, + "step": 125430 + }, + { + "epoch": 0.5385401372109597, + "grad_norm": 0.9702297449111938, + "learning_rate": 4.6345386028302136e-05, + "loss": 0.18682045936584474, + "step": 125440 + }, + { + "epoch": 0.5385830693009797, + "grad_norm": 1.5991899967193604, + "learning_rate": 4.6341074308184506e-05, + "loss": 0.22341518402099608, + "step": 125450 + }, + { + "epoch": 0.5386260013909997, + "grad_norm": 0.04757259413599968, + "learning_rate": 4.6336762588066884e-05, + "loss": 0.24468319416046141, + "step": 125460 + }, + { + "epoch": 0.5386689334810197, + "grad_norm": 0.15516333281993866, + "learning_rate": 4.633245086794926e-05, + "loss": 0.2808207035064697, + "step": 125470 + }, + { + "epoch": 0.5387118655710398, + "grad_norm": 0.002046056091785431, + "learning_rate": 4.632813914783164e-05, + "loss": 0.03726526498794556, + "step": 125480 + }, + { + "epoch": 0.5387547976610597, + "grad_norm": 0.0016733687371015549, + "learning_rate": 4.6323827427714016e-05, + "loss": 0.21174945831298828, + "step": 125490 + }, + { + "epoch": 0.5387977297510798, + "grad_norm": 0.9975637197494507, + "learning_rate": 4.6319515707596386e-05, + "loss": 0.20011298656463622, + "step": 125500 + }, + { + "epoch": 0.5388406618410998, + "grad_norm": 0.001489278394728899, + "learning_rate": 4.6315203987478764e-05, + "loss": 0.21532111167907714, + "step": 125510 + }, + { + "epoch": 0.5388835939311197, + "grad_norm": 0.0019053419819101691, + "learning_rate": 4.631089226736115e-05, + "loss": 0.06455349326133727, + "step": 125520 + }, + { + "epoch": 0.5389265260211398, + "grad_norm": 0.12390390038490295, + "learning_rate": 4.6306580547243525e-05, + "loss": 0.25326411724090575, + "step": 125530 + }, + { + "epoch": 0.5389694581111598, + "grad_norm": 2.383126735687256, + "learning_rate": 4.6302268827125896e-05, + "loss": 0.23500394821166992, + "step": 125540 + }, + { + "epoch": 0.5390123902011797, + "grad_norm": 1.2543652057647705, + "learning_rate": 4.629795710700827e-05, + "loss": 0.43595452308654786, + "step": 125550 + }, + { + "epoch": 0.5390553222911998, + "grad_norm": 0.01510920561850071, + "learning_rate": 4.629364538689065e-05, + "loss": 0.24233436584472656, + "step": 125560 + }, + { + "epoch": 0.5390982543812198, + "grad_norm": 0.0008209710358642042, + "learning_rate": 4.628933366677303e-05, + "loss": 0.14674782752990723, + "step": 125570 + }, + { + "epoch": 0.5391411864712398, + "grad_norm": 2.904439687728882, + "learning_rate": 4.62850219466554e-05, + "loss": 0.2904531955718994, + "step": 125580 + }, + { + "epoch": 0.5391841185612598, + "grad_norm": 0.5623788237571716, + "learning_rate": 4.6280710226537775e-05, + "loss": 0.2724529027938843, + "step": 125590 + }, + { + "epoch": 0.5392270506512798, + "grad_norm": 0.030613474547863007, + "learning_rate": 4.627639850642015e-05, + "loss": 0.3674274206161499, + "step": 125600 + }, + { + "epoch": 0.5392699827412998, + "grad_norm": 0.08635491877794266, + "learning_rate": 4.627208678630253e-05, + "loss": 0.06295985579490662, + "step": 125610 + }, + { + "epoch": 0.5393129148313198, + "grad_norm": 0.2094467282295227, + "learning_rate": 4.62677750661849e-05, + "loss": 0.1437790036201477, + "step": 125620 + }, + { + "epoch": 0.5393558469213399, + "grad_norm": 0.5031962990760803, + "learning_rate": 4.6263463346067285e-05, + "loss": 0.2865575313568115, + "step": 125630 + }, + { + "epoch": 0.5393987790113598, + "grad_norm": 5.898693084716797, + "learning_rate": 4.625915162594966e-05, + "loss": 0.2831977367401123, + "step": 125640 + }, + { + "epoch": 0.5394417111013798, + "grad_norm": 0.5221174955368042, + "learning_rate": 4.625483990583204e-05, + "loss": 0.018702538311481477, + "step": 125650 + }, + { + "epoch": 0.5394846431913999, + "grad_norm": 1.3053752183914185, + "learning_rate": 4.625052818571441e-05, + "loss": 0.2751390218734741, + "step": 125660 + }, + { + "epoch": 0.5395275752814198, + "grad_norm": 1.5146193504333496, + "learning_rate": 4.624621646559679e-05, + "loss": 0.14024045467376708, + "step": 125670 + }, + { + "epoch": 0.5395705073714399, + "grad_norm": 0.007814999669790268, + "learning_rate": 4.6241904745479165e-05, + "loss": 0.37259511947631835, + "step": 125680 + }, + { + "epoch": 0.5396134394614599, + "grad_norm": 2.595086097717285, + "learning_rate": 4.623759302536154e-05, + "loss": 0.33952600955963136, + "step": 125690 + }, + { + "epoch": 0.5396563715514798, + "grad_norm": 1.5511152744293213, + "learning_rate": 4.623328130524391e-05, + "loss": 0.33948214054107667, + "step": 125700 + }, + { + "epoch": 0.5396993036414999, + "grad_norm": 0.6245251297950745, + "learning_rate": 4.622896958512629e-05, + "loss": 0.009097591787576676, + "step": 125710 + }, + { + "epoch": 0.5397422357315199, + "grad_norm": 0.017570942640304565, + "learning_rate": 4.622465786500867e-05, + "loss": 0.13392648696899415, + "step": 125720 + }, + { + "epoch": 0.5397851678215398, + "grad_norm": 0.0011469552991911769, + "learning_rate": 4.6220346144891045e-05, + "loss": 0.04894132018089294, + "step": 125730 + }, + { + "epoch": 0.5398280999115599, + "grad_norm": 0.04077253118157387, + "learning_rate": 4.621603442477342e-05, + "loss": 0.11998807191848755, + "step": 125740 + }, + { + "epoch": 0.5398710320015799, + "grad_norm": 2.001880407333374, + "learning_rate": 4.62117227046558e-05, + "loss": 0.4320225238800049, + "step": 125750 + }, + { + "epoch": 0.5399139640915999, + "grad_norm": 0.0013287104666233063, + "learning_rate": 4.620741098453818e-05, + "loss": 0.18426822423934935, + "step": 125760 + }, + { + "epoch": 0.5399568961816199, + "grad_norm": 1.52393639087677, + "learning_rate": 4.6203099264420554e-05, + "loss": 0.14336030483245848, + "step": 125770 + }, + { + "epoch": 0.53999982827164, + "grad_norm": 1.1323847770690918, + "learning_rate": 4.6198787544302925e-05, + "loss": 0.1529453158378601, + "step": 125780 + }, + { + "epoch": 0.5400427603616599, + "grad_norm": 2.1432337760925293, + "learning_rate": 4.61944758241853e-05, + "loss": 0.3418971300125122, + "step": 125790 + }, + { + "epoch": 0.5400856924516799, + "grad_norm": 0.18537476658821106, + "learning_rate": 4.619016410406768e-05, + "loss": 0.25544416904449463, + "step": 125800 + }, + { + "epoch": 0.5401286245417, + "grad_norm": 0.00806635431945324, + "learning_rate": 4.618585238395006e-05, + "loss": 0.29964916706085204, + "step": 125810 + }, + { + "epoch": 0.5401715566317199, + "grad_norm": 1.3490689992904663, + "learning_rate": 4.618154066383243e-05, + "loss": 0.3459489345550537, + "step": 125820 + }, + { + "epoch": 0.5402144887217399, + "grad_norm": 0.0129646435379982, + "learning_rate": 4.6177228943714805e-05, + "loss": 0.26499371528625487, + "step": 125830 + }, + { + "epoch": 0.54025742081176, + "grad_norm": 1.2936310768127441, + "learning_rate": 4.617291722359718e-05, + "loss": 0.2571291923522949, + "step": 125840 + }, + { + "epoch": 0.5403003529017799, + "grad_norm": 0.0039204442873597145, + "learning_rate": 4.616860550347956e-05, + "loss": 0.17692142724990845, + "step": 125850 + }, + { + "epoch": 0.5403432849918, + "grad_norm": 0.3188647925853729, + "learning_rate": 4.616429378336194e-05, + "loss": 0.08579055070877076, + "step": 125860 + }, + { + "epoch": 0.54038621708182, + "grad_norm": 0.24737665057182312, + "learning_rate": 4.6159982063244314e-05, + "loss": 0.0415299266576767, + "step": 125870 + }, + { + "epoch": 0.5404291491718399, + "grad_norm": 6.9003777503967285, + "learning_rate": 4.615567034312669e-05, + "loss": 0.3434094190597534, + "step": 125880 + }, + { + "epoch": 0.54047208126186, + "grad_norm": 0.0020558438263833523, + "learning_rate": 4.615135862300907e-05, + "loss": 0.2131648302078247, + "step": 125890 + }, + { + "epoch": 0.54051501335188, + "grad_norm": 0.6378905177116394, + "learning_rate": 4.6147046902891446e-05, + "loss": 0.12402830123901368, + "step": 125900 + }, + { + "epoch": 0.5405579454419001, + "grad_norm": 4.6194000244140625, + "learning_rate": 4.6142735182773817e-05, + "loss": 0.276075553894043, + "step": 125910 + }, + { + "epoch": 0.54060087753192, + "grad_norm": 0.018461063504219055, + "learning_rate": 4.6138423462656194e-05, + "loss": 0.13185503482818603, + "step": 125920 + }, + { + "epoch": 0.54064380962194, + "grad_norm": 4.305511951446533, + "learning_rate": 4.613411174253857e-05, + "loss": 0.27200567722320557, + "step": 125930 + }, + { + "epoch": 0.5406867417119601, + "grad_norm": 0.00458148866891861, + "learning_rate": 4.612980002242095e-05, + "loss": 0.0953050673007965, + "step": 125940 + }, + { + "epoch": 0.54072967380198, + "grad_norm": 2.7567226886749268, + "learning_rate": 4.612548830230332e-05, + "loss": 0.2230508804321289, + "step": 125950 + }, + { + "epoch": 0.540772605892, + "grad_norm": 1.6046864986419678, + "learning_rate": 4.6121176582185697e-05, + "loss": 0.3258085012435913, + "step": 125960 + }, + { + "epoch": 0.5408155379820201, + "grad_norm": 1.2450833320617676, + "learning_rate": 4.6116864862068074e-05, + "loss": 0.36345996856689455, + "step": 125970 + }, + { + "epoch": 0.54085847007204, + "grad_norm": 0.006003808230161667, + "learning_rate": 4.611255314195045e-05, + "loss": 0.08728630542755127, + "step": 125980 + }, + { + "epoch": 0.5409014021620601, + "grad_norm": 0.04204230010509491, + "learning_rate": 4.610824142183283e-05, + "loss": 0.3631253480911255, + "step": 125990 + }, + { + "epoch": 0.5409443342520801, + "grad_norm": 5.768819332122803, + "learning_rate": 4.6103929701715206e-05, + "loss": 0.08400842547416687, + "step": 126000 + }, + { + "epoch": 0.5409443342520801, + "eval_loss": 0.3944237530231476, + "eval_runtime": 27.2327, + "eval_samples_per_second": 3.672, + "eval_steps_per_second": 3.672, + "step": 126000 + }, + { + "epoch": 0.5409872663421, + "grad_norm": 0.007012277841567993, + "learning_rate": 4.609961798159758e-05, + "loss": 0.18022228479385377, + "step": 126010 + }, + { + "epoch": 0.5410301984321201, + "grad_norm": 1.9996778964996338, + "learning_rate": 4.609530626147996e-05, + "loss": 0.3052912712097168, + "step": 126020 + }, + { + "epoch": 0.5410731305221401, + "grad_norm": 5.9770073890686035, + "learning_rate": 4.609099454136233e-05, + "loss": 0.2505201816558838, + "step": 126030 + }, + { + "epoch": 0.54111606261216, + "grad_norm": 0.027300434187054634, + "learning_rate": 4.608668282124471e-05, + "loss": 0.1408507227897644, + "step": 126040 + }, + { + "epoch": 0.5411589947021801, + "grad_norm": 0.0006558285094797611, + "learning_rate": 4.6082371101127086e-05, + "loss": 0.15853034257888793, + "step": 126050 + }, + { + "epoch": 0.5412019267922001, + "grad_norm": 2.2059273719787598, + "learning_rate": 4.607805938100946e-05, + "loss": 0.27972419261932374, + "step": 126060 + }, + { + "epoch": 0.5412448588822201, + "grad_norm": 1.6577632427215576, + "learning_rate": 4.6073747660891834e-05, + "loss": 0.12001736164093017, + "step": 126070 + }, + { + "epoch": 0.5412877909722401, + "grad_norm": 2.1002368927001953, + "learning_rate": 4.606943594077421e-05, + "loss": 0.0606159508228302, + "step": 126080 + }, + { + "epoch": 0.5413307230622602, + "grad_norm": 0.039800770580768585, + "learning_rate": 4.606512422065659e-05, + "loss": 0.10772346258163452, + "step": 126090 + }, + { + "epoch": 0.5413736551522801, + "grad_norm": 0.26574936509132385, + "learning_rate": 4.6060812500538966e-05, + "loss": 0.18571717739105226, + "step": 126100 + }, + { + "epoch": 0.5414165872423001, + "grad_norm": 0.1073109582066536, + "learning_rate": 4.605650078042134e-05, + "loss": 0.3292713165283203, + "step": 126110 + }, + { + "epoch": 0.5414595193323202, + "grad_norm": 0.049754850566387177, + "learning_rate": 4.605218906030372e-05, + "loss": 0.24871139526367186, + "step": 126120 + }, + { + "epoch": 0.5415024514223401, + "grad_norm": 0.008045070804655552, + "learning_rate": 4.60478773401861e-05, + "loss": 0.4405210971832275, + "step": 126130 + }, + { + "epoch": 0.5415453835123601, + "grad_norm": 0.027863921597599983, + "learning_rate": 4.6043565620068475e-05, + "loss": 0.25272364616394044, + "step": 126140 + }, + { + "epoch": 0.5415883156023802, + "grad_norm": 9.032499313354492, + "learning_rate": 4.6039253899950846e-05, + "loss": 0.0940330982208252, + "step": 126150 + }, + { + "epoch": 0.5416312476924001, + "grad_norm": 0.0032740167807787657, + "learning_rate": 4.603494217983322e-05, + "loss": 0.09897719025611877, + "step": 126160 + }, + { + "epoch": 0.5416741797824202, + "grad_norm": 0.8318930268287659, + "learning_rate": 4.60306304597156e-05, + "loss": 0.33201496601104735, + "step": 126170 + }, + { + "epoch": 0.5417171118724402, + "grad_norm": 4.014708518981934, + "learning_rate": 4.602631873959798e-05, + "loss": 0.3847514629364014, + "step": 126180 + }, + { + "epoch": 0.5417600439624601, + "grad_norm": 0.0012932472163811326, + "learning_rate": 4.602200701948035e-05, + "loss": 0.26420438289642334, + "step": 126190 + }, + { + "epoch": 0.5418029760524802, + "grad_norm": 0.14375215768814087, + "learning_rate": 4.6017695299362726e-05, + "loss": 0.18487160205841063, + "step": 126200 + }, + { + "epoch": 0.5418459081425002, + "grad_norm": 1.129388689994812, + "learning_rate": 4.60133835792451e-05, + "loss": 0.3273311614990234, + "step": 126210 + }, + { + "epoch": 0.5418888402325202, + "grad_norm": 0.07141338288784027, + "learning_rate": 4.600907185912749e-05, + "loss": 0.15279535055160523, + "step": 126220 + }, + { + "epoch": 0.5419317723225402, + "grad_norm": 0.19866278767585754, + "learning_rate": 4.6004760139009864e-05, + "loss": 0.10880914926528931, + "step": 126230 + }, + { + "epoch": 0.5419747044125602, + "grad_norm": 2.901414394378662, + "learning_rate": 4.6000448418892235e-05, + "loss": 0.19144272804260254, + "step": 126240 + }, + { + "epoch": 0.5420176365025802, + "grad_norm": 2.086766242980957, + "learning_rate": 4.599613669877461e-05, + "loss": 0.2593048334121704, + "step": 126250 + }, + { + "epoch": 0.5420605685926002, + "grad_norm": 4.121439456939697, + "learning_rate": 4.599182497865699e-05, + "loss": 0.15519894361495973, + "step": 126260 + }, + { + "epoch": 0.5421035006826203, + "grad_norm": 0.00668214401230216, + "learning_rate": 4.598751325853937e-05, + "loss": 0.29855411052703856, + "step": 126270 + }, + { + "epoch": 0.5421464327726402, + "grad_norm": 0.015316566452383995, + "learning_rate": 4.598320153842174e-05, + "loss": 0.1899287462234497, + "step": 126280 + }, + { + "epoch": 0.5421893648626602, + "grad_norm": 0.20664560794830322, + "learning_rate": 4.5978889818304115e-05, + "loss": 0.2651756048202515, + "step": 126290 + }, + { + "epoch": 0.5422322969526803, + "grad_norm": 0.8667112588882446, + "learning_rate": 4.597457809818649e-05, + "loss": 0.20992062091827393, + "step": 126300 + }, + { + "epoch": 0.5422752290427002, + "grad_norm": 4.961584568023682, + "learning_rate": 4.597026637806887e-05, + "loss": 0.14463411569595336, + "step": 126310 + }, + { + "epoch": 0.5423181611327202, + "grad_norm": 0.01307595707476139, + "learning_rate": 4.596595465795124e-05, + "loss": 0.07341977953910828, + "step": 126320 + }, + { + "epoch": 0.5423610932227403, + "grad_norm": 0.019085580483078957, + "learning_rate": 4.5961642937833624e-05, + "loss": 0.2257392644882202, + "step": 126330 + }, + { + "epoch": 0.5424040253127603, + "grad_norm": 0.17916961014270782, + "learning_rate": 4.5957331217716e-05, + "loss": 0.377333927154541, + "step": 126340 + }, + { + "epoch": 0.5424469574027803, + "grad_norm": 2.313668727874756, + "learning_rate": 4.595301949759838e-05, + "loss": 0.13621577024459838, + "step": 126350 + }, + { + "epoch": 0.5424898894928003, + "grad_norm": 5.779814720153809, + "learning_rate": 4.594870777748075e-05, + "loss": 0.3154439926147461, + "step": 126360 + }, + { + "epoch": 0.5425328215828203, + "grad_norm": 0.008008879609405994, + "learning_rate": 4.594439605736313e-05, + "loss": 0.13945317268371582, + "step": 126370 + }, + { + "epoch": 0.5425757536728403, + "grad_norm": 0.04248788207769394, + "learning_rate": 4.5940084337245504e-05, + "loss": 0.29401838779449463, + "step": 126380 + }, + { + "epoch": 0.5426186857628603, + "grad_norm": 4.502487659454346, + "learning_rate": 4.593577261712788e-05, + "loss": 0.18828514814376832, + "step": 126390 + }, + { + "epoch": 0.5426616178528804, + "grad_norm": 7.196977138519287, + "learning_rate": 4.593146089701025e-05, + "loss": 0.47498259544372556, + "step": 126400 + }, + { + "epoch": 0.5427045499429003, + "grad_norm": 2.026784896850586, + "learning_rate": 4.592714917689263e-05, + "loss": 0.3331002950668335, + "step": 126410 + }, + { + "epoch": 0.5427474820329203, + "grad_norm": 1.6803898811340332, + "learning_rate": 4.592283745677501e-05, + "loss": 0.33963189125061033, + "step": 126420 + }, + { + "epoch": 0.5427904141229404, + "grad_norm": 3.6730363368988037, + "learning_rate": 4.5918525736657384e-05, + "loss": 0.21576766967773436, + "step": 126430 + }, + { + "epoch": 0.5428333462129603, + "grad_norm": 0.039370764046907425, + "learning_rate": 4.591421401653976e-05, + "loss": 0.1567553162574768, + "step": 126440 + }, + { + "epoch": 0.5428762783029804, + "grad_norm": 0.008856616914272308, + "learning_rate": 4.590990229642214e-05, + "loss": 0.1858171582221985, + "step": 126450 + }, + { + "epoch": 0.5429192103930004, + "grad_norm": 0.15867328643798828, + "learning_rate": 4.5905590576304516e-05, + "loss": 0.21397511959075927, + "step": 126460 + }, + { + "epoch": 0.5429621424830203, + "grad_norm": 2.5111141204833984, + "learning_rate": 4.5901278856186894e-05, + "loss": 0.18194727897644042, + "step": 126470 + }, + { + "epoch": 0.5430050745730404, + "grad_norm": 24.574228286743164, + "learning_rate": 4.5896967136069264e-05, + "loss": 0.18436410427093505, + "step": 126480 + }, + { + "epoch": 0.5430480066630604, + "grad_norm": 1.3025906085968018, + "learning_rate": 4.589265541595164e-05, + "loss": 0.3645550012588501, + "step": 126490 + }, + { + "epoch": 0.5430909387530803, + "grad_norm": 0.013285533525049686, + "learning_rate": 4.588834369583402e-05, + "loss": 0.17530207633972167, + "step": 126500 + }, + { + "epoch": 0.5431338708431004, + "grad_norm": 8.834342002868652, + "learning_rate": 4.5884031975716396e-05, + "loss": 0.12580578327178954, + "step": 126510 + }, + { + "epoch": 0.5431768029331204, + "grad_norm": 0.870551586151123, + "learning_rate": 4.587972025559877e-05, + "loss": 0.17170372009277343, + "step": 126520 + }, + { + "epoch": 0.5432197350231404, + "grad_norm": 4.450483798980713, + "learning_rate": 4.5875408535481144e-05, + "loss": 0.29233579635620116, + "step": 126530 + }, + { + "epoch": 0.5432626671131604, + "grad_norm": 1.8690029382705688, + "learning_rate": 4.587109681536352e-05, + "loss": 0.10871686935424804, + "step": 126540 + }, + { + "epoch": 0.5433055992031804, + "grad_norm": 0.014682224951684475, + "learning_rate": 4.58667850952459e-05, + "loss": 0.1381064772605896, + "step": 126550 + }, + { + "epoch": 0.5433485312932004, + "grad_norm": 2.533137798309326, + "learning_rate": 4.5862473375128276e-05, + "loss": 0.15920352935791016, + "step": 126560 + }, + { + "epoch": 0.5433914633832204, + "grad_norm": 0.013429299928247929, + "learning_rate": 4.5858161655010653e-05, + "loss": 0.049779373407363894, + "step": 126570 + }, + { + "epoch": 0.5434343954732405, + "grad_norm": 1.3268059492111206, + "learning_rate": 4.585384993489303e-05, + "loss": 0.22604901790618898, + "step": 126580 + }, + { + "epoch": 0.5434773275632604, + "grad_norm": 1.8555922508239746, + "learning_rate": 4.584953821477541e-05, + "loss": 0.13655784130096435, + "step": 126590 + }, + { + "epoch": 0.5435202596532804, + "grad_norm": 0.03693840652704239, + "learning_rate": 4.5845226494657785e-05, + "loss": 0.20669023990631102, + "step": 126600 + }, + { + "epoch": 0.5435631917433005, + "grad_norm": 0.21512533724308014, + "learning_rate": 4.5840914774540156e-05, + "loss": 0.08742049932479859, + "step": 126610 + }, + { + "epoch": 0.5436061238333204, + "grad_norm": 0.3347058892250061, + "learning_rate": 4.5836603054422533e-05, + "loss": 0.2082576036453247, + "step": 126620 + }, + { + "epoch": 0.5436490559233405, + "grad_norm": 0.4242490828037262, + "learning_rate": 4.583229133430491e-05, + "loss": 0.13741503953933715, + "step": 126630 + }, + { + "epoch": 0.5436919880133605, + "grad_norm": 0.010707002133131027, + "learning_rate": 4.582797961418729e-05, + "loss": 0.25746917724609375, + "step": 126640 + }, + { + "epoch": 0.5437349201033804, + "grad_norm": 0.004346279427409172, + "learning_rate": 4.582366789406966e-05, + "loss": 0.15046364068984985, + "step": 126650 + }, + { + "epoch": 0.5437778521934005, + "grad_norm": 0.9764925837516785, + "learning_rate": 4.5819356173952036e-05, + "loss": 0.2195502758026123, + "step": 126660 + }, + { + "epoch": 0.5438207842834205, + "grad_norm": 0.2153153419494629, + "learning_rate": 4.581504445383441e-05, + "loss": 0.1246726393699646, + "step": 126670 + }, + { + "epoch": 0.5438637163734404, + "grad_norm": 4.666318416595459, + "learning_rate": 4.581073273371679e-05, + "loss": 0.28361926078796384, + "step": 126680 + }, + { + "epoch": 0.5439066484634605, + "grad_norm": 0.01102722529321909, + "learning_rate": 4.580642101359917e-05, + "loss": 0.22579269409179686, + "step": 126690 + }, + { + "epoch": 0.5439495805534805, + "grad_norm": 0.002459155162796378, + "learning_rate": 4.5802109293481545e-05, + "loss": 0.46924777030944825, + "step": 126700 + }, + { + "epoch": 0.5439925126435005, + "grad_norm": 0.16135826706886292, + "learning_rate": 4.579779757336392e-05, + "loss": 0.1990830659866333, + "step": 126710 + }, + { + "epoch": 0.5440354447335205, + "grad_norm": 0.00882128719240427, + "learning_rate": 4.57934858532463e-05, + "loss": 0.1869539737701416, + "step": 126720 + }, + { + "epoch": 0.5440783768235405, + "grad_norm": 0.04351044446229935, + "learning_rate": 4.578917413312867e-05, + "loss": 0.3000959873199463, + "step": 126730 + }, + { + "epoch": 0.5441213089135605, + "grad_norm": 5.979353904724121, + "learning_rate": 4.578486241301105e-05, + "loss": 0.19326921701431274, + "step": 126740 + }, + { + "epoch": 0.5441642410035805, + "grad_norm": 42.468711853027344, + "learning_rate": 4.5780550692893425e-05, + "loss": 0.10602353811264038, + "step": 126750 + }, + { + "epoch": 0.5442071730936006, + "grad_norm": 0.016265008598566055, + "learning_rate": 4.57762389727758e-05, + "loss": 0.3011580228805542, + "step": 126760 + }, + { + "epoch": 0.5442501051836206, + "grad_norm": 0.18827781081199646, + "learning_rate": 4.577192725265817e-05, + "loss": 0.10657843351364135, + "step": 126770 + }, + { + "epoch": 0.5442930372736405, + "grad_norm": 2.5613012313842773, + "learning_rate": 4.576761553254055e-05, + "loss": 0.16385369300842284, + "step": 126780 + }, + { + "epoch": 0.5443359693636606, + "grad_norm": 0.015754317864775658, + "learning_rate": 4.576330381242293e-05, + "loss": 0.19457767009735108, + "step": 126790 + }, + { + "epoch": 0.5443789014536806, + "grad_norm": 0.05552869662642479, + "learning_rate": 4.5758992092305305e-05, + "loss": 0.2870542764663696, + "step": 126800 + }, + { + "epoch": 0.5444218335437006, + "grad_norm": 3.7243716716766357, + "learning_rate": 4.575468037218768e-05, + "loss": 0.10843846797943116, + "step": 126810 + }, + { + "epoch": 0.5444647656337206, + "grad_norm": 0.0405440479516983, + "learning_rate": 4.575036865207006e-05, + "loss": 0.05366473197937012, + "step": 126820 + }, + { + "epoch": 0.5445076977237406, + "grad_norm": 40.28740310668945, + "learning_rate": 4.574605693195244e-05, + "loss": 0.17246224880218505, + "step": 126830 + }, + { + "epoch": 0.5445506298137606, + "grad_norm": 0.7835990786552429, + "learning_rate": 4.5741745211834815e-05, + "loss": 0.03872422575950622, + "step": 126840 + }, + { + "epoch": 0.5445935619037806, + "grad_norm": 0.027119014412164688, + "learning_rate": 4.5737433491717185e-05, + "loss": 0.29775936603546144, + "step": 126850 + }, + { + "epoch": 0.5446364939938007, + "grad_norm": 0.17144376039505005, + "learning_rate": 4.573312177159956e-05, + "loss": 0.2765123128890991, + "step": 126860 + }, + { + "epoch": 0.5446794260838206, + "grad_norm": 0.059212543070316315, + "learning_rate": 4.572881005148194e-05, + "loss": 0.12597641944885254, + "step": 126870 + }, + { + "epoch": 0.5447223581738406, + "grad_norm": 0.03909136354923248, + "learning_rate": 4.572449833136432e-05, + "loss": 0.07332187294960021, + "step": 126880 + }, + { + "epoch": 0.5447652902638607, + "grad_norm": 21.765789031982422, + "learning_rate": 4.572018661124669e-05, + "loss": 0.3605009078979492, + "step": 126890 + }, + { + "epoch": 0.5448082223538806, + "grad_norm": 0.06298067420721054, + "learning_rate": 4.5715874891129065e-05, + "loss": 0.2783812046051025, + "step": 126900 + }, + { + "epoch": 0.5448511544439006, + "grad_norm": 0.011054006405174732, + "learning_rate": 4.571156317101144e-05, + "loss": 0.2735854148864746, + "step": 126910 + }, + { + "epoch": 0.5448940865339207, + "grad_norm": 0.23685322701931, + "learning_rate": 4.5707251450893827e-05, + "loss": 0.3598737001419067, + "step": 126920 + }, + { + "epoch": 0.5449370186239406, + "grad_norm": 8.397737503051758, + "learning_rate": 4.57029397307762e-05, + "loss": 0.20286760330200196, + "step": 126930 + }, + { + "epoch": 0.5449799507139607, + "grad_norm": 7.1092939376831055, + "learning_rate": 4.5698628010658574e-05, + "loss": 0.4456644535064697, + "step": 126940 + }, + { + "epoch": 0.5450228828039807, + "grad_norm": 0.0011188010685145855, + "learning_rate": 4.569431629054095e-05, + "loss": 0.4357870101928711, + "step": 126950 + }, + { + "epoch": 0.5450658148940006, + "grad_norm": 0.0300370492041111, + "learning_rate": 4.569000457042333e-05, + "loss": 0.26468491554260254, + "step": 126960 + }, + { + "epoch": 0.5451087469840207, + "grad_norm": 0.003478456288576126, + "learning_rate": 4.5685692850305707e-05, + "loss": 0.3201757907867432, + "step": 126970 + }, + { + "epoch": 0.5451516790740407, + "grad_norm": 0.04247760400176048, + "learning_rate": 4.568138113018808e-05, + "loss": 0.22876076698303222, + "step": 126980 + }, + { + "epoch": 0.5451946111640606, + "grad_norm": 0.017776286229491234, + "learning_rate": 4.5677069410070454e-05, + "loss": 0.1681857228279114, + "step": 126990 + }, + { + "epoch": 0.5452375432540807, + "grad_norm": 0.07333787530660629, + "learning_rate": 4.567275768995283e-05, + "loss": 0.22127797603607177, + "step": 127000 + }, + { + "epoch": 0.5452375432540807, + "eval_loss": 0.41241249442100525, + "eval_runtime": 27.1526, + "eval_samples_per_second": 3.683, + "eval_steps_per_second": 3.683, + "step": 127000 + }, + { + "epoch": 0.5452804753441007, + "grad_norm": 0.04378829896450043, + "learning_rate": 4.566844596983521e-05, + "loss": 0.1355321526527405, + "step": 127010 + }, + { + "epoch": 0.5453234074341207, + "grad_norm": 0.024462919682264328, + "learning_rate": 4.566413424971758e-05, + "loss": 0.23159735202789306, + "step": 127020 + }, + { + "epoch": 0.5453663395241407, + "grad_norm": 5.287015914916992, + "learning_rate": 4.5659822529599964e-05, + "loss": 0.09170323610305786, + "step": 127030 + }, + { + "epoch": 0.5454092716141608, + "grad_norm": 5.614040851593018, + "learning_rate": 4.565551080948234e-05, + "loss": 0.21654801368713378, + "step": 127040 + }, + { + "epoch": 0.5454522037041807, + "grad_norm": 0.028848685324192047, + "learning_rate": 4.565119908936472e-05, + "loss": 0.15463857650756835, + "step": 127050 + }, + { + "epoch": 0.5454951357942007, + "grad_norm": 0.30613818764686584, + "learning_rate": 4.564688736924709e-05, + "loss": 0.2023834228515625, + "step": 127060 + }, + { + "epoch": 0.5455380678842208, + "grad_norm": 0.015514791011810303, + "learning_rate": 4.5642575649129466e-05, + "loss": 0.08889861702919007, + "step": 127070 + }, + { + "epoch": 0.5455809999742407, + "grad_norm": 0.002453985158354044, + "learning_rate": 4.5638263929011844e-05, + "loss": 0.00347699411213398, + "step": 127080 + }, + { + "epoch": 0.5456239320642607, + "grad_norm": 0.44067129492759705, + "learning_rate": 4.563395220889422e-05, + "loss": 0.13874021768569947, + "step": 127090 + }, + { + "epoch": 0.5456668641542808, + "grad_norm": 1.6640559434890747, + "learning_rate": 4.562964048877659e-05, + "loss": 0.31788613796234133, + "step": 127100 + }, + { + "epoch": 0.5457097962443007, + "grad_norm": 0.002911994466558099, + "learning_rate": 4.562532876865897e-05, + "loss": 0.23183104991912842, + "step": 127110 + }, + { + "epoch": 0.5457527283343208, + "grad_norm": 0.0019254583166912198, + "learning_rate": 4.5621017048541346e-05, + "loss": 0.15993919372558593, + "step": 127120 + }, + { + "epoch": 0.5457956604243408, + "grad_norm": 1.9986472129821777, + "learning_rate": 4.5616705328423724e-05, + "loss": 0.2222136974334717, + "step": 127130 + }, + { + "epoch": 0.5458385925143607, + "grad_norm": 0.020649341866374016, + "learning_rate": 4.56123936083061e-05, + "loss": 0.1820436477661133, + "step": 127140 + }, + { + "epoch": 0.5458815246043808, + "grad_norm": 2.087904930114746, + "learning_rate": 4.560808188818848e-05, + "loss": 0.15863490104675293, + "step": 127150 + }, + { + "epoch": 0.5459244566944008, + "grad_norm": 0.0071143195964396, + "learning_rate": 4.5603770168070856e-05, + "loss": 0.3248276710510254, + "step": 127160 + }, + { + "epoch": 0.5459673887844207, + "grad_norm": 0.23474909365177155, + "learning_rate": 4.559945844795323e-05, + "loss": 0.10756796598434448, + "step": 127170 + }, + { + "epoch": 0.5460103208744408, + "grad_norm": 0.10987505316734314, + "learning_rate": 4.5595146727835604e-05, + "loss": 0.25221178531646726, + "step": 127180 + }, + { + "epoch": 0.5460532529644608, + "grad_norm": 0.12152563780546188, + "learning_rate": 4.559083500771798e-05, + "loss": 0.3498376369476318, + "step": 127190 + }, + { + "epoch": 0.5460961850544809, + "grad_norm": 1.011543869972229, + "learning_rate": 4.558652328760036e-05, + "loss": 0.33713626861572266, + "step": 127200 + }, + { + "epoch": 0.5461391171445008, + "grad_norm": 11.715632438659668, + "learning_rate": 4.5582211567482736e-05, + "loss": 0.33246119022369386, + "step": 127210 + }, + { + "epoch": 0.5461820492345208, + "grad_norm": 0.20116642117500305, + "learning_rate": 4.5577899847365106e-05, + "loss": 0.15852749347686768, + "step": 127220 + }, + { + "epoch": 0.5462249813245409, + "grad_norm": 1.2381221055984497, + "learning_rate": 4.5573588127247484e-05, + "loss": 0.13592121601104737, + "step": 127230 + }, + { + "epoch": 0.5462679134145608, + "grad_norm": 0.1290377825498581, + "learning_rate": 4.556927640712986e-05, + "loss": 0.10746562480926514, + "step": 127240 + }, + { + "epoch": 0.5463108455045809, + "grad_norm": 0.029829656705260277, + "learning_rate": 4.556496468701224e-05, + "loss": 0.14033961296081543, + "step": 127250 + }, + { + "epoch": 0.5463537775946009, + "grad_norm": 0.04186912998557091, + "learning_rate": 4.5560652966894616e-05, + "loss": 0.25282788276672363, + "step": 127260 + }, + { + "epoch": 0.5463967096846208, + "grad_norm": 0.020326996222138405, + "learning_rate": 4.555634124677699e-05, + "loss": 0.298588490486145, + "step": 127270 + }, + { + "epoch": 0.5464396417746409, + "grad_norm": 0.09214252233505249, + "learning_rate": 4.555202952665937e-05, + "loss": 0.21430680751800538, + "step": 127280 + }, + { + "epoch": 0.5464825738646609, + "grad_norm": 0.06303048133850098, + "learning_rate": 4.554771780654175e-05, + "loss": 0.2151409149169922, + "step": 127290 + }, + { + "epoch": 0.5465255059546809, + "grad_norm": 0.01923828199505806, + "learning_rate": 4.5543406086424125e-05, + "loss": 0.16676281690597533, + "step": 127300 + }, + { + "epoch": 0.5465684380447009, + "grad_norm": 0.006287601310759783, + "learning_rate": 4.5539094366306496e-05, + "loss": 0.321563196182251, + "step": 127310 + }, + { + "epoch": 0.5466113701347209, + "grad_norm": 0.9648750424385071, + "learning_rate": 4.553478264618887e-05, + "loss": 0.13181719779968262, + "step": 127320 + }, + { + "epoch": 0.5466543022247409, + "grad_norm": 1.9215424060821533, + "learning_rate": 4.553047092607125e-05, + "loss": 0.25484459400177, + "step": 127330 + }, + { + "epoch": 0.5466972343147609, + "grad_norm": 2.3266000747680664, + "learning_rate": 4.552615920595363e-05, + "loss": 0.2846336364746094, + "step": 127340 + }, + { + "epoch": 0.546740166404781, + "grad_norm": 1.2252665758132935, + "learning_rate": 4.5521847485836e-05, + "loss": 0.3566447734832764, + "step": 127350 + }, + { + "epoch": 0.5467830984948009, + "grad_norm": 0.11931613087654114, + "learning_rate": 4.5517535765718375e-05, + "loss": 0.286944317817688, + "step": 127360 + }, + { + "epoch": 0.5468260305848209, + "grad_norm": 0.00794187467545271, + "learning_rate": 4.551322404560075e-05, + "loss": 0.2526960849761963, + "step": 127370 + }, + { + "epoch": 0.546868962674841, + "grad_norm": 0.09007790684700012, + "learning_rate": 4.550891232548313e-05, + "loss": 0.24354372024536133, + "step": 127380 + }, + { + "epoch": 0.5469118947648609, + "grad_norm": 1.4762769937515259, + "learning_rate": 4.550460060536551e-05, + "loss": 0.2883622407913208, + "step": 127390 + }, + { + "epoch": 0.546954826854881, + "grad_norm": 0.0014529629843309522, + "learning_rate": 4.5500288885247885e-05, + "loss": 0.20099167823791503, + "step": 127400 + }, + { + "epoch": 0.546997758944901, + "grad_norm": 0.060556598007678986, + "learning_rate": 4.549597716513026e-05, + "loss": 0.2658257246017456, + "step": 127410 + }, + { + "epoch": 0.5470406910349209, + "grad_norm": 0.04418276250362396, + "learning_rate": 4.549166544501264e-05, + "loss": 0.1786326289176941, + "step": 127420 + }, + { + "epoch": 0.547083623124941, + "grad_norm": 0.006653594318777323, + "learning_rate": 4.548735372489501e-05, + "loss": 0.38285691738128663, + "step": 127430 + }, + { + "epoch": 0.547126555214961, + "grad_norm": 0.14826098084449768, + "learning_rate": 4.548304200477739e-05, + "loss": 0.21129980087280273, + "step": 127440 + }, + { + "epoch": 0.5471694873049809, + "grad_norm": 3.9380741119384766, + "learning_rate": 4.5478730284659765e-05, + "loss": 0.2790215015411377, + "step": 127450 + }, + { + "epoch": 0.547212419395001, + "grad_norm": 0.0008009527227841318, + "learning_rate": 4.547441856454214e-05, + "loss": 0.12772501707077027, + "step": 127460 + }, + { + "epoch": 0.547255351485021, + "grad_norm": 8.273842811584473, + "learning_rate": 4.547010684442451e-05, + "loss": 0.3419928073883057, + "step": 127470 + }, + { + "epoch": 0.547298283575041, + "grad_norm": 0.0034350028727203608, + "learning_rate": 4.546579512430689e-05, + "loss": 0.08441671133041381, + "step": 127480 + }, + { + "epoch": 0.547341215665061, + "grad_norm": 5.727380275726318, + "learning_rate": 4.546148340418927e-05, + "loss": 0.14370622634887695, + "step": 127490 + }, + { + "epoch": 0.547384147755081, + "grad_norm": 0.05439167469739914, + "learning_rate": 4.5457171684071645e-05, + "loss": 0.09095125794410705, + "step": 127500 + }, + { + "epoch": 0.547427079845101, + "grad_norm": 0.013659200631082058, + "learning_rate": 4.545285996395402e-05, + "loss": 0.19422099590301514, + "step": 127510 + }, + { + "epoch": 0.547470011935121, + "grad_norm": 0.06669435650110245, + "learning_rate": 4.54485482438364e-05, + "loss": 0.2521942615509033, + "step": 127520 + }, + { + "epoch": 0.5475129440251411, + "grad_norm": 0.016093647107481956, + "learning_rate": 4.544423652371878e-05, + "loss": 0.13946938514709473, + "step": 127530 + }, + { + "epoch": 0.547555876115161, + "grad_norm": 0.021878918632864952, + "learning_rate": 4.5439924803601154e-05, + "loss": 0.14208060503005981, + "step": 127540 + }, + { + "epoch": 0.547598808205181, + "grad_norm": 1.2861130237579346, + "learning_rate": 4.5435613083483525e-05, + "loss": 0.24725103378295898, + "step": 127550 + }, + { + "epoch": 0.5476417402952011, + "grad_norm": 6.029102325439453, + "learning_rate": 4.54313013633659e-05, + "loss": 0.20380520820617676, + "step": 127560 + }, + { + "epoch": 0.547684672385221, + "grad_norm": 0.45046472549438477, + "learning_rate": 4.542698964324828e-05, + "loss": 0.3024513006210327, + "step": 127570 + }, + { + "epoch": 0.547727604475241, + "grad_norm": 5.435580253601074, + "learning_rate": 4.542267792313066e-05, + "loss": 0.16858481168746947, + "step": 127580 + }, + { + "epoch": 0.5477705365652611, + "grad_norm": 0.027056651189923286, + "learning_rate": 4.541836620301303e-05, + "loss": 0.1475573182106018, + "step": 127590 + }, + { + "epoch": 0.547813468655281, + "grad_norm": 0.004209815990179777, + "learning_rate": 4.5414054482895405e-05, + "loss": 0.16616259813308715, + "step": 127600 + }, + { + "epoch": 0.5478564007453011, + "grad_norm": 0.004726664163172245, + "learning_rate": 4.540974276277778e-05, + "loss": 0.1367909550666809, + "step": 127610 + }, + { + "epoch": 0.5478993328353211, + "grad_norm": 23.20948028564453, + "learning_rate": 4.540543104266016e-05, + "loss": 0.06587894558906555, + "step": 127620 + }, + { + "epoch": 0.5479422649253411, + "grad_norm": 0.2586648464202881, + "learning_rate": 4.5401119322542537e-05, + "loss": 0.37384953498840334, + "step": 127630 + }, + { + "epoch": 0.5479851970153611, + "grad_norm": 1.3465591669082642, + "learning_rate": 4.5396807602424914e-05, + "loss": 0.3581723928451538, + "step": 127640 + }, + { + "epoch": 0.5480281291053811, + "grad_norm": 1.7588071823120117, + "learning_rate": 4.539249588230729e-05, + "loss": 0.09634592533111572, + "step": 127650 + }, + { + "epoch": 0.5480710611954012, + "grad_norm": 0.1279430240392685, + "learning_rate": 4.538818416218967e-05, + "loss": 0.12120214700698853, + "step": 127660 + }, + { + "epoch": 0.5481139932854211, + "grad_norm": 0.026256712153553963, + "learning_rate": 4.5383872442072046e-05, + "loss": 0.09571439027786255, + "step": 127670 + }, + { + "epoch": 0.5481569253754411, + "grad_norm": 0.001200821716338396, + "learning_rate": 4.5379560721954417e-05, + "loss": 0.16283975839614867, + "step": 127680 + }, + { + "epoch": 0.5481998574654612, + "grad_norm": 0.4395844042301178, + "learning_rate": 4.5375249001836794e-05, + "loss": 0.21897361278533936, + "step": 127690 + }, + { + "epoch": 0.5482427895554811, + "grad_norm": 0.8989280462265015, + "learning_rate": 4.537093728171917e-05, + "loss": 0.09027788639068604, + "step": 127700 + }, + { + "epoch": 0.5482857216455012, + "grad_norm": 0.00886700302362442, + "learning_rate": 4.536662556160155e-05, + "loss": 0.10902763605117798, + "step": 127710 + }, + { + "epoch": 0.5483286537355212, + "grad_norm": 1.8262544870376587, + "learning_rate": 4.536231384148392e-05, + "loss": 0.23176026344299316, + "step": 127720 + }, + { + "epoch": 0.5483715858255411, + "grad_norm": 0.0683245062828064, + "learning_rate": 4.5358002121366296e-05, + "loss": 0.06529564261436463, + "step": 127730 + }, + { + "epoch": 0.5484145179155612, + "grad_norm": 0.007358227856457233, + "learning_rate": 4.535369040124868e-05, + "loss": 0.16256901025772094, + "step": 127740 + }, + { + "epoch": 0.5484574500055812, + "grad_norm": 0.02833356335759163, + "learning_rate": 4.534937868113106e-05, + "loss": 0.0007393436040729285, + "step": 127750 + }, + { + "epoch": 0.5485003820956011, + "grad_norm": 4.225329399108887, + "learning_rate": 4.534506696101343e-05, + "loss": 0.2745645523071289, + "step": 127760 + }, + { + "epoch": 0.5485433141856212, + "grad_norm": 0.0018272794550284743, + "learning_rate": 4.5340755240895806e-05, + "loss": 0.1887308955192566, + "step": 127770 + }, + { + "epoch": 0.5485862462756412, + "grad_norm": 0.07719375193119049, + "learning_rate": 4.533644352077818e-05, + "loss": 0.36277971267700193, + "step": 127780 + }, + { + "epoch": 0.5486291783656612, + "grad_norm": 1.7213512659072876, + "learning_rate": 4.533213180066056e-05, + "loss": 0.23921799659729004, + "step": 127790 + }, + { + "epoch": 0.5486721104556812, + "grad_norm": 0.007673746906220913, + "learning_rate": 4.532782008054293e-05, + "loss": 0.1820694923400879, + "step": 127800 + }, + { + "epoch": 0.5487150425457012, + "grad_norm": 1.0937405824661255, + "learning_rate": 4.532350836042531e-05, + "loss": 0.06678230762481689, + "step": 127810 + }, + { + "epoch": 0.5487579746357212, + "grad_norm": 12.98852252960205, + "learning_rate": 4.5319196640307686e-05, + "loss": 0.1009778380393982, + "step": 127820 + }, + { + "epoch": 0.5488009067257412, + "grad_norm": 0.05488551780581474, + "learning_rate": 4.531488492019006e-05, + "loss": 0.1254146456718445, + "step": 127830 + }, + { + "epoch": 0.5488438388157613, + "grad_norm": 1.9723633527755737, + "learning_rate": 4.5310573200072434e-05, + "loss": 0.13314028978347778, + "step": 127840 + }, + { + "epoch": 0.5488867709057812, + "grad_norm": 1.2804629802703857, + "learning_rate": 4.530626147995482e-05, + "loss": 0.21371521949768066, + "step": 127850 + }, + { + "epoch": 0.5489297029958012, + "grad_norm": 0.0441896878182888, + "learning_rate": 4.5301949759837195e-05, + "loss": 0.10532666444778442, + "step": 127860 + }, + { + "epoch": 0.5489726350858213, + "grad_norm": 0.00991550087928772, + "learning_rate": 4.529763803971957e-05, + "loss": 0.28925399780273436, + "step": 127870 + }, + { + "epoch": 0.5490155671758412, + "grad_norm": 0.38075557351112366, + "learning_rate": 4.529332631960194e-05, + "loss": 0.07656524777412414, + "step": 127880 + }, + { + "epoch": 0.5490584992658613, + "grad_norm": 1.3430020809173584, + "learning_rate": 4.528901459948432e-05, + "loss": 0.26489455699920655, + "step": 127890 + }, + { + "epoch": 0.5491014313558813, + "grad_norm": 2.7436866760253906, + "learning_rate": 4.52847028793667e-05, + "loss": 0.16090919971466064, + "step": 127900 + }, + { + "epoch": 0.5491443634459012, + "grad_norm": 0.4776339828968048, + "learning_rate": 4.5280391159249075e-05, + "loss": 0.13634073734283447, + "step": 127910 + }, + { + "epoch": 0.5491872955359213, + "grad_norm": 0.45546698570251465, + "learning_rate": 4.5276079439131446e-05, + "loss": 0.2898316621780396, + "step": 127920 + }, + { + "epoch": 0.5492302276259413, + "grad_norm": 0.012597916647791862, + "learning_rate": 4.527176771901382e-05, + "loss": 0.08472794890403748, + "step": 127930 + }, + { + "epoch": 0.5492731597159612, + "grad_norm": 0.07850173115730286, + "learning_rate": 4.52674559988962e-05, + "loss": 0.2743486166000366, + "step": 127940 + }, + { + "epoch": 0.5493160918059813, + "grad_norm": 3.1064162254333496, + "learning_rate": 4.526314427877858e-05, + "loss": 0.6178059577941895, + "step": 127950 + }, + { + "epoch": 0.5493590238960013, + "grad_norm": 2.259978771209717, + "learning_rate": 4.5258832558660955e-05, + "loss": 0.24268021583557128, + "step": 127960 + }, + { + "epoch": 0.5494019559860213, + "grad_norm": 0.07178674638271332, + "learning_rate": 4.525452083854333e-05, + "loss": 0.20910418033599854, + "step": 127970 + }, + { + "epoch": 0.5494448880760413, + "grad_norm": 7.745858192443848, + "learning_rate": 4.525020911842571e-05, + "loss": 0.40424814224243166, + "step": 127980 + }, + { + "epoch": 0.5494878201660613, + "grad_norm": 0.003248595166951418, + "learning_rate": 4.524589739830809e-05, + "loss": 0.18874037265777588, + "step": 127990 + }, + { + "epoch": 0.5495307522560813, + "grad_norm": 0.0067419628612697124, + "learning_rate": 4.524158567819046e-05, + "loss": 0.11825863122940064, + "step": 128000 + }, + { + "epoch": 0.5495307522560813, + "eval_loss": 0.39745041728019714, + "eval_runtime": 27.1308, + "eval_samples_per_second": 3.686, + "eval_steps_per_second": 3.686, + "step": 128000 + }, + { + "epoch": 0.5495736843461013, + "grad_norm": 0.023375723510980606, + "learning_rate": 4.5237273958072835e-05, + "loss": 0.1321635961532593, + "step": 128010 + }, + { + "epoch": 0.5496166164361214, + "grad_norm": 0.0028161010704934597, + "learning_rate": 4.523296223795521e-05, + "loss": 0.3294975996017456, + "step": 128020 + }, + { + "epoch": 0.5496595485261413, + "grad_norm": 0.03735275939106941, + "learning_rate": 4.522865051783759e-05, + "loss": 0.00880674496293068, + "step": 128030 + }, + { + "epoch": 0.5497024806161613, + "grad_norm": 1.1383004188537598, + "learning_rate": 4.522433879771997e-05, + "loss": 0.059259730577468875, + "step": 128040 + }, + { + "epoch": 0.5497454127061814, + "grad_norm": 0.0009236226323992014, + "learning_rate": 4.522002707760234e-05, + "loss": 0.11791167259216309, + "step": 128050 + }, + { + "epoch": 0.5497883447962014, + "grad_norm": 2.122427225112915, + "learning_rate": 4.5215715357484715e-05, + "loss": 0.20672807693481446, + "step": 128060 + }, + { + "epoch": 0.5498312768862214, + "grad_norm": 2.029148578643799, + "learning_rate": 4.521140363736709e-05, + "loss": 0.40177311897277834, + "step": 128070 + }, + { + "epoch": 0.5498742089762414, + "grad_norm": 0.0030512872617691755, + "learning_rate": 4.520709191724947e-05, + "loss": 0.4742741107940674, + "step": 128080 + }, + { + "epoch": 0.5499171410662614, + "grad_norm": 0.05324231833219528, + "learning_rate": 4.520278019713185e-05, + "loss": 0.18547524213790895, + "step": 128090 + }, + { + "epoch": 0.5499600731562814, + "grad_norm": 0.0029397367034107447, + "learning_rate": 4.5198468477014224e-05, + "loss": 0.3278725385665894, + "step": 128100 + }, + { + "epoch": 0.5500030052463014, + "grad_norm": 0.012124622240662575, + "learning_rate": 4.51941567568966e-05, + "loss": 0.24240074157714844, + "step": 128110 + }, + { + "epoch": 0.5500459373363215, + "grad_norm": 8.108833312988281, + "learning_rate": 4.518984503677898e-05, + "loss": 0.22392873764038085, + "step": 128120 + }, + { + "epoch": 0.5500888694263414, + "grad_norm": 2.374791383743286, + "learning_rate": 4.518553331666135e-05, + "loss": 0.1306439995765686, + "step": 128130 + }, + { + "epoch": 0.5501318015163614, + "grad_norm": 0.005056803114712238, + "learning_rate": 4.518122159654373e-05, + "loss": 0.13596285581588746, + "step": 128140 + }, + { + "epoch": 0.5501747336063815, + "grad_norm": 0.07786231487989426, + "learning_rate": 4.5176909876426104e-05, + "loss": 0.16290545463562012, + "step": 128150 + }, + { + "epoch": 0.5502176656964014, + "grad_norm": 1.501407265663147, + "learning_rate": 4.517259815630848e-05, + "loss": 0.3343447923660278, + "step": 128160 + }, + { + "epoch": 0.5502605977864214, + "grad_norm": 1.5795830488204956, + "learning_rate": 4.516828643619085e-05, + "loss": 0.39916000366210935, + "step": 128170 + }, + { + "epoch": 0.5503035298764415, + "grad_norm": 0.021032895892858505, + "learning_rate": 4.516397471607323e-05, + "loss": 0.3229613542556763, + "step": 128180 + }, + { + "epoch": 0.5503464619664614, + "grad_norm": 0.8697996735572815, + "learning_rate": 4.515966299595561e-05, + "loss": 0.19066758155822755, + "step": 128190 + }, + { + "epoch": 0.5503893940564815, + "grad_norm": 0.006133753340691328, + "learning_rate": 4.5155351275837984e-05, + "loss": 0.09759909510612488, + "step": 128200 + }, + { + "epoch": 0.5504323261465015, + "grad_norm": 1.7772566080093384, + "learning_rate": 4.515103955572036e-05, + "loss": 0.20648114681243895, + "step": 128210 + }, + { + "epoch": 0.5504752582365214, + "grad_norm": 0.0055733914487063885, + "learning_rate": 4.514672783560274e-05, + "loss": 0.18312152624130248, + "step": 128220 + }, + { + "epoch": 0.5505181903265415, + "grad_norm": 0.5254460573196411, + "learning_rate": 4.5142416115485116e-05, + "loss": 0.21679816246032715, + "step": 128230 + }, + { + "epoch": 0.5505611224165615, + "grad_norm": 0.005690948572009802, + "learning_rate": 4.5138104395367494e-05, + "loss": 0.09224072098731995, + "step": 128240 + }, + { + "epoch": 0.5506040545065815, + "grad_norm": 0.0012284154072403908, + "learning_rate": 4.5133792675249864e-05, + "loss": 0.35671429634094237, + "step": 128250 + }, + { + "epoch": 0.5506469865966015, + "grad_norm": 1.7502591609954834, + "learning_rate": 4.512948095513224e-05, + "loss": 0.07723878622055054, + "step": 128260 + }, + { + "epoch": 0.5506899186866215, + "grad_norm": 0.7807573676109314, + "learning_rate": 4.512516923501462e-05, + "loss": 0.19361660480499268, + "step": 128270 + }, + { + "epoch": 0.5507328507766415, + "grad_norm": 0.0044706715270876884, + "learning_rate": 4.5120857514896996e-05, + "loss": 0.15273451805114746, + "step": 128280 + }, + { + "epoch": 0.5507757828666615, + "grad_norm": 1.5963835716247559, + "learning_rate": 4.511654579477937e-05, + "loss": 0.23731424808502197, + "step": 128290 + }, + { + "epoch": 0.5508187149566816, + "grad_norm": 1.5934256315231323, + "learning_rate": 4.5112234074661744e-05, + "loss": 0.4009242534637451, + "step": 128300 + }, + { + "epoch": 0.5508616470467015, + "grad_norm": 0.028949454426765442, + "learning_rate": 4.510792235454412e-05, + "loss": 0.4464095592498779, + "step": 128310 + }, + { + "epoch": 0.5509045791367215, + "grad_norm": 1.0958473682403564, + "learning_rate": 4.51036106344265e-05, + "loss": 0.22179553508758545, + "step": 128320 + }, + { + "epoch": 0.5509475112267416, + "grad_norm": 0.18953019380569458, + "learning_rate": 4.5099298914308876e-05, + "loss": 0.06211835741996765, + "step": 128330 + }, + { + "epoch": 0.5509904433167615, + "grad_norm": 0.22642917931079865, + "learning_rate": 4.5094987194191253e-05, + "loss": 0.34406998157501223, + "step": 128340 + }, + { + "epoch": 0.5510333754067815, + "grad_norm": 0.35814112424850464, + "learning_rate": 4.509067547407363e-05, + "loss": 0.245259428024292, + "step": 128350 + }, + { + "epoch": 0.5510763074968016, + "grad_norm": 0.060045354068279266, + "learning_rate": 4.508636375395601e-05, + "loss": 0.15209543704986572, + "step": 128360 + }, + { + "epoch": 0.5511192395868215, + "grad_norm": 0.06613177061080933, + "learning_rate": 4.508205203383838e-05, + "loss": 0.2412318468093872, + "step": 128370 + }, + { + "epoch": 0.5511621716768416, + "grad_norm": 3.185753345489502, + "learning_rate": 4.5077740313720756e-05, + "loss": 0.18033461570739745, + "step": 128380 + }, + { + "epoch": 0.5512051037668616, + "grad_norm": 4.417606353759766, + "learning_rate": 4.507342859360313e-05, + "loss": 0.2318558692932129, + "step": 128390 + }, + { + "epoch": 0.5512480358568815, + "grad_norm": 8.20495319366455, + "learning_rate": 4.506911687348551e-05, + "loss": 0.3258568286895752, + "step": 128400 + }, + { + "epoch": 0.5512909679469016, + "grad_norm": 0.00039666148950345814, + "learning_rate": 4.506480515336789e-05, + "loss": 0.3040439605712891, + "step": 128410 + }, + { + "epoch": 0.5513339000369216, + "grad_norm": 0.0841839537024498, + "learning_rate": 4.506049343325026e-05, + "loss": 0.15690793991088867, + "step": 128420 + }, + { + "epoch": 0.5513768321269416, + "grad_norm": 0.011664030142128468, + "learning_rate": 4.5056181713132636e-05, + "loss": 0.061115825176239015, + "step": 128430 + }, + { + "epoch": 0.5514197642169616, + "grad_norm": 2.6365978717803955, + "learning_rate": 4.505186999301502e-05, + "loss": 0.19872939586639404, + "step": 128440 + }, + { + "epoch": 0.5514626963069816, + "grad_norm": 0.004380214959383011, + "learning_rate": 4.50475582728974e-05, + "loss": 0.04639408886432648, + "step": 128450 + }, + { + "epoch": 0.5515056283970016, + "grad_norm": 3.333980083465576, + "learning_rate": 4.504324655277977e-05, + "loss": 0.31141598224639894, + "step": 128460 + }, + { + "epoch": 0.5515485604870216, + "grad_norm": 0.06421362608671188, + "learning_rate": 4.5038934832662145e-05, + "loss": 0.19181085824966432, + "step": 128470 + }, + { + "epoch": 0.5515914925770417, + "grad_norm": 1.3378186225891113, + "learning_rate": 4.503462311254452e-05, + "loss": 0.2731289863586426, + "step": 128480 + }, + { + "epoch": 0.5516344246670617, + "grad_norm": 0.03502466529607773, + "learning_rate": 4.50303113924269e-05, + "loss": 0.08915647268295288, + "step": 128490 + }, + { + "epoch": 0.5516773567570816, + "grad_norm": 0.13968119025230408, + "learning_rate": 4.502599967230927e-05, + "loss": 0.28186616897583006, + "step": 128500 + }, + { + "epoch": 0.5517202888471017, + "grad_norm": 1.2802331447601318, + "learning_rate": 4.502168795219165e-05, + "loss": 0.09446300268173217, + "step": 128510 + }, + { + "epoch": 0.5517632209371217, + "grad_norm": 1.2596136331558228, + "learning_rate": 4.5017376232074025e-05, + "loss": 0.4868049621582031, + "step": 128520 + }, + { + "epoch": 0.5518061530271416, + "grad_norm": 0.017403313890099525, + "learning_rate": 4.50130645119564e-05, + "loss": 0.280033278465271, + "step": 128530 + }, + { + "epoch": 0.5518490851171617, + "grad_norm": 0.016531366854906082, + "learning_rate": 4.500875279183877e-05, + "loss": 0.10965802669525146, + "step": 128540 + }, + { + "epoch": 0.5518920172071817, + "grad_norm": 0.9327275156974792, + "learning_rate": 4.500444107172116e-05, + "loss": 0.20025987625122071, + "step": 128550 + }, + { + "epoch": 0.5519349492972017, + "grad_norm": 1.1419161558151245, + "learning_rate": 4.5000129351603535e-05, + "loss": 0.4087411880493164, + "step": 128560 + }, + { + "epoch": 0.5519778813872217, + "grad_norm": 1.9150222539901733, + "learning_rate": 4.499581763148591e-05, + "loss": 0.31905062198638917, + "step": 128570 + }, + { + "epoch": 0.5520208134772417, + "grad_norm": 0.32453474402427673, + "learning_rate": 4.499150591136828e-05, + "loss": 0.13009634017944335, + "step": 128580 + }, + { + "epoch": 0.5520637455672617, + "grad_norm": 0.012127312831580639, + "learning_rate": 4.498719419125066e-05, + "loss": 0.04768897294998169, + "step": 128590 + }, + { + "epoch": 0.5521066776572817, + "grad_norm": 0.004312812816351652, + "learning_rate": 4.498288247113304e-05, + "loss": 0.22252929210662842, + "step": 128600 + }, + { + "epoch": 0.5521496097473018, + "grad_norm": 1.6124236583709717, + "learning_rate": 4.4978570751015415e-05, + "loss": 0.3631885051727295, + "step": 128610 + }, + { + "epoch": 0.5521925418373217, + "grad_norm": 0.09973174333572388, + "learning_rate": 4.4974259030897785e-05, + "loss": 0.1778208613395691, + "step": 128620 + }, + { + "epoch": 0.5522354739273417, + "grad_norm": 5.448539733886719, + "learning_rate": 4.496994731078016e-05, + "loss": 0.24020824432373047, + "step": 128630 + }, + { + "epoch": 0.5522784060173618, + "grad_norm": 2.429185628890991, + "learning_rate": 4.496563559066254e-05, + "loss": 0.28744187355041506, + "step": 128640 + }, + { + "epoch": 0.5523213381073817, + "grad_norm": 0.15772603452205658, + "learning_rate": 4.496132387054492e-05, + "loss": 0.15476926565170288, + "step": 128650 + }, + { + "epoch": 0.5523642701974018, + "grad_norm": 0.17234504222869873, + "learning_rate": 4.4957012150427295e-05, + "loss": 0.35704262256622316, + "step": 128660 + }, + { + "epoch": 0.5524072022874218, + "grad_norm": 0.0040247696451842785, + "learning_rate": 4.495270043030967e-05, + "loss": 0.24672627449035645, + "step": 128670 + }, + { + "epoch": 0.5524501343774417, + "grad_norm": 0.04398768022656441, + "learning_rate": 4.494838871019205e-05, + "loss": 0.09762682318687439, + "step": 128680 + }, + { + "epoch": 0.5524930664674618, + "grad_norm": 0.058741435408592224, + "learning_rate": 4.4944076990074427e-05, + "loss": 0.1486470341682434, + "step": 128690 + }, + { + "epoch": 0.5525359985574818, + "grad_norm": 0.9128871560096741, + "learning_rate": 4.49397652699568e-05, + "loss": 0.28860721588134763, + "step": 128700 + }, + { + "epoch": 0.5525789306475017, + "grad_norm": 0.007726700510829687, + "learning_rate": 4.4935453549839174e-05, + "loss": 0.1310112237930298, + "step": 128710 + }, + { + "epoch": 0.5526218627375218, + "grad_norm": 4.7020368576049805, + "learning_rate": 4.493114182972155e-05, + "loss": 0.1983258008956909, + "step": 128720 + }, + { + "epoch": 0.5526647948275418, + "grad_norm": 0.29634737968444824, + "learning_rate": 4.492683010960393e-05, + "loss": 0.2613669872283936, + "step": 128730 + }, + { + "epoch": 0.5527077269175618, + "grad_norm": 0.0016307096229866147, + "learning_rate": 4.49225183894863e-05, + "loss": 0.20361201763153075, + "step": 128740 + }, + { + "epoch": 0.5527506590075818, + "grad_norm": 0.008389080874621868, + "learning_rate": 4.491820666936868e-05, + "loss": 0.13784075975418092, + "step": 128750 + }, + { + "epoch": 0.5527935910976018, + "grad_norm": 0.11101187020540237, + "learning_rate": 4.4913894949251054e-05, + "loss": 0.250306224822998, + "step": 128760 + }, + { + "epoch": 0.5528365231876218, + "grad_norm": 2.010310649871826, + "learning_rate": 4.490958322913343e-05, + "loss": 0.21546776294708253, + "step": 128770 + }, + { + "epoch": 0.5528794552776418, + "grad_norm": 0.577353835105896, + "learning_rate": 4.490527150901581e-05, + "loss": 0.04757989346981049, + "step": 128780 + }, + { + "epoch": 0.5529223873676619, + "grad_norm": 2.2937493324279785, + "learning_rate": 4.4900959788898186e-05, + "loss": 0.19977052211761476, + "step": 128790 + }, + { + "epoch": 0.5529653194576818, + "grad_norm": 0.5439838767051697, + "learning_rate": 4.4896648068780564e-05, + "loss": 0.28266804218292235, + "step": 128800 + }, + { + "epoch": 0.5530082515477018, + "grad_norm": 0.027256833389401436, + "learning_rate": 4.489233634866294e-05, + "loss": 0.04045622944831848, + "step": 128810 + }, + { + "epoch": 0.5530511836377219, + "grad_norm": 0.00477907620370388, + "learning_rate": 4.488802462854532e-05, + "loss": 0.1647333025932312, + "step": 128820 + }, + { + "epoch": 0.5530941157277418, + "grad_norm": 0.9636557102203369, + "learning_rate": 4.488371290842769e-05, + "loss": 0.4419291019439697, + "step": 128830 + }, + { + "epoch": 0.5531370478177619, + "grad_norm": 1.1076637506484985, + "learning_rate": 4.4879401188310066e-05, + "loss": 0.3644695520401001, + "step": 128840 + }, + { + "epoch": 0.5531799799077819, + "grad_norm": 1.1289384365081787, + "learning_rate": 4.4875089468192444e-05, + "loss": 0.3530080795288086, + "step": 128850 + }, + { + "epoch": 0.5532229119978018, + "grad_norm": 0.3441978991031647, + "learning_rate": 4.487077774807482e-05, + "loss": 0.19301141500473024, + "step": 128860 + }, + { + "epoch": 0.5532658440878219, + "grad_norm": 2.600874662399292, + "learning_rate": 4.486646602795719e-05, + "loss": 0.33310320377349856, + "step": 128870 + }, + { + "epoch": 0.5533087761778419, + "grad_norm": 0.01902606710791588, + "learning_rate": 4.486215430783957e-05, + "loss": 0.24450154304504396, + "step": 128880 + }, + { + "epoch": 0.5533517082678618, + "grad_norm": 7.304584503173828, + "learning_rate": 4.4857842587721946e-05, + "loss": 0.27731058597564695, + "step": 128890 + }, + { + "epoch": 0.5533946403578819, + "grad_norm": 0.12720587849617004, + "learning_rate": 4.4853530867604324e-05, + "loss": 0.13187975883483888, + "step": 128900 + }, + { + "epoch": 0.5534375724479019, + "grad_norm": 3.016563653945923, + "learning_rate": 4.48492191474867e-05, + "loss": 0.09463203549385071, + "step": 128910 + }, + { + "epoch": 0.553480504537922, + "grad_norm": 0.22556646168231964, + "learning_rate": 4.484490742736908e-05, + "loss": 0.070140540599823, + "step": 128920 + }, + { + "epoch": 0.5535234366279419, + "grad_norm": 0.01902354694902897, + "learning_rate": 4.4840595707251456e-05, + "loss": 0.18325456380844116, + "step": 128930 + }, + { + "epoch": 0.5535663687179619, + "grad_norm": 0.0022118953056633472, + "learning_rate": 4.483628398713383e-05, + "loss": 0.15008280277252198, + "step": 128940 + }, + { + "epoch": 0.553609300807982, + "grad_norm": 4.692033290863037, + "learning_rate": 4.4831972267016204e-05, + "loss": 0.3606921911239624, + "step": 128950 + }, + { + "epoch": 0.5536522328980019, + "grad_norm": 0.025387076660990715, + "learning_rate": 4.482766054689858e-05, + "loss": 0.0033765774220228194, + "step": 128960 + }, + { + "epoch": 0.553695164988022, + "grad_norm": 0.14318892359733582, + "learning_rate": 4.482334882678096e-05, + "loss": 0.122013258934021, + "step": 128970 + }, + { + "epoch": 0.553738097078042, + "grad_norm": 0.019069045782089233, + "learning_rate": 4.4819037106663336e-05, + "loss": 0.40986361503601076, + "step": 128980 + }, + { + "epoch": 0.5537810291680619, + "grad_norm": 0.1801142543554306, + "learning_rate": 4.4814725386545706e-05, + "loss": 0.2598786592483521, + "step": 128990 + }, + { + "epoch": 0.553823961258082, + "grad_norm": 0.0032951515167951584, + "learning_rate": 4.4810413666428084e-05, + "loss": 0.3131296157836914, + "step": 129000 + }, + { + "epoch": 0.553823961258082, + "eval_loss": 0.41541945934295654, + "eval_runtime": 27.1184, + "eval_samples_per_second": 3.688, + "eval_steps_per_second": 3.688, + "step": 129000 + }, + { + "epoch": 0.553866893348102, + "grad_norm": 0.007539977785199881, + "learning_rate": 4.480610194631046e-05, + "loss": 0.13454591035842894, + "step": 129010 + }, + { + "epoch": 0.553909825438122, + "grad_norm": 2.2462775707244873, + "learning_rate": 4.480179022619284e-05, + "loss": 0.3304026126861572, + "step": 129020 + }, + { + "epoch": 0.553952757528142, + "grad_norm": 2.5247340202331543, + "learning_rate": 4.4797478506075216e-05, + "loss": 0.23408877849578857, + "step": 129030 + }, + { + "epoch": 0.553995689618162, + "grad_norm": 0.6959047317504883, + "learning_rate": 4.479316678595759e-05, + "loss": 0.18713444471359253, + "step": 129040 + }, + { + "epoch": 0.554038621708182, + "grad_norm": 2.4266421794891357, + "learning_rate": 4.478885506583997e-05, + "loss": 0.21233630180358887, + "step": 129050 + }, + { + "epoch": 0.554081553798202, + "grad_norm": 1.3284385204315186, + "learning_rate": 4.478454334572235e-05, + "loss": 0.1826010227203369, + "step": 129060 + }, + { + "epoch": 0.554124485888222, + "grad_norm": 3.7686238288879395, + "learning_rate": 4.478023162560472e-05, + "loss": 0.5331307411193847, + "step": 129070 + }, + { + "epoch": 0.554167417978242, + "grad_norm": 0.6449288725852966, + "learning_rate": 4.4775919905487095e-05, + "loss": 0.36653029918670654, + "step": 129080 + }, + { + "epoch": 0.554210350068262, + "grad_norm": 1.2267459630966187, + "learning_rate": 4.477160818536947e-05, + "loss": 0.22919206619262694, + "step": 129090 + }, + { + "epoch": 0.5542532821582821, + "grad_norm": 0.0552271232008934, + "learning_rate": 4.476729646525185e-05, + "loss": 0.07877624034881592, + "step": 129100 + }, + { + "epoch": 0.554296214248302, + "grad_norm": 1.44490647315979, + "learning_rate": 4.476298474513422e-05, + "loss": 0.30919790267944336, + "step": 129110 + }, + { + "epoch": 0.554339146338322, + "grad_norm": 0.7900761961936951, + "learning_rate": 4.47586730250166e-05, + "loss": 0.18931243419647217, + "step": 129120 + }, + { + "epoch": 0.5543820784283421, + "grad_norm": 0.06050022318959236, + "learning_rate": 4.4754361304898975e-05, + "loss": 0.4321730613708496, + "step": 129130 + }, + { + "epoch": 0.554425010518362, + "grad_norm": 0.04257184639573097, + "learning_rate": 4.475004958478135e-05, + "loss": 0.031278005242347716, + "step": 129140 + }, + { + "epoch": 0.5544679426083821, + "grad_norm": 0.23344434797763824, + "learning_rate": 4.474573786466374e-05, + "loss": 0.18850735425949097, + "step": 129150 + }, + { + "epoch": 0.5545108746984021, + "grad_norm": 2.25657320022583, + "learning_rate": 4.474142614454611e-05, + "loss": 0.25844502449035645, + "step": 129160 + }, + { + "epoch": 0.554553806788422, + "grad_norm": 1.844508409500122, + "learning_rate": 4.4737114424428485e-05, + "loss": 0.14059102535247803, + "step": 129170 + }, + { + "epoch": 0.5545967388784421, + "grad_norm": 2.2424442768096924, + "learning_rate": 4.473280270431086e-05, + "loss": 0.3256859064102173, + "step": 129180 + }, + { + "epoch": 0.5546396709684621, + "grad_norm": 2.073718547821045, + "learning_rate": 4.472849098419324e-05, + "loss": 0.1257432818412781, + "step": 129190 + }, + { + "epoch": 0.554682603058482, + "grad_norm": 0.24247653782367706, + "learning_rate": 4.472417926407561e-05, + "loss": 0.3258751153945923, + "step": 129200 + }, + { + "epoch": 0.5547255351485021, + "grad_norm": 0.04378345608711243, + "learning_rate": 4.471986754395799e-05, + "loss": 0.07690662741661072, + "step": 129210 + }, + { + "epoch": 0.5547684672385221, + "grad_norm": 0.0017137116519734263, + "learning_rate": 4.4715555823840365e-05, + "loss": 0.21572604179382324, + "step": 129220 + }, + { + "epoch": 0.5548113993285421, + "grad_norm": 4.380400657653809, + "learning_rate": 4.471124410372274e-05, + "loss": 0.14595119953155516, + "step": 129230 + }, + { + "epoch": 0.5548543314185621, + "grad_norm": 0.11375784128904343, + "learning_rate": 4.470693238360511e-05, + "loss": 0.14446167945861815, + "step": 129240 + }, + { + "epoch": 0.5548972635085822, + "grad_norm": 0.0006020744331181049, + "learning_rate": 4.470262066348749e-05, + "loss": 0.11202117204666137, + "step": 129250 + }, + { + "epoch": 0.5549401955986021, + "grad_norm": 0.47002795338630676, + "learning_rate": 4.4698308943369874e-05, + "loss": 0.24686095714569092, + "step": 129260 + }, + { + "epoch": 0.5549831276886221, + "grad_norm": 0.08183751255273819, + "learning_rate": 4.469399722325225e-05, + "loss": 0.007567648589611053, + "step": 129270 + }, + { + "epoch": 0.5550260597786422, + "grad_norm": 0.013336584903299809, + "learning_rate": 4.468968550313462e-05, + "loss": 0.10117838382720948, + "step": 129280 + }, + { + "epoch": 0.5550689918686621, + "grad_norm": 0.001083645736798644, + "learning_rate": 4.4685373783017e-05, + "loss": 0.2760536909103394, + "step": 129290 + }, + { + "epoch": 0.5551119239586821, + "grad_norm": 5.931659698486328, + "learning_rate": 4.468106206289938e-05, + "loss": 0.4119400978088379, + "step": 129300 + }, + { + "epoch": 0.5551548560487022, + "grad_norm": 0.023911086842417717, + "learning_rate": 4.4676750342781754e-05, + "loss": 0.18249212503433226, + "step": 129310 + }, + { + "epoch": 0.5551977881387221, + "grad_norm": 2.3981242179870605, + "learning_rate": 4.4672438622664125e-05, + "loss": 0.314291787147522, + "step": 129320 + }, + { + "epoch": 0.5552407202287422, + "grad_norm": 0.017882850021123886, + "learning_rate": 4.46681269025465e-05, + "loss": 0.016638435423374176, + "step": 129330 + }, + { + "epoch": 0.5552836523187622, + "grad_norm": 0.0006807830650359392, + "learning_rate": 4.466381518242888e-05, + "loss": 0.26828172206878664, + "step": 129340 + }, + { + "epoch": 0.5553265844087822, + "grad_norm": 0.004719418473541737, + "learning_rate": 4.465950346231126e-05, + "loss": 0.14387308359146117, + "step": 129350 + }, + { + "epoch": 0.5553695164988022, + "grad_norm": 0.0066816494800150394, + "learning_rate": 4.465519174219363e-05, + "loss": 0.14007120132446288, + "step": 129360 + }, + { + "epoch": 0.5554124485888222, + "grad_norm": 0.0010613937629386783, + "learning_rate": 4.465088002207601e-05, + "loss": 0.0076937094330787655, + "step": 129370 + }, + { + "epoch": 0.5554553806788423, + "grad_norm": 1.7576144933700562, + "learning_rate": 4.464656830195839e-05, + "loss": 0.3215049743652344, + "step": 129380 + }, + { + "epoch": 0.5554983127688622, + "grad_norm": 8.515897750854492, + "learning_rate": 4.4642256581840766e-05, + "loss": 0.2446056604385376, + "step": 129390 + }, + { + "epoch": 0.5555412448588822, + "grad_norm": 0.01369958184659481, + "learning_rate": 4.4637944861723137e-05, + "loss": 0.21584432125091552, + "step": 129400 + }, + { + "epoch": 0.5555841769489023, + "grad_norm": 0.005753880832344294, + "learning_rate": 4.4633633141605514e-05, + "loss": 0.062202876806259154, + "step": 129410 + }, + { + "epoch": 0.5556271090389222, + "grad_norm": 0.45579853653907776, + "learning_rate": 4.462932142148789e-05, + "loss": 0.2092193603515625, + "step": 129420 + }, + { + "epoch": 0.5556700411289422, + "grad_norm": 0.007810491602867842, + "learning_rate": 4.462500970137027e-05, + "loss": 0.05864572525024414, + "step": 129430 + }, + { + "epoch": 0.5557129732189623, + "grad_norm": 0.005244900938123465, + "learning_rate": 4.462069798125264e-05, + "loss": 0.20220718383789063, + "step": 129440 + }, + { + "epoch": 0.5557559053089822, + "grad_norm": 1.7433593273162842, + "learning_rate": 4.4616386261135017e-05, + "loss": 0.34491286277770994, + "step": 129450 + }, + { + "epoch": 0.5557988373990023, + "grad_norm": 0.5952501893043518, + "learning_rate": 4.4612074541017394e-05, + "loss": 0.19884873628616334, + "step": 129460 + }, + { + "epoch": 0.5558417694890223, + "grad_norm": 0.05651632323861122, + "learning_rate": 4.460776282089977e-05, + "loss": 0.324522876739502, + "step": 129470 + }, + { + "epoch": 0.5558847015790422, + "grad_norm": 0.29743555188179016, + "learning_rate": 4.460345110078215e-05, + "loss": 0.42862529754638673, + "step": 129480 + }, + { + "epoch": 0.5559276336690623, + "grad_norm": 0.037468746304512024, + "learning_rate": 4.4599139380664526e-05, + "loss": 0.33815324306488037, + "step": 129490 + }, + { + "epoch": 0.5559705657590823, + "grad_norm": 1.3633570671081543, + "learning_rate": 4.45948276605469e-05, + "loss": 0.31829593181610105, + "step": 129500 + }, + { + "epoch": 0.5560134978491023, + "grad_norm": 2.867215633392334, + "learning_rate": 4.459051594042928e-05, + "loss": 0.24856970310211182, + "step": 129510 + }, + { + "epoch": 0.5560564299391223, + "grad_norm": 0.30264541506767273, + "learning_rate": 4.458620422031166e-05, + "loss": 0.1394771933555603, + "step": 129520 + }, + { + "epoch": 0.5560993620291423, + "grad_norm": 3.5270214080810547, + "learning_rate": 4.458189250019403e-05, + "loss": 0.11074033975601197, + "step": 129530 + }, + { + "epoch": 0.5561422941191623, + "grad_norm": 0.8371272087097168, + "learning_rate": 4.4577580780076406e-05, + "loss": 0.15264673233032228, + "step": 129540 + }, + { + "epoch": 0.5561852262091823, + "grad_norm": 0.09782962501049042, + "learning_rate": 4.457326905995878e-05, + "loss": 0.24928109645843505, + "step": 129550 + }, + { + "epoch": 0.5562281582992024, + "grad_norm": 1.7487066984176636, + "learning_rate": 4.456895733984116e-05, + "loss": 0.3725933313369751, + "step": 129560 + }, + { + "epoch": 0.5562710903892223, + "grad_norm": 0.0029988440219312906, + "learning_rate": 4.456464561972353e-05, + "loss": 0.3409987688064575, + "step": 129570 + }, + { + "epoch": 0.5563140224792423, + "grad_norm": 1.3888320922851562, + "learning_rate": 4.456033389960591e-05, + "loss": 0.15952913761138915, + "step": 129580 + }, + { + "epoch": 0.5563569545692624, + "grad_norm": 0.002297930186614394, + "learning_rate": 4.4556022179488286e-05, + "loss": 0.28969786167144773, + "step": 129590 + }, + { + "epoch": 0.5563998866592823, + "grad_norm": 2.3459360599517822, + "learning_rate": 4.455171045937066e-05, + "loss": 0.270216703414917, + "step": 129600 + }, + { + "epoch": 0.5564428187493023, + "grad_norm": 2.2138748168945312, + "learning_rate": 4.454739873925304e-05, + "loss": 0.3445254325866699, + "step": 129610 + }, + { + "epoch": 0.5564857508393224, + "grad_norm": 0.04111243784427643, + "learning_rate": 4.454308701913542e-05, + "loss": 0.24446432590484618, + "step": 129620 + }, + { + "epoch": 0.5565286829293423, + "grad_norm": 6.398478031158447, + "learning_rate": 4.4538775299017795e-05, + "loss": 0.25752198696136475, + "step": 129630 + }, + { + "epoch": 0.5565716150193624, + "grad_norm": 0.03157360851764679, + "learning_rate": 4.453446357890017e-05, + "loss": 0.25403120517730715, + "step": 129640 + }, + { + "epoch": 0.5566145471093824, + "grad_norm": 0.6314763426780701, + "learning_rate": 4.453015185878254e-05, + "loss": 0.2993044376373291, + "step": 129650 + }, + { + "epoch": 0.5566574791994023, + "grad_norm": 0.11549243330955505, + "learning_rate": 4.452584013866492e-05, + "loss": 0.14770009517669677, + "step": 129660 + }, + { + "epoch": 0.5567004112894224, + "grad_norm": 1.7644439935684204, + "learning_rate": 4.45215284185473e-05, + "loss": 0.26133854389190675, + "step": 129670 + }, + { + "epoch": 0.5567433433794424, + "grad_norm": 1.5984668731689453, + "learning_rate": 4.4517216698429675e-05, + "loss": 0.32401697635650634, + "step": 129680 + }, + { + "epoch": 0.5567862754694624, + "grad_norm": 3.6902859210968018, + "learning_rate": 4.4512904978312046e-05, + "loss": 0.19751888513565063, + "step": 129690 + }, + { + "epoch": 0.5568292075594824, + "grad_norm": 0.004566490650177002, + "learning_rate": 4.450859325819442e-05, + "loss": 0.26257147789001467, + "step": 129700 + }, + { + "epoch": 0.5568721396495024, + "grad_norm": 0.012955792248249054, + "learning_rate": 4.45042815380768e-05, + "loss": 0.2490767478942871, + "step": 129710 + }, + { + "epoch": 0.5569150717395224, + "grad_norm": 0.005359560716897249, + "learning_rate": 4.449996981795918e-05, + "loss": 0.18116750717163085, + "step": 129720 + }, + { + "epoch": 0.5569580038295424, + "grad_norm": 0.06892059743404388, + "learning_rate": 4.4495658097841555e-05, + "loss": 0.1894413113594055, + "step": 129730 + }, + { + "epoch": 0.5570009359195625, + "grad_norm": 0.0009415155509486794, + "learning_rate": 4.449134637772393e-05, + "loss": 0.23534340858459474, + "step": 129740 + }, + { + "epoch": 0.5570438680095824, + "grad_norm": 0.2022874653339386, + "learning_rate": 4.448703465760631e-05, + "loss": 0.3457710027694702, + "step": 129750 + }, + { + "epoch": 0.5570868000996024, + "grad_norm": 0.19633248448371887, + "learning_rate": 4.448272293748869e-05, + "loss": 0.12056779861450195, + "step": 129760 + }, + { + "epoch": 0.5571297321896225, + "grad_norm": 0.0035316748544573784, + "learning_rate": 4.447841121737106e-05, + "loss": 0.12530456781387328, + "step": 129770 + }, + { + "epoch": 0.5571726642796425, + "grad_norm": 0.18400853872299194, + "learning_rate": 4.4474099497253435e-05, + "loss": 0.1890039324760437, + "step": 129780 + }, + { + "epoch": 0.5572155963696624, + "grad_norm": 0.07773124426603317, + "learning_rate": 4.446978777713581e-05, + "loss": 0.3870898962020874, + "step": 129790 + }, + { + "epoch": 0.5572585284596825, + "grad_norm": 0.06875111162662506, + "learning_rate": 4.446547605701819e-05, + "loss": 0.32651290893554685, + "step": 129800 + }, + { + "epoch": 0.5573014605497025, + "grad_norm": 1.4054416418075562, + "learning_rate": 4.446116433690056e-05, + "loss": 0.3409204244613647, + "step": 129810 + }, + { + "epoch": 0.5573443926397225, + "grad_norm": 5.326071739196777, + "learning_rate": 4.445685261678294e-05, + "loss": 0.2509272336959839, + "step": 129820 + }, + { + "epoch": 0.5573873247297425, + "grad_norm": 0.06678488850593567, + "learning_rate": 4.4452540896665315e-05, + "loss": 0.09128606915473939, + "step": 129830 + }, + { + "epoch": 0.5574302568197625, + "grad_norm": 2.2432029247283936, + "learning_rate": 4.444822917654769e-05, + "loss": 0.19574520587921143, + "step": 129840 + }, + { + "epoch": 0.5574731889097825, + "grad_norm": 8.737180709838867, + "learning_rate": 4.4443917456430076e-05, + "loss": 0.3267951726913452, + "step": 129850 + }, + { + "epoch": 0.5575161209998025, + "grad_norm": 0.00803994107991457, + "learning_rate": 4.443960573631245e-05, + "loss": 0.36591262817382814, + "step": 129860 + }, + { + "epoch": 0.5575590530898226, + "grad_norm": 1.9719129800796509, + "learning_rate": 4.4435294016194824e-05, + "loss": 0.34435100555419923, + "step": 129870 + }, + { + "epoch": 0.5576019851798425, + "grad_norm": 0.20269423723220825, + "learning_rate": 4.44309822960772e-05, + "loss": 0.1376429319381714, + "step": 129880 + }, + { + "epoch": 0.5576449172698625, + "grad_norm": 0.07807984948158264, + "learning_rate": 4.442667057595958e-05, + "loss": 0.08359581232070923, + "step": 129890 + }, + { + "epoch": 0.5576878493598826, + "grad_norm": 0.1486503630876541, + "learning_rate": 4.442235885584195e-05, + "loss": 0.35685715675354, + "step": 129900 + }, + { + "epoch": 0.5577307814499025, + "grad_norm": 0.018347645178437233, + "learning_rate": 4.441804713572433e-05, + "loss": 0.15195566415786743, + "step": 129910 + }, + { + "epoch": 0.5577737135399226, + "grad_norm": 0.09118858724832535, + "learning_rate": 4.4413735415606704e-05, + "loss": 0.2308788299560547, + "step": 129920 + }, + { + "epoch": 0.5578166456299426, + "grad_norm": 0.5971916913986206, + "learning_rate": 4.440942369548908e-05, + "loss": 0.09032097458839417, + "step": 129930 + }, + { + "epoch": 0.5578595777199625, + "grad_norm": 3.844421625137329, + "learning_rate": 4.440511197537145e-05, + "loss": 0.3063809871673584, + "step": 129940 + }, + { + "epoch": 0.5579025098099826, + "grad_norm": 2.842432975769043, + "learning_rate": 4.440080025525383e-05, + "loss": 0.2679614543914795, + "step": 129950 + }, + { + "epoch": 0.5579454419000026, + "grad_norm": 2.7169439792633057, + "learning_rate": 4.4396488535136214e-05, + "loss": 0.2645352840423584, + "step": 129960 + }, + { + "epoch": 0.5579883739900225, + "grad_norm": 1.920378565788269, + "learning_rate": 4.439217681501859e-05, + "loss": 0.22816011905670167, + "step": 129970 + }, + { + "epoch": 0.5580313060800426, + "grad_norm": 0.016523117199540138, + "learning_rate": 4.438786509490096e-05, + "loss": 0.2725505828857422, + "step": 129980 + }, + { + "epoch": 0.5580742381700626, + "grad_norm": 1.1269477605819702, + "learning_rate": 4.438355337478334e-05, + "loss": 0.27584056854248046, + "step": 129990 + }, + { + "epoch": 0.5581171702600826, + "grad_norm": 0.30185261368751526, + "learning_rate": 4.4379241654665716e-05, + "loss": 0.14589924812316896, + "step": 130000 + }, + { + "epoch": 0.5581171702600826, + "eval_loss": 0.396757572889328, + "eval_runtime": 27.2906, + "eval_samples_per_second": 3.664, + "eval_steps_per_second": 3.664, + "step": 130000 + }, + { + "epoch": 0.5581601023501026, + "grad_norm": 0.13702532649040222, + "learning_rate": 4.4374929934548094e-05, + "loss": 0.3244907379150391, + "step": 130010 + }, + { + "epoch": 0.5582030344401226, + "grad_norm": 1.8142026662826538, + "learning_rate": 4.4370618214430464e-05, + "loss": 0.2708948850631714, + "step": 130020 + }, + { + "epoch": 0.5582459665301426, + "grad_norm": 0.06915269047021866, + "learning_rate": 4.436630649431284e-05, + "loss": 0.2838058710098267, + "step": 130030 + }, + { + "epoch": 0.5582888986201626, + "grad_norm": 0.12360896915197372, + "learning_rate": 4.436199477419522e-05, + "loss": 0.10447860956192016, + "step": 130040 + }, + { + "epoch": 0.5583318307101827, + "grad_norm": 5.960447788238525, + "learning_rate": 4.4357683054077596e-05, + "loss": 0.13814549446105956, + "step": 130050 + }, + { + "epoch": 0.5583747628002026, + "grad_norm": 0.5248103737831116, + "learning_rate": 4.435337133395997e-05, + "loss": 0.21094183921813964, + "step": 130060 + }, + { + "epoch": 0.5584176948902226, + "grad_norm": 0.48741307854652405, + "learning_rate": 4.434905961384235e-05, + "loss": 0.10333422422409058, + "step": 130070 + }, + { + "epoch": 0.5584606269802427, + "grad_norm": 1.0012564659118652, + "learning_rate": 4.434474789372473e-05, + "loss": 0.2197781801223755, + "step": 130080 + }, + { + "epoch": 0.5585035590702626, + "grad_norm": 0.07096364349126816, + "learning_rate": 4.4340436173607105e-05, + "loss": 0.2918126583099365, + "step": 130090 + }, + { + "epoch": 0.5585464911602827, + "grad_norm": 1.9969171285629272, + "learning_rate": 4.4336124453489476e-05, + "loss": 0.15286139249801636, + "step": 130100 + }, + { + "epoch": 0.5585894232503027, + "grad_norm": 0.4066997766494751, + "learning_rate": 4.433181273337185e-05, + "loss": 0.25106277465820315, + "step": 130110 + }, + { + "epoch": 0.5586323553403226, + "grad_norm": 1.3090592622756958, + "learning_rate": 4.432750101325423e-05, + "loss": 0.2082353115081787, + "step": 130120 + }, + { + "epoch": 0.5586752874303427, + "grad_norm": 0.007877206429839134, + "learning_rate": 4.432318929313661e-05, + "loss": 0.22714662551879883, + "step": 130130 + }, + { + "epoch": 0.5587182195203627, + "grad_norm": 0.03949381038546562, + "learning_rate": 4.431887757301898e-05, + "loss": 0.1773894190788269, + "step": 130140 + }, + { + "epoch": 0.5587611516103826, + "grad_norm": 0.2964007556438446, + "learning_rate": 4.4314565852901356e-05, + "loss": 0.056935679912567136, + "step": 130150 + }, + { + "epoch": 0.5588040837004027, + "grad_norm": 3.9917354583740234, + "learning_rate": 4.431025413278373e-05, + "loss": 0.10850996971130371, + "step": 130160 + }, + { + "epoch": 0.5588470157904227, + "grad_norm": 0.019908474758267403, + "learning_rate": 4.430594241266611e-05, + "loss": 0.15592384338378906, + "step": 130170 + }, + { + "epoch": 0.5588899478804427, + "grad_norm": 0.9341161847114563, + "learning_rate": 4.430163069254849e-05, + "loss": 0.11459755897521973, + "step": 130180 + }, + { + "epoch": 0.5589328799704627, + "grad_norm": 0.004427948500961065, + "learning_rate": 4.4297318972430865e-05, + "loss": 0.17416831254959106, + "step": 130190 + }, + { + "epoch": 0.5589758120604827, + "grad_norm": 0.29199692606925964, + "learning_rate": 4.429300725231324e-05, + "loss": 0.4187572956085205, + "step": 130200 + }, + { + "epoch": 0.5590187441505028, + "grad_norm": 2.022859573364258, + "learning_rate": 4.428869553219562e-05, + "loss": 0.11252399682998657, + "step": 130210 + }, + { + "epoch": 0.5590616762405227, + "grad_norm": 2.3882155418395996, + "learning_rate": 4.4284383812078e-05, + "loss": 0.1012803077697754, + "step": 130220 + }, + { + "epoch": 0.5591046083305428, + "grad_norm": 0.0023776378948241472, + "learning_rate": 4.428007209196037e-05, + "loss": 0.3746946811676025, + "step": 130230 + }, + { + "epoch": 0.5591475404205628, + "grad_norm": 1.6952602863311768, + "learning_rate": 4.4275760371842745e-05, + "loss": 0.28969037532806396, + "step": 130240 + }, + { + "epoch": 0.5591904725105827, + "grad_norm": 0.619696319103241, + "learning_rate": 4.427144865172512e-05, + "loss": 0.22678766250610352, + "step": 130250 + }, + { + "epoch": 0.5592334046006028, + "grad_norm": 8.467891693115234, + "learning_rate": 4.42671369316075e-05, + "loss": 0.4200021743774414, + "step": 130260 + }, + { + "epoch": 0.5592763366906228, + "grad_norm": 5.092950820922852, + "learning_rate": 4.426282521148987e-05, + "loss": 0.22055649757385254, + "step": 130270 + }, + { + "epoch": 0.5593192687806428, + "grad_norm": 0.061911582946777344, + "learning_rate": 4.425851349137225e-05, + "loss": 0.2685711145401001, + "step": 130280 + }, + { + "epoch": 0.5593622008706628, + "grad_norm": 2.5211803913116455, + "learning_rate": 4.4254201771254625e-05, + "loss": 0.2446134328842163, + "step": 130290 + }, + { + "epoch": 0.5594051329606828, + "grad_norm": 0.7915776371955872, + "learning_rate": 4.4249890051137e-05, + "loss": 0.12928345203399658, + "step": 130300 + }, + { + "epoch": 0.5594480650507028, + "grad_norm": 1.423632025718689, + "learning_rate": 4.424557833101938e-05, + "loss": 0.11890238523483276, + "step": 130310 + }, + { + "epoch": 0.5594909971407228, + "grad_norm": 5.311061382293701, + "learning_rate": 4.424126661090176e-05, + "loss": 0.16063714027404785, + "step": 130320 + }, + { + "epoch": 0.5595339292307429, + "grad_norm": 2.1655609607696533, + "learning_rate": 4.4236954890784135e-05, + "loss": 0.4018458366394043, + "step": 130330 + }, + { + "epoch": 0.5595768613207628, + "grad_norm": 0.07226397097110748, + "learning_rate": 4.423264317066651e-05, + "loss": 0.02289922386407852, + "step": 130340 + }, + { + "epoch": 0.5596197934107828, + "grad_norm": 1.9848915338516235, + "learning_rate": 4.422833145054888e-05, + "loss": 0.2881872892379761, + "step": 130350 + }, + { + "epoch": 0.5596627255008029, + "grad_norm": 0.004091145936399698, + "learning_rate": 4.422401973043126e-05, + "loss": 0.16486506462097167, + "step": 130360 + }, + { + "epoch": 0.5597056575908228, + "grad_norm": 0.008164674043655396, + "learning_rate": 4.421970801031364e-05, + "loss": 0.039905214309692384, + "step": 130370 + }, + { + "epoch": 0.5597485896808428, + "grad_norm": 0.020558584481477737, + "learning_rate": 4.4215396290196015e-05, + "loss": 0.10911152362823487, + "step": 130380 + }, + { + "epoch": 0.5597915217708629, + "grad_norm": 0.06221897155046463, + "learning_rate": 4.4211084570078385e-05, + "loss": 0.22285866737365723, + "step": 130390 + }, + { + "epoch": 0.5598344538608828, + "grad_norm": 6.663233280181885, + "learning_rate": 4.420677284996076e-05, + "loss": 0.22175629138946534, + "step": 130400 + }, + { + "epoch": 0.5598773859509029, + "grad_norm": 0.8797337412834167, + "learning_rate": 4.420246112984314e-05, + "loss": 0.20046689510345458, + "step": 130410 + }, + { + "epoch": 0.5599203180409229, + "grad_norm": 6.092191696166992, + "learning_rate": 4.419814940972552e-05, + "loss": 0.27480373382568357, + "step": 130420 + }, + { + "epoch": 0.5599632501309428, + "grad_norm": 0.015885712578892708, + "learning_rate": 4.4193837689607894e-05, + "loss": 0.09221312403678894, + "step": 130430 + }, + { + "epoch": 0.5600061822209629, + "grad_norm": 1.9088473320007324, + "learning_rate": 4.418952596949027e-05, + "loss": 0.42419586181640623, + "step": 130440 + }, + { + "epoch": 0.5600491143109829, + "grad_norm": 6.146290302276611, + "learning_rate": 4.418521424937265e-05, + "loss": 0.2966385126113892, + "step": 130450 + }, + { + "epoch": 0.5600920464010029, + "grad_norm": 0.005371685605496168, + "learning_rate": 4.4180902529255027e-05, + "loss": 0.19647728204727172, + "step": 130460 + }, + { + "epoch": 0.5601349784910229, + "grad_norm": 0.14021477103233337, + "learning_rate": 4.41765908091374e-05, + "loss": 0.24573662281036376, + "step": 130470 + }, + { + "epoch": 0.5601779105810429, + "grad_norm": 1.327796220779419, + "learning_rate": 4.4172279089019774e-05, + "loss": 0.3573757648468018, + "step": 130480 + }, + { + "epoch": 0.5602208426710629, + "grad_norm": 1.4445470571517944, + "learning_rate": 4.416796736890215e-05, + "loss": 0.2246248245239258, + "step": 130490 + }, + { + "epoch": 0.5602637747610829, + "grad_norm": 0.6600468158721924, + "learning_rate": 4.416365564878453e-05, + "loss": 0.15008944272994995, + "step": 130500 + }, + { + "epoch": 0.560306706851103, + "grad_norm": 0.23366093635559082, + "learning_rate": 4.41593439286669e-05, + "loss": 0.3061052799224854, + "step": 130510 + }, + { + "epoch": 0.5603496389411229, + "grad_norm": 2.143343687057495, + "learning_rate": 4.415503220854928e-05, + "loss": 0.2754983425140381, + "step": 130520 + }, + { + "epoch": 0.5603925710311429, + "grad_norm": 0.9091231822967529, + "learning_rate": 4.4150720488431654e-05, + "loss": 0.17608437538146973, + "step": 130530 + }, + { + "epoch": 0.560435503121163, + "grad_norm": 0.24302370846271515, + "learning_rate": 4.414640876831403e-05, + "loss": 0.21974198818206786, + "step": 130540 + }, + { + "epoch": 0.5604784352111829, + "grad_norm": 0.24109004437923431, + "learning_rate": 4.414209704819641e-05, + "loss": 0.2817169427871704, + "step": 130550 + }, + { + "epoch": 0.5605213673012029, + "grad_norm": 2.0368618965148926, + "learning_rate": 4.4137785328078786e-05, + "loss": 0.2441883087158203, + "step": 130560 + }, + { + "epoch": 0.560564299391223, + "grad_norm": 3.77104115486145, + "learning_rate": 4.4133473607961164e-05, + "loss": 0.11188125610351562, + "step": 130570 + }, + { + "epoch": 0.5606072314812429, + "grad_norm": 0.8214830160140991, + "learning_rate": 4.412916188784354e-05, + "loss": 0.428134822845459, + "step": 130580 + }, + { + "epoch": 0.560650163571263, + "grad_norm": 1.9323092699050903, + "learning_rate": 4.412485016772592e-05, + "loss": 0.05530650019645691, + "step": 130590 + }, + { + "epoch": 0.560693095661283, + "grad_norm": 0.003310135565698147, + "learning_rate": 4.412053844760829e-05, + "loss": 0.2841073751449585, + "step": 130600 + }, + { + "epoch": 0.5607360277513029, + "grad_norm": 0.0028873884584754705, + "learning_rate": 4.4116226727490666e-05, + "loss": 0.2064734935760498, + "step": 130610 + }, + { + "epoch": 0.560778959841323, + "grad_norm": 0.4137687087059021, + "learning_rate": 4.4111915007373044e-05, + "loss": 0.15328125953674315, + "step": 130620 + }, + { + "epoch": 0.560821891931343, + "grad_norm": 0.1398584544658661, + "learning_rate": 4.410760328725542e-05, + "loss": 0.20579054355621337, + "step": 130630 + }, + { + "epoch": 0.5608648240213631, + "grad_norm": 0.7580888271331787, + "learning_rate": 4.410329156713779e-05, + "loss": 0.3941477298736572, + "step": 130640 + }, + { + "epoch": 0.560907756111383, + "grad_norm": 1.2244528532028198, + "learning_rate": 4.409897984702017e-05, + "loss": 0.13406124114990234, + "step": 130650 + }, + { + "epoch": 0.560950688201403, + "grad_norm": 0.018945829942822456, + "learning_rate": 4.409466812690255e-05, + "loss": 0.09961066246032715, + "step": 130660 + }, + { + "epoch": 0.5609936202914231, + "grad_norm": 6.098886013031006, + "learning_rate": 4.409035640678493e-05, + "loss": 0.3770725727081299, + "step": 130670 + }, + { + "epoch": 0.561036552381443, + "grad_norm": 13.162020683288574, + "learning_rate": 4.40860446866673e-05, + "loss": 0.06884355545043945, + "step": 130680 + }, + { + "epoch": 0.561079484471463, + "grad_norm": 0.17313456535339355, + "learning_rate": 4.408173296654968e-05, + "loss": 0.22985308170318602, + "step": 130690 + }, + { + "epoch": 0.5611224165614831, + "grad_norm": 0.05977223441004753, + "learning_rate": 4.4077421246432056e-05, + "loss": 0.1930696964263916, + "step": 130700 + }, + { + "epoch": 0.561165348651503, + "grad_norm": 0.021493813022971153, + "learning_rate": 4.407310952631443e-05, + "loss": 0.11986234188079833, + "step": 130710 + }, + { + "epoch": 0.5612082807415231, + "grad_norm": 2.4718406200408936, + "learning_rate": 4.4068797806196804e-05, + "loss": 0.13716913461685182, + "step": 130720 + }, + { + "epoch": 0.5612512128315431, + "grad_norm": 0.5843445062637329, + "learning_rate": 4.406448608607918e-05, + "loss": 0.2553562641143799, + "step": 130730 + }, + { + "epoch": 0.561294144921563, + "grad_norm": 0.7953113317489624, + "learning_rate": 4.406017436596156e-05, + "loss": 0.15884344577789306, + "step": 130740 + }, + { + "epoch": 0.5613370770115831, + "grad_norm": 0.041502032428979874, + "learning_rate": 4.4055862645843936e-05, + "loss": 0.5208531856536865, + "step": 130750 + }, + { + "epoch": 0.5613800091016031, + "grad_norm": 7.92971658706665, + "learning_rate": 4.4051550925726306e-05, + "loss": 0.31333098411560056, + "step": 130760 + }, + { + "epoch": 0.5614229411916231, + "grad_norm": 0.10923028737306595, + "learning_rate": 4.404723920560869e-05, + "loss": 0.24447739124298096, + "step": 130770 + }, + { + "epoch": 0.5614658732816431, + "grad_norm": 0.002735383342951536, + "learning_rate": 4.404292748549107e-05, + "loss": 0.26939570903778076, + "step": 130780 + }, + { + "epoch": 0.5615088053716631, + "grad_norm": 0.04318393021821976, + "learning_rate": 4.4038615765373445e-05, + "loss": 0.2544058322906494, + "step": 130790 + }, + { + "epoch": 0.5615517374616831, + "grad_norm": 0.25320038199424744, + "learning_rate": 4.4034304045255816e-05, + "loss": 0.028018417954444885, + "step": 130800 + }, + { + "epoch": 0.5615946695517031, + "grad_norm": 0.05331530421972275, + "learning_rate": 4.402999232513819e-05, + "loss": 0.3315607786178589, + "step": 130810 + }, + { + "epoch": 0.5616376016417232, + "grad_norm": 0.031914785504341125, + "learning_rate": 4.402568060502057e-05, + "loss": 0.33244547843933103, + "step": 130820 + }, + { + "epoch": 0.5616805337317431, + "grad_norm": 0.039703018963336945, + "learning_rate": 4.402136888490295e-05, + "loss": 0.2509101390838623, + "step": 130830 + }, + { + "epoch": 0.5617234658217631, + "grad_norm": 10.16286563873291, + "learning_rate": 4.401705716478532e-05, + "loss": 0.11450953483581543, + "step": 130840 + }, + { + "epoch": 0.5617663979117832, + "grad_norm": 4.78218936920166, + "learning_rate": 4.4012745444667695e-05, + "loss": 0.035375076532363894, + "step": 130850 + }, + { + "epoch": 0.5618093300018031, + "grad_norm": 0.626234769821167, + "learning_rate": 4.400843372455007e-05, + "loss": 0.11089576482772827, + "step": 130860 + }, + { + "epoch": 0.5618522620918232, + "grad_norm": 0.020610149949789047, + "learning_rate": 4.400412200443245e-05, + "loss": 0.2556567430496216, + "step": 130870 + }, + { + "epoch": 0.5618951941818432, + "grad_norm": 5.132352352142334, + "learning_rate": 4.399981028431483e-05, + "loss": 0.3633986473083496, + "step": 130880 + }, + { + "epoch": 0.5619381262718631, + "grad_norm": 0.013127193786203861, + "learning_rate": 4.3995498564197205e-05, + "loss": 0.13249205350875853, + "step": 130890 + }, + { + "epoch": 0.5619810583618832, + "grad_norm": 0.0006753028719685972, + "learning_rate": 4.399118684407958e-05, + "loss": 0.09555227160453797, + "step": 130900 + }, + { + "epoch": 0.5620239904519032, + "grad_norm": 1.3687077760696411, + "learning_rate": 4.398687512396196e-05, + "loss": 0.6257006645202636, + "step": 130910 + }, + { + "epoch": 0.5620669225419231, + "grad_norm": 0.6645755171775818, + "learning_rate": 4.398256340384433e-05, + "loss": 0.22954320907592773, + "step": 130920 + }, + { + "epoch": 0.5621098546319432, + "grad_norm": 0.013810350559651852, + "learning_rate": 4.397825168372671e-05, + "loss": 0.08926523327827454, + "step": 130930 + }, + { + "epoch": 0.5621527867219632, + "grad_norm": 3.6319634914398193, + "learning_rate": 4.3973939963609085e-05, + "loss": 0.23496651649475098, + "step": 130940 + }, + { + "epoch": 0.5621957188119832, + "grad_norm": 1.0557408332824707, + "learning_rate": 4.396962824349146e-05, + "loss": 0.17628427743911743, + "step": 130950 + }, + { + "epoch": 0.5622386509020032, + "grad_norm": 0.4414917230606079, + "learning_rate": 4.396531652337384e-05, + "loss": 0.19747617244720458, + "step": 130960 + }, + { + "epoch": 0.5622815829920232, + "grad_norm": 0.00444777961820364, + "learning_rate": 4.396100480325621e-05, + "loss": 0.1586161732673645, + "step": 130970 + }, + { + "epoch": 0.5623245150820432, + "grad_norm": 0.053738441318273544, + "learning_rate": 4.395669308313859e-05, + "loss": 0.25755629539489744, + "step": 130980 + }, + { + "epoch": 0.5623674471720632, + "grad_norm": 0.2864459156990051, + "learning_rate": 4.3952381363020965e-05, + "loss": 0.02709081768989563, + "step": 130990 + }, + { + "epoch": 0.5624103792620833, + "grad_norm": 0.0064376420341432095, + "learning_rate": 4.394806964290334e-05, + "loss": 0.2746001958847046, + "step": 131000 + }, + { + "epoch": 0.5624103792620833, + "eval_loss": 0.3990839719772339, + "eval_runtime": 27.1241, + "eval_samples_per_second": 3.687, + "eval_steps_per_second": 3.687, + "step": 131000 + }, + { + "epoch": 0.5624533113521032, + "grad_norm": 0.8062077164649963, + "learning_rate": 4.394375792278572e-05, + "loss": 0.27772514820098876, + "step": 131010 + }, + { + "epoch": 0.5624962434421232, + "grad_norm": 0.6455739736557007, + "learning_rate": 4.39394462026681e-05, + "loss": 0.02453451752662659, + "step": 131020 + }, + { + "epoch": 0.5625391755321433, + "grad_norm": 0.08529993891716003, + "learning_rate": 4.3935134482550474e-05, + "loss": 0.1866888642311096, + "step": 131030 + }, + { + "epoch": 0.5625821076221632, + "grad_norm": 1.3479008674621582, + "learning_rate": 4.393082276243285e-05, + "loss": 0.33279035091400144, + "step": 131040 + }, + { + "epoch": 0.5626250397121833, + "grad_norm": 15.479037284851074, + "learning_rate": 4.392651104231522e-05, + "loss": 0.20916690826416015, + "step": 131050 + }, + { + "epoch": 0.5626679718022033, + "grad_norm": 0.09797138720750809, + "learning_rate": 4.39221993221976e-05, + "loss": 0.04719350934028625, + "step": 131060 + }, + { + "epoch": 0.5627109038922233, + "grad_norm": 3.2566111087799072, + "learning_rate": 4.391788760207998e-05, + "loss": 0.23531157970428468, + "step": 131070 + }, + { + "epoch": 0.5627538359822433, + "grad_norm": 0.12084054201841354, + "learning_rate": 4.3913575881962354e-05, + "loss": 0.11477060317993164, + "step": 131080 + }, + { + "epoch": 0.5627967680722633, + "grad_norm": 9.098376274108887, + "learning_rate": 4.3909264161844725e-05, + "loss": 0.3104034185409546, + "step": 131090 + }, + { + "epoch": 0.5628397001622834, + "grad_norm": 0.09239120036363602, + "learning_rate": 4.39049524417271e-05, + "loss": 0.16618393659591674, + "step": 131100 + }, + { + "epoch": 0.5628826322523033, + "grad_norm": 1.3476864099502563, + "learning_rate": 4.390064072160948e-05, + "loss": 0.11332845687866211, + "step": 131110 + }, + { + "epoch": 0.5629255643423233, + "grad_norm": 0.9282361268997192, + "learning_rate": 4.3896329001491857e-05, + "loss": 0.25531725883483886, + "step": 131120 + }, + { + "epoch": 0.5629684964323434, + "grad_norm": 0.039876118302345276, + "learning_rate": 4.3892017281374234e-05, + "loss": 0.19124466180801392, + "step": 131130 + }, + { + "epoch": 0.5630114285223633, + "grad_norm": 0.0030488877091556787, + "learning_rate": 4.388770556125661e-05, + "loss": 0.2110595464706421, + "step": 131140 + }, + { + "epoch": 0.5630543606123833, + "grad_norm": 0.10496467351913452, + "learning_rate": 4.388339384113899e-05, + "loss": 0.33485052585601804, + "step": 131150 + }, + { + "epoch": 0.5630972927024034, + "grad_norm": 0.3290857970714569, + "learning_rate": 4.3879082121021366e-05, + "loss": 0.1390450119972229, + "step": 131160 + }, + { + "epoch": 0.5631402247924233, + "grad_norm": 0.0027307202108204365, + "learning_rate": 4.3874770400903737e-05, + "loss": 0.3555908679962158, + "step": 131170 + }, + { + "epoch": 0.5631831568824434, + "grad_norm": 10.344257354736328, + "learning_rate": 4.3870458680786114e-05, + "loss": 0.31735968589782715, + "step": 131180 + }, + { + "epoch": 0.5632260889724634, + "grad_norm": 0.0717422142624855, + "learning_rate": 4.386614696066849e-05, + "loss": 0.20104632377624512, + "step": 131190 + }, + { + "epoch": 0.5632690210624833, + "grad_norm": 0.003424069145694375, + "learning_rate": 4.386183524055087e-05, + "loss": 0.2732577323913574, + "step": 131200 + }, + { + "epoch": 0.5633119531525034, + "grad_norm": 2.7449123859405518, + "learning_rate": 4.385752352043324e-05, + "loss": 0.2550164222717285, + "step": 131210 + }, + { + "epoch": 0.5633548852425234, + "grad_norm": 0.03599822148680687, + "learning_rate": 4.3853211800315616e-05, + "loss": 0.19458101987838744, + "step": 131220 + }, + { + "epoch": 0.5633978173325433, + "grad_norm": 0.17414642870426178, + "learning_rate": 4.3848900080197994e-05, + "loss": 0.22085530757904054, + "step": 131230 + }, + { + "epoch": 0.5634407494225634, + "grad_norm": 24.48627281188965, + "learning_rate": 4.384458836008037e-05, + "loss": 0.134981906414032, + "step": 131240 + }, + { + "epoch": 0.5634836815125834, + "grad_norm": 0.002881893888115883, + "learning_rate": 4.384027663996275e-05, + "loss": 0.2002098321914673, + "step": 131250 + }, + { + "epoch": 0.5635266136026034, + "grad_norm": 0.007777002640068531, + "learning_rate": 4.3835964919845126e-05, + "loss": 0.13187239170074463, + "step": 131260 + }, + { + "epoch": 0.5635695456926234, + "grad_norm": 0.005385277327150106, + "learning_rate": 4.38316531997275e-05, + "loss": 0.07089147567749024, + "step": 131270 + }, + { + "epoch": 0.5636124777826435, + "grad_norm": 0.007684089709073305, + "learning_rate": 4.382734147960988e-05, + "loss": 0.33400042057037355, + "step": 131280 + }, + { + "epoch": 0.5636554098726634, + "grad_norm": 0.18498967587947845, + "learning_rate": 4.382302975949225e-05, + "loss": 0.19423152208328248, + "step": 131290 + }, + { + "epoch": 0.5636983419626834, + "grad_norm": 0.3919488489627838, + "learning_rate": 4.381871803937463e-05, + "loss": 0.20056331157684326, + "step": 131300 + }, + { + "epoch": 0.5637412740527035, + "grad_norm": 0.9636663794517517, + "learning_rate": 4.3814406319257006e-05, + "loss": 0.08319100737571716, + "step": 131310 + }, + { + "epoch": 0.5637842061427234, + "grad_norm": 0.06585653126239777, + "learning_rate": 4.381009459913938e-05, + "loss": 0.1678343176841736, + "step": 131320 + }, + { + "epoch": 0.5638271382327434, + "grad_norm": 0.20225432515144348, + "learning_rate": 4.380578287902176e-05, + "loss": 0.16132519245147706, + "step": 131330 + }, + { + "epoch": 0.5638700703227635, + "grad_norm": 1.0769022703170776, + "learning_rate": 4.380147115890413e-05, + "loss": 0.5539116382598877, + "step": 131340 + }, + { + "epoch": 0.5639130024127834, + "grad_norm": 0.03397184610366821, + "learning_rate": 4.379715943878651e-05, + "loss": 0.30623517036437986, + "step": 131350 + }, + { + "epoch": 0.5639559345028035, + "grad_norm": 0.07383184134960175, + "learning_rate": 4.3792847718668886e-05, + "loss": 0.15672016143798828, + "step": 131360 + }, + { + "epoch": 0.5639988665928235, + "grad_norm": 0.23466603457927704, + "learning_rate": 4.378853599855127e-05, + "loss": 0.193856680393219, + "step": 131370 + }, + { + "epoch": 0.5640417986828434, + "grad_norm": 0.0022643166594207287, + "learning_rate": 4.378422427843364e-05, + "loss": 0.028151240944862366, + "step": 131380 + }, + { + "epoch": 0.5640847307728635, + "grad_norm": 0.020985648036003113, + "learning_rate": 4.377991255831602e-05, + "loss": 0.24543752670288085, + "step": 131390 + }, + { + "epoch": 0.5641276628628835, + "grad_norm": 1.4199275970458984, + "learning_rate": 4.3775600838198395e-05, + "loss": 0.3444381237030029, + "step": 131400 + }, + { + "epoch": 0.5641705949529034, + "grad_norm": 0.010642913170158863, + "learning_rate": 4.377128911808077e-05, + "loss": 0.0823455810546875, + "step": 131410 + }, + { + "epoch": 0.5642135270429235, + "grad_norm": 0.0010722661390900612, + "learning_rate": 4.376697739796314e-05, + "loss": 0.15871133804321289, + "step": 131420 + }, + { + "epoch": 0.5642564591329435, + "grad_norm": 57.9321174621582, + "learning_rate": 4.376266567784552e-05, + "loss": 0.14185404777526855, + "step": 131430 + }, + { + "epoch": 0.5642993912229635, + "grad_norm": 2.7046358585357666, + "learning_rate": 4.37583539577279e-05, + "loss": 0.41197805404663085, + "step": 131440 + }, + { + "epoch": 0.5643423233129835, + "grad_norm": 0.12308663129806519, + "learning_rate": 4.3754042237610275e-05, + "loss": 0.2550692319869995, + "step": 131450 + }, + { + "epoch": 0.5643852554030035, + "grad_norm": 2.411984920501709, + "learning_rate": 4.3749730517492646e-05, + "loss": 0.42504286766052246, + "step": 131460 + }, + { + "epoch": 0.5644281874930235, + "grad_norm": 0.002382261911407113, + "learning_rate": 4.374541879737502e-05, + "loss": 0.1963081955909729, + "step": 131470 + }, + { + "epoch": 0.5644711195830435, + "grad_norm": 0.0080162538215518, + "learning_rate": 4.374110707725741e-05, + "loss": 0.29774773120880127, + "step": 131480 + }, + { + "epoch": 0.5645140516730636, + "grad_norm": 2.8535964488983154, + "learning_rate": 4.3736795357139784e-05, + "loss": 0.2715657472610474, + "step": 131490 + }, + { + "epoch": 0.5645569837630836, + "grad_norm": 0.12202345579862595, + "learning_rate": 4.3732483637022155e-05, + "loss": 0.12172391414642333, + "step": 131500 + }, + { + "epoch": 0.5645999158531035, + "grad_norm": 0.040142424404621124, + "learning_rate": 4.372817191690453e-05, + "loss": 0.2592360496520996, + "step": 131510 + }, + { + "epoch": 0.5646428479431236, + "grad_norm": 1.4805283546447754, + "learning_rate": 4.372386019678691e-05, + "loss": 0.2895283460617065, + "step": 131520 + }, + { + "epoch": 0.5646857800331436, + "grad_norm": 1.0980712175369263, + "learning_rate": 4.371954847666929e-05, + "loss": 0.26518375873565675, + "step": 131530 + }, + { + "epoch": 0.5647287121231636, + "grad_norm": 0.09577146172523499, + "learning_rate": 4.371523675655166e-05, + "loss": 0.037435561418533325, + "step": 131540 + }, + { + "epoch": 0.5647716442131836, + "grad_norm": 0.04082731157541275, + "learning_rate": 4.3710925036434035e-05, + "loss": 0.26807751655578616, + "step": 131550 + }, + { + "epoch": 0.5648145763032036, + "grad_norm": 0.017090164124965668, + "learning_rate": 4.370661331631641e-05, + "loss": 0.1559891700744629, + "step": 131560 + }, + { + "epoch": 0.5648575083932236, + "grad_norm": 1.3990321159362793, + "learning_rate": 4.370230159619879e-05, + "loss": 0.4341771125793457, + "step": 131570 + }, + { + "epoch": 0.5649004404832436, + "grad_norm": 0.31079280376434326, + "learning_rate": 4.369798987608116e-05, + "loss": 0.12254334688186645, + "step": 131580 + }, + { + "epoch": 0.5649433725732637, + "grad_norm": 1.2702585458755493, + "learning_rate": 4.3693678155963544e-05, + "loss": 0.13572754859924316, + "step": 131590 + }, + { + "epoch": 0.5649863046632836, + "grad_norm": 0.020670155063271523, + "learning_rate": 4.368936643584592e-05, + "loss": 0.10454627275466918, + "step": 131600 + }, + { + "epoch": 0.5650292367533036, + "grad_norm": 0.010946854017674923, + "learning_rate": 4.36850547157283e-05, + "loss": 0.1574021100997925, + "step": 131610 + }, + { + "epoch": 0.5650721688433237, + "grad_norm": 0.022414082661271095, + "learning_rate": 4.368074299561067e-05, + "loss": 0.16641606092453004, + "step": 131620 + }, + { + "epoch": 0.5651151009333436, + "grad_norm": 0.07762713730335236, + "learning_rate": 4.367643127549305e-05, + "loss": 0.3286291122436523, + "step": 131630 + }, + { + "epoch": 0.5651580330233636, + "grad_norm": 0.0033479107078164816, + "learning_rate": 4.3672119555375424e-05, + "loss": 0.12209920883178711, + "step": 131640 + }, + { + "epoch": 0.5652009651133837, + "grad_norm": 0.0006103302584961057, + "learning_rate": 4.36678078352578e-05, + "loss": 0.03355185687541962, + "step": 131650 + }, + { + "epoch": 0.5652438972034036, + "grad_norm": 0.013331168331205845, + "learning_rate": 4.366349611514017e-05, + "loss": 0.15870726108551025, + "step": 131660 + }, + { + "epoch": 0.5652868292934237, + "grad_norm": 0.006360860541462898, + "learning_rate": 4.365918439502255e-05, + "loss": 0.20273292064666748, + "step": 131670 + }, + { + "epoch": 0.5653297613834437, + "grad_norm": 0.0005006656865589321, + "learning_rate": 4.365487267490493e-05, + "loss": 0.19195096492767333, + "step": 131680 + }, + { + "epoch": 0.5653726934734636, + "grad_norm": 0.000357257726136595, + "learning_rate": 4.3650560954787304e-05, + "loss": 0.16304749250411987, + "step": 131690 + }, + { + "epoch": 0.5654156255634837, + "grad_norm": 0.013322567567229271, + "learning_rate": 4.364624923466968e-05, + "loss": 0.37918355464935305, + "step": 131700 + }, + { + "epoch": 0.5654585576535037, + "grad_norm": 0.0018318496877327561, + "learning_rate": 4.364193751455206e-05, + "loss": 0.3940425395965576, + "step": 131710 + }, + { + "epoch": 0.5655014897435237, + "grad_norm": 1.2409061193466187, + "learning_rate": 4.3637625794434436e-05, + "loss": 0.3135698318481445, + "step": 131720 + }, + { + "epoch": 0.5655444218335437, + "grad_norm": 0.11461956053972244, + "learning_rate": 4.3633314074316814e-05, + "loss": 0.19950928688049316, + "step": 131730 + }, + { + "epoch": 0.5655873539235637, + "grad_norm": 0.07321076840162277, + "learning_rate": 4.362900235419919e-05, + "loss": 0.08899721503257751, + "step": 131740 + }, + { + "epoch": 0.5656302860135837, + "grad_norm": 0.00024293846217915416, + "learning_rate": 4.362469063408156e-05, + "loss": 0.38968701362609864, + "step": 131750 + }, + { + "epoch": 0.5656732181036037, + "grad_norm": 0.00021523365285247564, + "learning_rate": 4.362037891396394e-05, + "loss": 0.2745439767837524, + "step": 131760 + }, + { + "epoch": 0.5657161501936238, + "grad_norm": 0.29328349232673645, + "learning_rate": 4.3616067193846316e-05, + "loss": 0.2557271957397461, + "step": 131770 + }, + { + "epoch": 0.5657590822836437, + "grad_norm": 3.7032077312469482, + "learning_rate": 4.3611755473728693e-05, + "loss": 0.2522104263305664, + "step": 131780 + }, + { + "epoch": 0.5658020143736637, + "grad_norm": 0.000676521216519177, + "learning_rate": 4.3607443753611064e-05, + "loss": 0.23237941265106202, + "step": 131790 + }, + { + "epoch": 0.5658449464636838, + "grad_norm": 1.2005796432495117, + "learning_rate": 4.360313203349344e-05, + "loss": 0.22138490676879882, + "step": 131800 + }, + { + "epoch": 0.5658878785537037, + "grad_norm": 0.49399295449256897, + "learning_rate": 4.359882031337582e-05, + "loss": 0.14007023572921753, + "step": 131810 + }, + { + "epoch": 0.5659308106437237, + "grad_norm": 0.01492351945489645, + "learning_rate": 4.3594508593258196e-05, + "loss": 0.08878965377807617, + "step": 131820 + }, + { + "epoch": 0.5659737427337438, + "grad_norm": 0.2737496495246887, + "learning_rate": 4.3590196873140573e-05, + "loss": 0.11913632154464722, + "step": 131830 + }, + { + "epoch": 0.5660166748237637, + "grad_norm": 0.0028132752049714327, + "learning_rate": 4.358588515302295e-05, + "loss": 0.1496422529220581, + "step": 131840 + }, + { + "epoch": 0.5660596069137838, + "grad_norm": 8.923117637634277, + "learning_rate": 4.358157343290533e-05, + "loss": 0.3230890274047852, + "step": 131850 + }, + { + "epoch": 0.5661025390038038, + "grad_norm": 0.12785527110099792, + "learning_rate": 4.3577261712787705e-05, + "loss": 0.41931886672973634, + "step": 131860 + }, + { + "epoch": 0.5661454710938237, + "grad_norm": 1.3026045560836792, + "learning_rate": 4.3572949992670076e-05, + "loss": 0.455197811126709, + "step": 131870 + }, + { + "epoch": 0.5661884031838438, + "grad_norm": 6.643957138061523, + "learning_rate": 4.356863827255245e-05, + "loss": 0.1968802809715271, + "step": 131880 + }, + { + "epoch": 0.5662313352738638, + "grad_norm": 0.10116653144359589, + "learning_rate": 4.356432655243483e-05, + "loss": 0.0696562647819519, + "step": 131890 + }, + { + "epoch": 0.5662742673638838, + "grad_norm": 0.09920462220907211, + "learning_rate": 4.356001483231721e-05, + "loss": 0.12258206605911255, + "step": 131900 + }, + { + "epoch": 0.5663171994539038, + "grad_norm": 0.06810770183801651, + "learning_rate": 4.355570311219958e-05, + "loss": 0.06588384509086609, + "step": 131910 + }, + { + "epoch": 0.5663601315439238, + "grad_norm": 0.010695497505366802, + "learning_rate": 4.3551391392081956e-05, + "loss": 0.315065336227417, + "step": 131920 + }, + { + "epoch": 0.5664030636339439, + "grad_norm": 0.014091585762798786, + "learning_rate": 4.354707967196433e-05, + "loss": 0.13193466663360595, + "step": 131930 + }, + { + "epoch": 0.5664459957239638, + "grad_norm": 0.1220465674996376, + "learning_rate": 4.354276795184671e-05, + "loss": 0.3290372610092163, + "step": 131940 + }, + { + "epoch": 0.5664889278139839, + "grad_norm": 1.8874152898788452, + "learning_rate": 4.353845623172909e-05, + "loss": 0.3399007558822632, + "step": 131950 + }, + { + "epoch": 0.5665318599040039, + "grad_norm": 0.05992142856121063, + "learning_rate": 4.3534144511611465e-05, + "loss": 0.3105152606964111, + "step": 131960 + }, + { + "epoch": 0.5665747919940238, + "grad_norm": 4.826685428619385, + "learning_rate": 4.352983279149384e-05, + "loss": 0.41585707664489746, + "step": 131970 + }, + { + "epoch": 0.5666177240840439, + "grad_norm": 0.002173103392124176, + "learning_rate": 4.352552107137622e-05, + "loss": 0.05335950255393982, + "step": 131980 + }, + { + "epoch": 0.5666606561740639, + "grad_norm": 0.002653565490618348, + "learning_rate": 4.352120935125859e-05, + "loss": 0.22259302139282228, + "step": 131990 + }, + { + "epoch": 0.5667035882640838, + "grad_norm": 4.037930488586426, + "learning_rate": 4.351689763114097e-05, + "loss": 0.23362507820129394, + "step": 132000 + }, + { + "epoch": 0.5667035882640838, + "eval_loss": 0.4016095995903015, + "eval_runtime": 27.123, + "eval_samples_per_second": 3.687, + "eval_steps_per_second": 3.687, + "step": 132000 + }, + { + "epoch": 0.5667465203541039, + "grad_norm": 1.3028349876403809, + "learning_rate": 4.3512585911023345e-05, + "loss": 0.16070892810821533, + "step": 132010 + }, + { + "epoch": 0.5667894524441239, + "grad_norm": 4.420613765716553, + "learning_rate": 4.350827419090572e-05, + "loss": 0.2285766839981079, + "step": 132020 + }, + { + "epoch": 0.5668323845341439, + "grad_norm": 0.03554035350680351, + "learning_rate": 4.35039624707881e-05, + "loss": 0.19530502557754517, + "step": 132030 + }, + { + "epoch": 0.5668753166241639, + "grad_norm": 5.485271453857422, + "learning_rate": 4.349965075067047e-05, + "loss": 0.23321409225463868, + "step": 132040 + }, + { + "epoch": 0.566918248714184, + "grad_norm": 0.029682185500860214, + "learning_rate": 4.349533903055285e-05, + "loss": 0.30017259120941164, + "step": 132050 + }, + { + "epoch": 0.5669611808042039, + "grad_norm": 0.03592360019683838, + "learning_rate": 4.3491027310435225e-05, + "loss": 0.1560931086540222, + "step": 132060 + }, + { + "epoch": 0.5670041128942239, + "grad_norm": 1.4533315896987915, + "learning_rate": 4.348671559031761e-05, + "loss": 0.26650247573852537, + "step": 132070 + }, + { + "epoch": 0.567047044984244, + "grad_norm": 0.13568095862865448, + "learning_rate": 4.348240387019998e-05, + "loss": 0.2090909719467163, + "step": 132080 + }, + { + "epoch": 0.5670899770742639, + "grad_norm": 0.04799778759479523, + "learning_rate": 4.347809215008236e-05, + "loss": 0.3779847860336304, + "step": 132090 + }, + { + "epoch": 0.5671329091642839, + "grad_norm": 0.007508368697017431, + "learning_rate": 4.3473780429964735e-05, + "loss": 0.2174311876296997, + "step": 132100 + }, + { + "epoch": 0.567175841254304, + "grad_norm": 1.3405908346176147, + "learning_rate": 4.346946870984711e-05, + "loss": 0.14547221660614013, + "step": 132110 + }, + { + "epoch": 0.5672187733443239, + "grad_norm": 1.839350938796997, + "learning_rate": 4.346515698972948e-05, + "loss": 0.2051999568939209, + "step": 132120 + }, + { + "epoch": 0.567261705434344, + "grad_norm": 1.1987416744232178, + "learning_rate": 4.346084526961186e-05, + "loss": 0.15413864850997924, + "step": 132130 + }, + { + "epoch": 0.567304637524364, + "grad_norm": 0.0008382200030609965, + "learning_rate": 4.345653354949424e-05, + "loss": 0.053211706876754764, + "step": 132140 + }, + { + "epoch": 0.5673475696143839, + "grad_norm": 0.9950153231620789, + "learning_rate": 4.3452221829376614e-05, + "loss": 0.5215589523315429, + "step": 132150 + }, + { + "epoch": 0.567390501704404, + "grad_norm": 1.078520655632019, + "learning_rate": 4.3447910109258985e-05, + "loss": 0.15002841949462892, + "step": 132160 + }, + { + "epoch": 0.567433433794424, + "grad_norm": 0.6501953601837158, + "learning_rate": 4.344359838914136e-05, + "loss": 0.07707089185714722, + "step": 132170 + }, + { + "epoch": 0.567476365884444, + "grad_norm": 3.6814463138580322, + "learning_rate": 4.3439286669023747e-05, + "loss": 0.3484069347381592, + "step": 132180 + }, + { + "epoch": 0.567519297974464, + "grad_norm": 0.01013046782463789, + "learning_rate": 4.3434974948906124e-05, + "loss": 0.2226715564727783, + "step": 132190 + }, + { + "epoch": 0.567562230064484, + "grad_norm": 0.07840029150247574, + "learning_rate": 4.3430663228788494e-05, + "loss": 0.0511742115020752, + "step": 132200 + }, + { + "epoch": 0.567605162154504, + "grad_norm": 0.06084107980132103, + "learning_rate": 4.342635150867087e-05, + "loss": 0.18663891553878784, + "step": 132210 + }, + { + "epoch": 0.567648094244524, + "grad_norm": 0.0007634757785126567, + "learning_rate": 4.342203978855325e-05, + "loss": 0.10051954984664917, + "step": 132220 + }, + { + "epoch": 0.567691026334544, + "grad_norm": 0.05226648971438408, + "learning_rate": 4.3417728068435626e-05, + "loss": 0.11277986764907837, + "step": 132230 + }, + { + "epoch": 0.567733958424564, + "grad_norm": 1.3088717460632324, + "learning_rate": 4.3413416348318e-05, + "loss": 0.23899502754211427, + "step": 132240 + }, + { + "epoch": 0.567776890514584, + "grad_norm": 0.001747295493260026, + "learning_rate": 4.3409104628200374e-05, + "loss": 0.21408369541168212, + "step": 132250 + }, + { + "epoch": 0.5678198226046041, + "grad_norm": 1.5225026607513428, + "learning_rate": 4.340479290808275e-05, + "loss": 0.2718491077423096, + "step": 132260 + }, + { + "epoch": 0.567862754694624, + "grad_norm": 0.8086637854576111, + "learning_rate": 4.340048118796513e-05, + "loss": 0.16083817481994628, + "step": 132270 + }, + { + "epoch": 0.567905686784644, + "grad_norm": 1.550381064414978, + "learning_rate": 4.33961694678475e-05, + "loss": 0.18627991676330566, + "step": 132280 + }, + { + "epoch": 0.5679486188746641, + "grad_norm": 0.007228048052638769, + "learning_rate": 4.3391857747729884e-05, + "loss": 0.18209935426712037, + "step": 132290 + }, + { + "epoch": 0.567991550964684, + "grad_norm": 0.027518831193447113, + "learning_rate": 4.338754602761226e-05, + "loss": 0.2660661697387695, + "step": 132300 + }, + { + "epoch": 0.568034483054704, + "grad_norm": 0.001855163834989071, + "learning_rate": 4.338323430749464e-05, + "loss": 0.10220578908920289, + "step": 132310 + }, + { + "epoch": 0.5680774151447241, + "grad_norm": 0.0008658714359626174, + "learning_rate": 4.337892258737701e-05, + "loss": 0.09651315808296204, + "step": 132320 + }, + { + "epoch": 0.568120347234744, + "grad_norm": 0.13477729260921478, + "learning_rate": 4.3374610867259386e-05, + "loss": 0.23384594917297363, + "step": 132330 + }, + { + "epoch": 0.5681632793247641, + "grad_norm": 0.1917153149843216, + "learning_rate": 4.3370299147141764e-05, + "loss": 0.09794583320617675, + "step": 132340 + }, + { + "epoch": 0.5682062114147841, + "grad_norm": 0.21307621896266937, + "learning_rate": 4.336598742702414e-05, + "loss": 0.3008575439453125, + "step": 132350 + }, + { + "epoch": 0.5682491435048042, + "grad_norm": 0.007791618350893259, + "learning_rate": 4.336167570690651e-05, + "loss": 0.12317459583282471, + "step": 132360 + }, + { + "epoch": 0.5682920755948241, + "grad_norm": 2.7668819427490234, + "learning_rate": 4.335736398678889e-05, + "loss": 0.2858917236328125, + "step": 132370 + }, + { + "epoch": 0.5683350076848441, + "grad_norm": 2.066178560256958, + "learning_rate": 4.3353052266671266e-05, + "loss": 0.24323766231536864, + "step": 132380 + }, + { + "epoch": 0.5683779397748642, + "grad_norm": 0.7362213730812073, + "learning_rate": 4.3348740546553644e-05, + "loss": 0.15425702333450317, + "step": 132390 + }, + { + "epoch": 0.5684208718648841, + "grad_norm": 0.06285788863897324, + "learning_rate": 4.334442882643602e-05, + "loss": 0.2881110906600952, + "step": 132400 + }, + { + "epoch": 0.5684638039549041, + "grad_norm": 0.00026648957282304764, + "learning_rate": 4.33401171063184e-05, + "loss": 0.24016718864440917, + "step": 132410 + }, + { + "epoch": 0.5685067360449242, + "grad_norm": 0.054259609431028366, + "learning_rate": 4.3335805386200776e-05, + "loss": 0.1691007137298584, + "step": 132420 + }, + { + "epoch": 0.5685496681349441, + "grad_norm": 0.0385783426463604, + "learning_rate": 4.333149366608315e-05, + "loss": 0.23008294105529786, + "step": 132430 + }, + { + "epoch": 0.5685926002249642, + "grad_norm": 0.7211818695068359, + "learning_rate": 4.332718194596553e-05, + "loss": 0.34435076713562013, + "step": 132440 + }, + { + "epoch": 0.5686355323149842, + "grad_norm": 1.870476245880127, + "learning_rate": 4.33228702258479e-05, + "loss": 0.2791109323501587, + "step": 132450 + }, + { + "epoch": 0.5686784644050041, + "grad_norm": 1.46523916721344, + "learning_rate": 4.331855850573028e-05, + "loss": 0.19431822299957274, + "step": 132460 + }, + { + "epoch": 0.5687213964950242, + "grad_norm": 0.025228098034858704, + "learning_rate": 4.3314246785612656e-05, + "loss": 0.4006353378295898, + "step": 132470 + }, + { + "epoch": 0.5687643285850442, + "grad_norm": 0.02704770676791668, + "learning_rate": 4.330993506549503e-05, + "loss": 0.09785645008087158, + "step": 132480 + }, + { + "epoch": 0.5688072606750642, + "grad_norm": 2.4727747440338135, + "learning_rate": 4.3305623345377404e-05, + "loss": 0.3076622486114502, + "step": 132490 + }, + { + "epoch": 0.5688501927650842, + "grad_norm": 0.0011699130991473794, + "learning_rate": 4.330131162525978e-05, + "loss": 0.2031773567199707, + "step": 132500 + }, + { + "epoch": 0.5688931248551042, + "grad_norm": 0.9709953665733337, + "learning_rate": 4.329699990514216e-05, + "loss": 0.5550169944763184, + "step": 132510 + }, + { + "epoch": 0.5689360569451242, + "grad_norm": 0.10869079828262329, + "learning_rate": 4.3292688185024536e-05, + "loss": 0.07343157529830932, + "step": 132520 + }, + { + "epoch": 0.5689789890351442, + "grad_norm": 0.04295245185494423, + "learning_rate": 4.328837646490691e-05, + "loss": 0.2245166063308716, + "step": 132530 + }, + { + "epoch": 0.5690219211251643, + "grad_norm": 0.005385030992329121, + "learning_rate": 4.328406474478929e-05, + "loss": 0.03220377266407013, + "step": 132540 + }, + { + "epoch": 0.5690648532151842, + "grad_norm": 1.353747844696045, + "learning_rate": 4.327975302467167e-05, + "loss": 0.12378357648849488, + "step": 132550 + }, + { + "epoch": 0.5691077853052042, + "grad_norm": 0.06430882960557938, + "learning_rate": 4.3275441304554045e-05, + "loss": 0.155087149143219, + "step": 132560 + }, + { + "epoch": 0.5691507173952243, + "grad_norm": 0.0034910349640995264, + "learning_rate": 4.3271129584436415e-05, + "loss": 0.16095657348632814, + "step": 132570 + }, + { + "epoch": 0.5691936494852442, + "grad_norm": 0.0006872376543469727, + "learning_rate": 4.326681786431879e-05, + "loss": 0.0919446349143982, + "step": 132580 + }, + { + "epoch": 0.5692365815752642, + "grad_norm": 2.450606107711792, + "learning_rate": 4.326250614420117e-05, + "loss": 0.5073282241821289, + "step": 132590 + }, + { + "epoch": 0.5692795136652843, + "grad_norm": 0.015987534075975418, + "learning_rate": 4.325819442408355e-05, + "loss": 0.11756675243377686, + "step": 132600 + }, + { + "epoch": 0.5693224457553042, + "grad_norm": 2.127434015274048, + "learning_rate": 4.325388270396592e-05, + "loss": 0.30617260932922363, + "step": 132610 + }, + { + "epoch": 0.5693653778453243, + "grad_norm": 0.5571884512901306, + "learning_rate": 4.3249570983848295e-05, + "loss": 0.2655909061431885, + "step": 132620 + }, + { + "epoch": 0.5694083099353443, + "grad_norm": 2.569355010986328, + "learning_rate": 4.324525926373067e-05, + "loss": 0.17807893753051757, + "step": 132630 + }, + { + "epoch": 0.5694512420253642, + "grad_norm": 0.01739351451396942, + "learning_rate": 4.324094754361305e-05, + "loss": 0.19666470289230348, + "step": 132640 + }, + { + "epoch": 0.5694941741153843, + "grad_norm": 0.07557443529367447, + "learning_rate": 4.323663582349543e-05, + "loss": 0.2611125707626343, + "step": 132650 + }, + { + "epoch": 0.5695371062054043, + "grad_norm": 6.050956726074219, + "learning_rate": 4.3232324103377805e-05, + "loss": 0.37708406448364257, + "step": 132660 + }, + { + "epoch": 0.5695800382954243, + "grad_norm": 0.09482987970113754, + "learning_rate": 4.322801238326018e-05, + "loss": 0.17632991075515747, + "step": 132670 + }, + { + "epoch": 0.5696229703854443, + "grad_norm": 2.6997487545013428, + "learning_rate": 4.322370066314256e-05, + "loss": 0.16271533966064453, + "step": 132680 + }, + { + "epoch": 0.5696659024754643, + "grad_norm": 2.603530168533325, + "learning_rate": 4.321938894302493e-05, + "loss": 0.36246230602264407, + "step": 132690 + }, + { + "epoch": 0.5697088345654843, + "grad_norm": 0.08923777937889099, + "learning_rate": 4.321507722290731e-05, + "loss": 0.21388437747955322, + "step": 132700 + }, + { + "epoch": 0.5697517666555043, + "grad_norm": 0.35162797570228577, + "learning_rate": 4.3210765502789685e-05, + "loss": 0.3742487668991089, + "step": 132710 + }, + { + "epoch": 0.5697946987455244, + "grad_norm": 2.965769052505493, + "learning_rate": 4.320645378267206e-05, + "loss": 0.2451101779937744, + "step": 132720 + }, + { + "epoch": 0.5698376308355443, + "grad_norm": 0.01462629809975624, + "learning_rate": 4.320214206255443e-05, + "loss": 0.047423246502876285, + "step": 132730 + }, + { + "epoch": 0.5698805629255643, + "grad_norm": 0.017881186679005623, + "learning_rate": 4.319783034243681e-05, + "loss": 0.20612313747406005, + "step": 132740 + }, + { + "epoch": 0.5699234950155844, + "grad_norm": 0.0030642226338386536, + "learning_rate": 4.319351862231919e-05, + "loss": 0.05692043900489807, + "step": 132750 + }, + { + "epoch": 0.5699664271056043, + "grad_norm": 0.03154606372117996, + "learning_rate": 4.3189206902201565e-05, + "loss": 0.13520408868789674, + "step": 132760 + }, + { + "epoch": 0.5700093591956243, + "grad_norm": 0.005595480091869831, + "learning_rate": 4.318489518208394e-05, + "loss": 0.28533225059509276, + "step": 132770 + }, + { + "epoch": 0.5700522912856444, + "grad_norm": 0.037034157663583755, + "learning_rate": 4.318058346196632e-05, + "loss": 0.28535511493682864, + "step": 132780 + }, + { + "epoch": 0.5700952233756644, + "grad_norm": 1.1709529161453247, + "learning_rate": 4.31762717418487e-05, + "loss": 0.302028226852417, + "step": 132790 + }, + { + "epoch": 0.5701381554656844, + "grad_norm": 1.4559121131896973, + "learning_rate": 4.3171960021731074e-05, + "loss": 0.3461907148361206, + "step": 132800 + }, + { + "epoch": 0.5701810875557044, + "grad_norm": 0.17438967525959015, + "learning_rate": 4.316764830161345e-05, + "loss": 0.16365714073181153, + "step": 132810 + }, + { + "epoch": 0.5702240196457244, + "grad_norm": 0.021815786138176918, + "learning_rate": 4.316333658149582e-05, + "loss": 0.1973589062690735, + "step": 132820 + }, + { + "epoch": 0.5702669517357444, + "grad_norm": 0.1921364665031433, + "learning_rate": 4.31590248613782e-05, + "loss": 0.09176667928695678, + "step": 132830 + }, + { + "epoch": 0.5703098838257644, + "grad_norm": 0.006648586597293615, + "learning_rate": 4.315471314126058e-05, + "loss": 0.2943988561630249, + "step": 132840 + }, + { + "epoch": 0.5703528159157845, + "grad_norm": 0.008222861215472221, + "learning_rate": 4.3150401421142954e-05, + "loss": 0.22311089038848878, + "step": 132850 + }, + { + "epoch": 0.5703957480058044, + "grad_norm": 1.4243155717849731, + "learning_rate": 4.3146089701025325e-05, + "loss": 0.3057553768157959, + "step": 132860 + }, + { + "epoch": 0.5704386800958244, + "grad_norm": 10.4828462600708, + "learning_rate": 4.31417779809077e-05, + "loss": 0.30001082420349123, + "step": 132870 + }, + { + "epoch": 0.5704816121858445, + "grad_norm": 2.4526772499084473, + "learning_rate": 4.313746626079008e-05, + "loss": 0.20372674465179444, + "step": 132880 + }, + { + "epoch": 0.5705245442758644, + "grad_norm": 3.1110787391662598, + "learning_rate": 4.313315454067246e-05, + "loss": 0.21619665622711182, + "step": 132890 + }, + { + "epoch": 0.5705674763658845, + "grad_norm": 0.05998954549431801, + "learning_rate": 4.3128842820554834e-05, + "loss": 0.16995031833648683, + "step": 132900 + }, + { + "epoch": 0.5706104084559045, + "grad_norm": 4.894834995269775, + "learning_rate": 4.312453110043721e-05, + "loss": 0.4124492645263672, + "step": 132910 + }, + { + "epoch": 0.5706533405459244, + "grad_norm": 0.019320698454976082, + "learning_rate": 4.312021938031959e-05, + "loss": 0.30166399478912354, + "step": 132920 + }, + { + "epoch": 0.5706962726359445, + "grad_norm": 0.003763388143852353, + "learning_rate": 4.3115907660201966e-05, + "loss": 0.2777060031890869, + "step": 132930 + }, + { + "epoch": 0.5707392047259645, + "grad_norm": 0.003979962319135666, + "learning_rate": 4.3111595940084336e-05, + "loss": 0.16536861658096313, + "step": 132940 + }, + { + "epoch": 0.5707821368159844, + "grad_norm": 0.02409444749355316, + "learning_rate": 4.3107284219966714e-05, + "loss": 0.27788710594177246, + "step": 132950 + }, + { + "epoch": 0.5708250689060045, + "grad_norm": 0.026255948469042778, + "learning_rate": 4.310297249984909e-05, + "loss": 0.2729495525360107, + "step": 132960 + }, + { + "epoch": 0.5708680009960245, + "grad_norm": 2.919468879699707, + "learning_rate": 4.309866077973147e-05, + "loss": 0.18315572738647462, + "step": 132970 + }, + { + "epoch": 0.5709109330860445, + "grad_norm": 0.021574953570961952, + "learning_rate": 4.309434905961384e-05, + "loss": 0.37256340980529784, + "step": 132980 + }, + { + "epoch": 0.5709538651760645, + "grad_norm": 1.8033825159072876, + "learning_rate": 4.3090037339496216e-05, + "loss": 0.11639236211776734, + "step": 132990 + }, + { + "epoch": 0.5709967972660845, + "grad_norm": 0.05781601369380951, + "learning_rate": 4.30857256193786e-05, + "loss": 0.004620448499917984, + "step": 133000 + }, + { + "epoch": 0.5709967972660845, + "eval_loss": 0.41767171025276184, + "eval_runtime": 27.1415, + "eval_samples_per_second": 3.684, + "eval_steps_per_second": 3.684, + "step": 133000 + }, + { + "epoch": 0.5710397293561045, + "grad_norm": 0.09215544909238815, + "learning_rate": 4.308141389926098e-05, + "loss": 0.08097963333129883, + "step": 133010 + }, + { + "epoch": 0.5710826614461245, + "grad_norm": 0.034179724752902985, + "learning_rate": 4.307710217914335e-05, + "loss": 0.08727078437805176, + "step": 133020 + }, + { + "epoch": 0.5711255935361446, + "grad_norm": 5.184309959411621, + "learning_rate": 4.3072790459025726e-05, + "loss": 0.276810884475708, + "step": 133030 + }, + { + "epoch": 0.5711685256261645, + "grad_norm": 1.454801082611084, + "learning_rate": 4.30684787389081e-05, + "loss": 0.47367095947265625, + "step": 133040 + }, + { + "epoch": 0.5712114577161845, + "grad_norm": 0.03418707475066185, + "learning_rate": 4.306416701879048e-05, + "loss": 0.030638954043388365, + "step": 133050 + }, + { + "epoch": 0.5712543898062046, + "grad_norm": 0.03641417250037193, + "learning_rate": 4.305985529867285e-05, + "loss": 0.14549466371536254, + "step": 133060 + }, + { + "epoch": 0.5712973218962245, + "grad_norm": 2.0202322006225586, + "learning_rate": 4.305554357855523e-05, + "loss": 0.1501977562904358, + "step": 133070 + }, + { + "epoch": 0.5713402539862446, + "grad_norm": 2.3619489669799805, + "learning_rate": 4.3051231858437606e-05, + "loss": 0.24671728610992433, + "step": 133080 + }, + { + "epoch": 0.5713831860762646, + "grad_norm": 2.6512274742126465, + "learning_rate": 4.304692013831998e-05, + "loss": 0.1540529489517212, + "step": 133090 + }, + { + "epoch": 0.5714261181662845, + "grad_norm": 0.024750245735049248, + "learning_rate": 4.3042608418202354e-05, + "loss": 0.15671908855438232, + "step": 133100 + }, + { + "epoch": 0.5714690502563046, + "grad_norm": 3.0600099563598633, + "learning_rate": 4.303829669808474e-05, + "loss": 0.15059789419174194, + "step": 133110 + }, + { + "epoch": 0.5715119823463246, + "grad_norm": 0.39473623037338257, + "learning_rate": 4.3033984977967115e-05, + "loss": 0.21995272636413574, + "step": 133120 + }, + { + "epoch": 0.5715549144363445, + "grad_norm": 2.5885555744171143, + "learning_rate": 4.302967325784949e-05, + "loss": 0.2738445281982422, + "step": 133130 + }, + { + "epoch": 0.5715978465263646, + "grad_norm": 0.8623188734054565, + "learning_rate": 4.302536153773187e-05, + "loss": 0.3122952938079834, + "step": 133140 + }, + { + "epoch": 0.5716407786163846, + "grad_norm": 0.2685237526893616, + "learning_rate": 4.302104981761424e-05, + "loss": 0.10587440729141236, + "step": 133150 + }, + { + "epoch": 0.5716837107064046, + "grad_norm": 2.3580923080444336, + "learning_rate": 4.301673809749662e-05, + "loss": 0.24532074928283693, + "step": 133160 + }, + { + "epoch": 0.5717266427964246, + "grad_norm": 0.01954667828977108, + "learning_rate": 4.3012426377378995e-05, + "loss": 0.10717763900756835, + "step": 133170 + }, + { + "epoch": 0.5717695748864446, + "grad_norm": 0.01179230585694313, + "learning_rate": 4.300811465726137e-05, + "loss": 0.2986689805984497, + "step": 133180 + }, + { + "epoch": 0.5718125069764646, + "grad_norm": 32.5291862487793, + "learning_rate": 4.300380293714374e-05, + "loss": 0.06174714565277099, + "step": 133190 + }, + { + "epoch": 0.5718554390664846, + "grad_norm": 0.019292457029223442, + "learning_rate": 4.299949121702612e-05, + "loss": 0.21613140106201173, + "step": 133200 + }, + { + "epoch": 0.5718983711565047, + "grad_norm": 0.0038005278911441565, + "learning_rate": 4.29951794969085e-05, + "loss": 0.19410216808319092, + "step": 133210 + }, + { + "epoch": 0.5719413032465247, + "grad_norm": 0.04521465674042702, + "learning_rate": 4.2990867776790875e-05, + "loss": 0.0654529094696045, + "step": 133220 + }, + { + "epoch": 0.5719842353365446, + "grad_norm": 4.110771656036377, + "learning_rate": 4.298655605667325e-05, + "loss": 0.1877021908760071, + "step": 133230 + }, + { + "epoch": 0.5720271674265647, + "grad_norm": 0.01429817546159029, + "learning_rate": 4.298224433655563e-05, + "loss": 0.055100107192993165, + "step": 133240 + }, + { + "epoch": 0.5720700995165847, + "grad_norm": 1.2499785423278809, + "learning_rate": 4.297793261643801e-05, + "loss": 0.16364293098449706, + "step": 133250 + }, + { + "epoch": 0.5721130316066046, + "grad_norm": 2.7436704635620117, + "learning_rate": 4.2973620896320384e-05, + "loss": 0.06553627848625183, + "step": 133260 + }, + { + "epoch": 0.5721559636966247, + "grad_norm": 0.036512356251478195, + "learning_rate": 4.2969309176202755e-05, + "loss": 0.23155527114868163, + "step": 133270 + }, + { + "epoch": 0.5721988957866447, + "grad_norm": 0.13235700130462646, + "learning_rate": 4.296499745608513e-05, + "loss": 0.22186541557312012, + "step": 133280 + }, + { + "epoch": 0.5722418278766647, + "grad_norm": 1.8723384141921997, + "learning_rate": 4.296068573596751e-05, + "loss": 0.11128606796264648, + "step": 133290 + }, + { + "epoch": 0.5722847599666847, + "grad_norm": 0.0058643571101129055, + "learning_rate": 4.295637401584989e-05, + "loss": 0.10969338417053223, + "step": 133300 + }, + { + "epoch": 0.5723276920567048, + "grad_norm": 0.0006499195005744696, + "learning_rate": 4.295206229573226e-05, + "loss": 0.2320557117462158, + "step": 133310 + }, + { + "epoch": 0.5723706241467247, + "grad_norm": 0.2941928803920746, + "learning_rate": 4.2947750575614635e-05, + "loss": 0.17140878438949586, + "step": 133320 + }, + { + "epoch": 0.5724135562367447, + "grad_norm": 0.023136064410209656, + "learning_rate": 4.294343885549701e-05, + "loss": 0.2508916139602661, + "step": 133330 + }, + { + "epoch": 0.5724564883267648, + "grad_norm": 1.5124292373657227, + "learning_rate": 4.293912713537939e-05, + "loss": 0.4470512390136719, + "step": 133340 + }, + { + "epoch": 0.5724994204167847, + "grad_norm": 0.0816258117556572, + "learning_rate": 4.293481541526177e-05, + "loss": 0.10875993967056274, + "step": 133350 + }, + { + "epoch": 0.5725423525068047, + "grad_norm": 12.681973457336426, + "learning_rate": 4.2930503695144144e-05, + "loss": 0.0813460648059845, + "step": 133360 + }, + { + "epoch": 0.5725852845968248, + "grad_norm": 1.014262080192566, + "learning_rate": 4.292619197502652e-05, + "loss": 0.1749336004257202, + "step": 133370 + }, + { + "epoch": 0.5726282166868447, + "grad_norm": 0.023005418479442596, + "learning_rate": 4.29218802549089e-05, + "loss": 0.14777911901474, + "step": 133380 + }, + { + "epoch": 0.5726711487768648, + "grad_norm": 0.012674704194068909, + "learning_rate": 4.291756853479127e-05, + "loss": 0.268320631980896, + "step": 133390 + }, + { + "epoch": 0.5727140808668848, + "grad_norm": 0.10391156375408173, + "learning_rate": 4.291325681467365e-05, + "loss": 0.07142456769943237, + "step": 133400 + }, + { + "epoch": 0.5727570129569047, + "grad_norm": 15.536042213439941, + "learning_rate": 4.2908945094556024e-05, + "loss": 0.22836570739746093, + "step": 133410 + }, + { + "epoch": 0.5727999450469248, + "grad_norm": 0.3068942129611969, + "learning_rate": 4.29046333744384e-05, + "loss": 0.11900877952575684, + "step": 133420 + }, + { + "epoch": 0.5728428771369448, + "grad_norm": 0.0022904151119291782, + "learning_rate": 4.290032165432077e-05, + "loss": 0.07925449013710022, + "step": 133430 + }, + { + "epoch": 0.5728858092269647, + "grad_norm": 0.07070890814065933, + "learning_rate": 4.289600993420315e-05, + "loss": 0.3149399995803833, + "step": 133440 + }, + { + "epoch": 0.5729287413169848, + "grad_norm": 0.0037805649917572737, + "learning_rate": 4.289169821408553e-05, + "loss": 0.05543408393859863, + "step": 133450 + }, + { + "epoch": 0.5729716734070048, + "grad_norm": 0.0010744985193014145, + "learning_rate": 4.2887386493967904e-05, + "loss": 0.21528022289276122, + "step": 133460 + }, + { + "epoch": 0.5730146054970248, + "grad_norm": 1.0971367359161377, + "learning_rate": 4.288307477385028e-05, + "loss": 0.2108835458755493, + "step": 133470 + }, + { + "epoch": 0.5730575375870448, + "grad_norm": 0.0034420473966747522, + "learning_rate": 4.287876305373266e-05, + "loss": 0.22491807937622071, + "step": 133480 + }, + { + "epoch": 0.5731004696770648, + "grad_norm": 0.10176596790552139, + "learning_rate": 4.2874451333615036e-05, + "loss": 0.2868778705596924, + "step": 133490 + }, + { + "epoch": 0.5731434017670848, + "grad_norm": 0.0029990740586072206, + "learning_rate": 4.2870139613497413e-05, + "loss": 0.2023683547973633, + "step": 133500 + }, + { + "epoch": 0.5731863338571048, + "grad_norm": 0.7405256628990173, + "learning_rate": 4.286582789337979e-05, + "loss": 0.1412230134010315, + "step": 133510 + }, + { + "epoch": 0.5732292659471249, + "grad_norm": 0.005497376900166273, + "learning_rate": 4.286151617326216e-05, + "loss": 0.31963338851928713, + "step": 133520 + }, + { + "epoch": 0.5732721980371448, + "grad_norm": 0.010010753758251667, + "learning_rate": 4.285720445314454e-05, + "loss": 0.2021845817565918, + "step": 133530 + }, + { + "epoch": 0.5733151301271648, + "grad_norm": 0.0025157523341476917, + "learning_rate": 4.2852892733026916e-05, + "loss": 0.10246919393539429, + "step": 133540 + }, + { + "epoch": 0.5733580622171849, + "grad_norm": 2.5009193420410156, + "learning_rate": 4.2848581012909293e-05, + "loss": 0.147149920463562, + "step": 133550 + }, + { + "epoch": 0.5734009943072048, + "grad_norm": 0.006231301464140415, + "learning_rate": 4.2844269292791664e-05, + "loss": 0.14908454418182374, + "step": 133560 + }, + { + "epoch": 0.5734439263972249, + "grad_norm": 6.109598636627197, + "learning_rate": 4.283995757267404e-05, + "loss": 0.2803361415863037, + "step": 133570 + }, + { + "epoch": 0.5734868584872449, + "grad_norm": 0.08631127327680588, + "learning_rate": 4.283564585255642e-05, + "loss": 0.2282865047454834, + "step": 133580 + }, + { + "epoch": 0.5735297905772648, + "grad_norm": 1.3754993677139282, + "learning_rate": 4.28313341324388e-05, + "loss": 0.30085842609405516, + "step": 133590 + }, + { + "epoch": 0.5735727226672849, + "grad_norm": 0.0010205082362517715, + "learning_rate": 4.282702241232117e-05, + "loss": 0.0039805609732866285, + "step": 133600 + }, + { + "epoch": 0.5736156547573049, + "grad_norm": 1.0616086721420288, + "learning_rate": 4.282271069220355e-05, + "loss": 0.2139833688735962, + "step": 133610 + }, + { + "epoch": 0.5736585868473248, + "grad_norm": 0.22987225651741028, + "learning_rate": 4.281839897208593e-05, + "loss": 0.22784335613250734, + "step": 133620 + }, + { + "epoch": 0.5737015189373449, + "grad_norm": 0.009210779331624508, + "learning_rate": 4.2814087251968305e-05, + "loss": 0.15790432691574097, + "step": 133630 + }, + { + "epoch": 0.5737444510273649, + "grad_norm": 0.0013962037628516555, + "learning_rate": 4.2809775531850676e-05, + "loss": 0.11812090873718262, + "step": 133640 + }, + { + "epoch": 0.573787383117385, + "grad_norm": 1.9198707342147827, + "learning_rate": 4.280546381173305e-05, + "loss": 0.32148313522338867, + "step": 133650 + }, + { + "epoch": 0.5738303152074049, + "grad_norm": 0.027793236076831818, + "learning_rate": 4.280115209161543e-05, + "loss": 0.07742552757263184, + "step": 133660 + }, + { + "epoch": 0.573873247297425, + "grad_norm": 0.0017426724079996347, + "learning_rate": 4.279684037149781e-05, + "loss": 0.31075618267059324, + "step": 133670 + }, + { + "epoch": 0.573916179387445, + "grad_norm": 8.916657447814941, + "learning_rate": 4.279252865138018e-05, + "loss": 0.3325552463531494, + "step": 133680 + }, + { + "epoch": 0.5739591114774649, + "grad_norm": 0.039197370409965515, + "learning_rate": 4.2788216931262556e-05, + "loss": 0.146637225151062, + "step": 133690 + }, + { + "epoch": 0.574002043567485, + "grad_norm": 0.03254859149456024, + "learning_rate": 4.278390521114494e-05, + "loss": 0.33236918449401853, + "step": 133700 + }, + { + "epoch": 0.574044975657505, + "grad_norm": 0.04245612770318985, + "learning_rate": 4.277959349102732e-05, + "loss": 0.18411213159561157, + "step": 133710 + }, + { + "epoch": 0.5740879077475249, + "grad_norm": 1.4738399982452393, + "learning_rate": 4.277528177090969e-05, + "loss": 0.5320202827453613, + "step": 133720 + }, + { + "epoch": 0.574130839837545, + "grad_norm": 0.03626309707760811, + "learning_rate": 4.2770970050792065e-05, + "loss": 0.21667177677154542, + "step": 133730 + }, + { + "epoch": 0.574173771927565, + "grad_norm": 1.9234563112258911, + "learning_rate": 4.276665833067444e-05, + "loss": 0.35777835845947265, + "step": 133740 + }, + { + "epoch": 0.574216704017585, + "grad_norm": 0.9428356885910034, + "learning_rate": 4.276234661055682e-05, + "loss": 0.4586705207824707, + "step": 133750 + }, + { + "epoch": 0.574259636107605, + "grad_norm": 2.4569039344787598, + "learning_rate": 4.275803489043919e-05, + "loss": 0.29904091358184814, + "step": 133760 + }, + { + "epoch": 0.574302568197625, + "grad_norm": 16.694744110107422, + "learning_rate": 4.275372317032157e-05, + "loss": 0.19398369789123535, + "step": 133770 + }, + { + "epoch": 0.574345500287645, + "grad_norm": 0.0017974856309592724, + "learning_rate": 4.2749411450203945e-05, + "loss": 0.009163709729909897, + "step": 133780 + }, + { + "epoch": 0.574388432377665, + "grad_norm": 0.010474118404090405, + "learning_rate": 4.274509973008632e-05, + "loss": 0.11285858154296875, + "step": 133790 + }, + { + "epoch": 0.5744313644676851, + "grad_norm": 0.040957894176244736, + "learning_rate": 4.274078800996869e-05, + "loss": 0.2543015480041504, + "step": 133800 + }, + { + "epoch": 0.574474296557705, + "grad_norm": 1.2445433139801025, + "learning_rate": 4.273647628985108e-05, + "loss": 0.27809972763061525, + "step": 133810 + }, + { + "epoch": 0.574517228647725, + "grad_norm": 0.0012289606966078281, + "learning_rate": 4.2732164569733455e-05, + "loss": 0.277581524848938, + "step": 133820 + }, + { + "epoch": 0.5745601607377451, + "grad_norm": 0.0005551620270125568, + "learning_rate": 4.272785284961583e-05, + "loss": 0.36834819316864015, + "step": 133830 + }, + { + "epoch": 0.574603092827765, + "grad_norm": 0.029749320819973946, + "learning_rate": 4.27235411294982e-05, + "loss": 0.0925449252128601, + "step": 133840 + }, + { + "epoch": 0.574646024917785, + "grad_norm": 0.0027887921314686537, + "learning_rate": 4.271922940938058e-05, + "loss": 0.34550836086273196, + "step": 133850 + }, + { + "epoch": 0.5746889570078051, + "grad_norm": 1.5625827312469482, + "learning_rate": 4.271491768926296e-05, + "loss": 0.20014092922210694, + "step": 133860 + }, + { + "epoch": 0.574731889097825, + "grad_norm": 0.07165715843439102, + "learning_rate": 4.2710605969145335e-05, + "loss": 0.25347466468811036, + "step": 133870 + }, + { + "epoch": 0.5747748211878451, + "grad_norm": 0.03872102126479149, + "learning_rate": 4.270629424902771e-05, + "loss": 0.04969048798084259, + "step": 133880 + }, + { + "epoch": 0.5748177532778651, + "grad_norm": 1.2972571849822998, + "learning_rate": 4.270198252891008e-05, + "loss": 0.3419404268264771, + "step": 133890 + }, + { + "epoch": 0.574860685367885, + "grad_norm": 1.4392205476760864, + "learning_rate": 4.269767080879246e-05, + "loss": 0.3982239246368408, + "step": 133900 + }, + { + "epoch": 0.5749036174579051, + "grad_norm": 1.936293125152588, + "learning_rate": 4.269335908867484e-05, + "loss": 0.14439613819122316, + "step": 133910 + }, + { + "epoch": 0.5749465495479251, + "grad_norm": 0.0033235508017241955, + "learning_rate": 4.2689047368557214e-05, + "loss": 0.25156464576721194, + "step": 133920 + }, + { + "epoch": 0.574989481637945, + "grad_norm": 1.7011040449142456, + "learning_rate": 4.268473564843959e-05, + "loss": 0.4407416820526123, + "step": 133930 + }, + { + "epoch": 0.5750324137279651, + "grad_norm": 0.1319187879562378, + "learning_rate": 4.268042392832197e-05, + "loss": 0.20434746742248536, + "step": 133940 + }, + { + "epoch": 0.5750753458179851, + "grad_norm": 1.9218518733978271, + "learning_rate": 4.2676112208204346e-05, + "loss": 0.14832651615142822, + "step": 133950 + }, + { + "epoch": 0.5751182779080051, + "grad_norm": 0.0007688776240684092, + "learning_rate": 4.2671800488086724e-05, + "loss": 0.1170761227607727, + "step": 133960 + }, + { + "epoch": 0.5751612099980251, + "grad_norm": 2.263821840286255, + "learning_rate": 4.2667488767969094e-05, + "loss": 0.22509450912475587, + "step": 133970 + }, + { + "epoch": 0.5752041420880452, + "grad_norm": 0.000915932294446975, + "learning_rate": 4.266317704785147e-05, + "loss": 0.007984549552202225, + "step": 133980 + }, + { + "epoch": 0.5752470741780651, + "grad_norm": 0.1721668690443039, + "learning_rate": 4.265886532773385e-05, + "loss": 0.3790409088134766, + "step": 133990 + }, + { + "epoch": 0.5752900062680851, + "grad_norm": 0.15518277883529663, + "learning_rate": 4.2654553607616226e-05, + "loss": 0.1313472032546997, + "step": 134000 + }, + { + "epoch": 0.5752900062680851, + "eval_loss": 0.4008947014808655, + "eval_runtime": 27.183, + "eval_samples_per_second": 3.679, + "eval_steps_per_second": 3.679, + "step": 134000 + }, + { + "epoch": 0.5753329383581052, + "grad_norm": 0.3634943962097168, + "learning_rate": 4.26502418874986e-05, + "loss": 0.07963572144508362, + "step": 134010 + }, + { + "epoch": 0.5753758704481251, + "grad_norm": 0.2079387754201889, + "learning_rate": 4.2645930167380974e-05, + "loss": 0.11615699529647827, + "step": 134020 + }, + { + "epoch": 0.5754188025381451, + "grad_norm": 8.19499397277832, + "learning_rate": 4.264161844726335e-05, + "loss": 0.15074779987335205, + "step": 134030 + }, + { + "epoch": 0.5754617346281652, + "grad_norm": 0.008482100442051888, + "learning_rate": 4.263730672714573e-05, + "loss": 0.06598466634750366, + "step": 134040 + }, + { + "epoch": 0.5755046667181851, + "grad_norm": 9.044384956359863, + "learning_rate": 4.2632995007028106e-05, + "loss": 0.3111989974975586, + "step": 134050 + }, + { + "epoch": 0.5755475988082052, + "grad_norm": 0.14206865429878235, + "learning_rate": 4.2628683286910484e-05, + "loss": 0.19637430906295777, + "step": 134060 + }, + { + "epoch": 0.5755905308982252, + "grad_norm": 0.06369198113679886, + "learning_rate": 4.262437156679286e-05, + "loss": 0.18411502838134766, + "step": 134070 + }, + { + "epoch": 0.5756334629882452, + "grad_norm": 1.4943736791610718, + "learning_rate": 4.262005984667524e-05, + "loss": 0.23669490814208985, + "step": 134080 + }, + { + "epoch": 0.5756763950782652, + "grad_norm": 0.059160616248846054, + "learning_rate": 4.261574812655761e-05, + "loss": 0.2464158058166504, + "step": 134090 + }, + { + "epoch": 0.5757193271682852, + "grad_norm": 1.0862027406692505, + "learning_rate": 4.2611436406439986e-05, + "loss": 0.11739879846572876, + "step": 134100 + }, + { + "epoch": 0.5757622592583053, + "grad_norm": 30.739408493041992, + "learning_rate": 4.2607124686322364e-05, + "loss": 0.21538019180297852, + "step": 134110 + }, + { + "epoch": 0.5758051913483252, + "grad_norm": 0.002959183417260647, + "learning_rate": 4.260281296620474e-05, + "loss": 0.17276872396469117, + "step": 134120 + }, + { + "epoch": 0.5758481234383452, + "grad_norm": 1.208294153213501, + "learning_rate": 4.259850124608711e-05, + "loss": 0.2686814785003662, + "step": 134130 + }, + { + "epoch": 0.5758910555283653, + "grad_norm": 0.003919269423931837, + "learning_rate": 4.259418952596949e-05, + "loss": 0.10101698637008667, + "step": 134140 + }, + { + "epoch": 0.5759339876183852, + "grad_norm": 0.005994774866849184, + "learning_rate": 4.2589877805851866e-05, + "loss": 0.18420990705490112, + "step": 134150 + }, + { + "epoch": 0.5759769197084053, + "grad_norm": 1.3661763668060303, + "learning_rate": 4.2585566085734244e-05, + "loss": 0.36013386249542234, + "step": 134160 + }, + { + "epoch": 0.5760198517984253, + "grad_norm": 0.03579791262745857, + "learning_rate": 4.258125436561662e-05, + "loss": 0.22522573471069335, + "step": 134170 + }, + { + "epoch": 0.5760627838884452, + "grad_norm": 4.322238445281982, + "learning_rate": 4.2576942645499e-05, + "loss": 0.2739940404891968, + "step": 134180 + }, + { + "epoch": 0.5761057159784653, + "grad_norm": 1.1436915397644043, + "learning_rate": 4.2572630925381376e-05, + "loss": 0.1775204062461853, + "step": 134190 + }, + { + "epoch": 0.5761486480684853, + "grad_norm": 0.012154379859566689, + "learning_rate": 4.256831920526375e-05, + "loss": 0.1556593060493469, + "step": 134200 + }, + { + "epoch": 0.5761915801585052, + "grad_norm": 4.292967319488525, + "learning_rate": 4.2564007485146124e-05, + "loss": 0.23525865077972413, + "step": 134210 + }, + { + "epoch": 0.5762345122485253, + "grad_norm": 1.399591326713562, + "learning_rate": 4.25596957650285e-05, + "loss": 0.19609031677246094, + "step": 134220 + }, + { + "epoch": 0.5762774443385453, + "grad_norm": 1.8917382955551147, + "learning_rate": 4.255538404491088e-05, + "loss": 0.12764239311218262, + "step": 134230 + }, + { + "epoch": 0.5763203764285653, + "grad_norm": 0.0013518787454813719, + "learning_rate": 4.2551072324793256e-05, + "loss": 0.25250537395477296, + "step": 134240 + }, + { + "epoch": 0.5763633085185853, + "grad_norm": 1.5671014785766602, + "learning_rate": 4.254676060467563e-05, + "loss": 0.15098211765289307, + "step": 134250 + }, + { + "epoch": 0.5764062406086053, + "grad_norm": 0.003938416950404644, + "learning_rate": 4.2542448884558003e-05, + "loss": 0.3070345640182495, + "step": 134260 + }, + { + "epoch": 0.5764491726986253, + "grad_norm": 2.018571376800537, + "learning_rate": 4.253813716444038e-05, + "loss": 0.10029573440551758, + "step": 134270 + }, + { + "epoch": 0.5764921047886453, + "grad_norm": 0.004540940281003714, + "learning_rate": 4.253382544432276e-05, + "loss": 0.13509360551834107, + "step": 134280 + }, + { + "epoch": 0.5765350368786654, + "grad_norm": 0.014921938069164753, + "learning_rate": 4.252951372420514e-05, + "loss": 0.20777909755706786, + "step": 134290 + }, + { + "epoch": 0.5765779689686853, + "grad_norm": 0.012604661285877228, + "learning_rate": 4.252520200408751e-05, + "loss": 0.45519323348999025, + "step": 134300 + }, + { + "epoch": 0.5766209010587053, + "grad_norm": 0.2351997345685959, + "learning_rate": 4.252089028396989e-05, + "loss": 0.09932146072387696, + "step": 134310 + }, + { + "epoch": 0.5766638331487254, + "grad_norm": 0.006004502065479755, + "learning_rate": 4.251657856385227e-05, + "loss": 0.17851873636245727, + "step": 134320 + }, + { + "epoch": 0.5767067652387453, + "grad_norm": 0.17117968201637268, + "learning_rate": 4.2512266843734645e-05, + "loss": 0.17499239444732667, + "step": 134330 + }, + { + "epoch": 0.5767496973287654, + "grad_norm": 0.08349964022636414, + "learning_rate": 4.2507955123617015e-05, + "loss": 0.25635461807250975, + "step": 134340 + }, + { + "epoch": 0.5767926294187854, + "grad_norm": 0.07518622279167175, + "learning_rate": 4.250364340349939e-05, + "loss": 0.1901835560798645, + "step": 134350 + }, + { + "epoch": 0.5768355615088053, + "grad_norm": 0.03478851169347763, + "learning_rate": 4.249933168338177e-05, + "loss": 0.06834203600883484, + "step": 134360 + }, + { + "epoch": 0.5768784935988254, + "grad_norm": 1.326767086982727, + "learning_rate": 4.249501996326415e-05, + "loss": 0.18276243209838866, + "step": 134370 + }, + { + "epoch": 0.5769214256888454, + "grad_norm": 1.6591578722000122, + "learning_rate": 4.249070824314652e-05, + "loss": 0.1553168773651123, + "step": 134380 + }, + { + "epoch": 0.5769643577788653, + "grad_norm": 0.05299576371908188, + "learning_rate": 4.2486396523028895e-05, + "loss": 0.3026347875595093, + "step": 134390 + }, + { + "epoch": 0.5770072898688854, + "grad_norm": 0.5824448466300964, + "learning_rate": 4.248208480291128e-05, + "loss": 0.2127014398574829, + "step": 134400 + }, + { + "epoch": 0.5770502219589054, + "grad_norm": 0.16627003252506256, + "learning_rate": 4.247777308279366e-05, + "loss": 0.19256727695465087, + "step": 134410 + }, + { + "epoch": 0.5770931540489254, + "grad_norm": 0.4351569414138794, + "learning_rate": 4.247346136267603e-05, + "loss": 0.008743252605199814, + "step": 134420 + }, + { + "epoch": 0.5771360861389454, + "grad_norm": 3.0534777641296387, + "learning_rate": 4.2469149642558405e-05, + "loss": 0.21316053867340087, + "step": 134430 + }, + { + "epoch": 0.5771790182289654, + "grad_norm": 1.68711256980896, + "learning_rate": 4.246483792244078e-05, + "loss": 0.2891331434249878, + "step": 134440 + }, + { + "epoch": 0.5772219503189854, + "grad_norm": 0.023587733507156372, + "learning_rate": 4.246052620232316e-05, + "loss": 0.22894837856292724, + "step": 134450 + }, + { + "epoch": 0.5772648824090054, + "grad_norm": 0.07950850576162338, + "learning_rate": 4.245621448220553e-05, + "loss": 0.05865427851676941, + "step": 134460 + }, + { + "epoch": 0.5773078144990255, + "grad_norm": 0.0960141196846962, + "learning_rate": 4.245190276208791e-05, + "loss": 0.22196877002716064, + "step": 134470 + }, + { + "epoch": 0.5773507465890454, + "grad_norm": 0.01585448905825615, + "learning_rate": 4.2447591041970285e-05, + "loss": 0.29195680618286135, + "step": 134480 + }, + { + "epoch": 0.5773936786790654, + "grad_norm": 0.05223434045910835, + "learning_rate": 4.244327932185266e-05, + "loss": 0.07586430311203003, + "step": 134490 + }, + { + "epoch": 0.5774366107690855, + "grad_norm": 10.157803535461426, + "learning_rate": 4.243896760173503e-05, + "loss": 0.40190610885620115, + "step": 134500 + }, + { + "epoch": 0.5774795428591055, + "grad_norm": 2.9556596279144287, + "learning_rate": 4.243465588161742e-05, + "loss": 0.3310459852218628, + "step": 134510 + }, + { + "epoch": 0.5775224749491255, + "grad_norm": 1.8336342573165894, + "learning_rate": 4.2430344161499794e-05, + "loss": 0.2758753299713135, + "step": 134520 + }, + { + "epoch": 0.5775654070391455, + "grad_norm": 0.02351958490908146, + "learning_rate": 4.242603244138217e-05, + "loss": 0.3395817995071411, + "step": 134530 + }, + { + "epoch": 0.5776083391291655, + "grad_norm": 1.333340048789978, + "learning_rate": 4.242172072126454e-05, + "loss": 0.19011322259902955, + "step": 134540 + }, + { + "epoch": 0.5776512712191855, + "grad_norm": 0.27546411752700806, + "learning_rate": 4.241740900114692e-05, + "loss": 0.2953900575637817, + "step": 134550 + }, + { + "epoch": 0.5776942033092055, + "grad_norm": 3.004845142364502, + "learning_rate": 4.24130972810293e-05, + "loss": 0.23668646812438965, + "step": 134560 + }, + { + "epoch": 0.5777371353992256, + "grad_norm": 2.5741522312164307, + "learning_rate": 4.2408785560911674e-05, + "loss": 0.1605753183364868, + "step": 134570 + }, + { + "epoch": 0.5777800674892455, + "grad_norm": 0.09854859858751297, + "learning_rate": 4.240447384079405e-05, + "loss": 0.3948371171951294, + "step": 134580 + }, + { + "epoch": 0.5778229995792655, + "grad_norm": 0.07081442326307297, + "learning_rate": 4.240016212067642e-05, + "loss": 0.26161210536956786, + "step": 134590 + }, + { + "epoch": 0.5778659316692856, + "grad_norm": 0.08974912762641907, + "learning_rate": 4.23958504005588e-05, + "loss": 0.27458915710449217, + "step": 134600 + }, + { + "epoch": 0.5779088637593055, + "grad_norm": 0.7241619229316711, + "learning_rate": 4.2391538680441177e-05, + "loss": 0.23674228191375732, + "step": 134610 + }, + { + "epoch": 0.5779517958493255, + "grad_norm": 0.09924507886171341, + "learning_rate": 4.2387226960323554e-05, + "loss": 0.35862672328948975, + "step": 134620 + }, + { + "epoch": 0.5779947279393456, + "grad_norm": 0.3530639708042145, + "learning_rate": 4.238291524020593e-05, + "loss": 0.24312853813171387, + "step": 134630 + }, + { + "epoch": 0.5780376600293655, + "grad_norm": 0.06290092319250107, + "learning_rate": 4.237860352008831e-05, + "loss": 0.24631965160369873, + "step": 134640 + }, + { + "epoch": 0.5780805921193856, + "grad_norm": 0.24874712526798248, + "learning_rate": 4.2374291799970686e-05, + "loss": 0.12623435258865356, + "step": 134650 + }, + { + "epoch": 0.5781235242094056, + "grad_norm": 1.555324912071228, + "learning_rate": 4.236998007985306e-05, + "loss": 0.4415943145751953, + "step": 134660 + }, + { + "epoch": 0.5781664562994255, + "grad_norm": 0.0042195431888103485, + "learning_rate": 4.2365668359735434e-05, + "loss": 0.3294379711151123, + "step": 134670 + }, + { + "epoch": 0.5782093883894456, + "grad_norm": 2.310861587524414, + "learning_rate": 4.236135663961781e-05, + "loss": 0.4276157855987549, + "step": 134680 + }, + { + "epoch": 0.5782523204794656, + "grad_norm": 8.112129211425781, + "learning_rate": 4.235704491950019e-05, + "loss": 0.14049152135849, + "step": 134690 + }, + { + "epoch": 0.5782952525694856, + "grad_norm": 4.619983196258545, + "learning_rate": 4.2352733199382566e-05, + "loss": 0.0853570580482483, + "step": 134700 + }, + { + "epoch": 0.5783381846595056, + "grad_norm": 0.12085939943790436, + "learning_rate": 4.2348421479264936e-05, + "loss": 0.225472354888916, + "step": 134710 + }, + { + "epoch": 0.5783811167495256, + "grad_norm": 8.128291130065918, + "learning_rate": 4.2344109759147314e-05, + "loss": 0.24805455207824706, + "step": 134720 + }, + { + "epoch": 0.5784240488395456, + "grad_norm": 0.3583569824695587, + "learning_rate": 4.233979803902969e-05, + "loss": 0.2094562292098999, + "step": 134730 + }, + { + "epoch": 0.5784669809295656, + "grad_norm": 0.04759635776281357, + "learning_rate": 4.233548631891207e-05, + "loss": 0.24561095237731934, + "step": 134740 + }, + { + "epoch": 0.5785099130195857, + "grad_norm": 2.9910125732421875, + "learning_rate": 4.2331174598794446e-05, + "loss": 0.31358211040496825, + "step": 134750 + }, + { + "epoch": 0.5785528451096056, + "grad_norm": 6.14967155456543, + "learning_rate": 4.232686287867682e-05, + "loss": 0.3080138206481934, + "step": 134760 + }, + { + "epoch": 0.5785957771996256, + "grad_norm": 0.6008140444755554, + "learning_rate": 4.23225511585592e-05, + "loss": 0.242822003364563, + "step": 134770 + }, + { + "epoch": 0.5786387092896457, + "grad_norm": 0.17511683702468872, + "learning_rate": 4.231823943844158e-05, + "loss": 0.13240162134170533, + "step": 134780 + }, + { + "epoch": 0.5786816413796656, + "grad_norm": 0.07600205391645432, + "learning_rate": 4.231392771832395e-05, + "loss": 0.2748347282409668, + "step": 134790 + }, + { + "epoch": 0.5787245734696856, + "grad_norm": 0.017275772988796234, + "learning_rate": 4.2309615998206326e-05, + "loss": 0.16032135486602783, + "step": 134800 + }, + { + "epoch": 0.5787675055597057, + "grad_norm": 0.017104055732488632, + "learning_rate": 4.23053042780887e-05, + "loss": 0.10543267726898194, + "step": 134810 + }, + { + "epoch": 0.5788104376497256, + "grad_norm": 5.630326747894287, + "learning_rate": 4.230099255797108e-05, + "loss": 0.23811829090118408, + "step": 134820 + }, + { + "epoch": 0.5788533697397457, + "grad_norm": 0.06849951297044754, + "learning_rate": 4.229668083785345e-05, + "loss": 0.28866872787475584, + "step": 134830 + }, + { + "epoch": 0.5788963018297657, + "grad_norm": 0.1363976001739502, + "learning_rate": 4.229236911773583e-05, + "loss": 0.21454071998596191, + "step": 134840 + }, + { + "epoch": 0.5789392339197856, + "grad_norm": 0.07092367857694626, + "learning_rate": 4.2288057397618206e-05, + "loss": 0.2816061019897461, + "step": 134850 + }, + { + "epoch": 0.5789821660098057, + "grad_norm": 0.03237995132803917, + "learning_rate": 4.228374567750058e-05, + "loss": 0.1369353413581848, + "step": 134860 + }, + { + "epoch": 0.5790250980998257, + "grad_norm": 0.14592450857162476, + "learning_rate": 4.227943395738296e-05, + "loss": 0.16824755668640137, + "step": 134870 + }, + { + "epoch": 0.5790680301898457, + "grad_norm": 0.13835661113262177, + "learning_rate": 4.227512223726534e-05, + "loss": 0.1760498046875, + "step": 134880 + }, + { + "epoch": 0.5791109622798657, + "grad_norm": 0.913736879825592, + "learning_rate": 4.2270810517147715e-05, + "loss": 0.17487471103668212, + "step": 134890 + }, + { + "epoch": 0.5791538943698857, + "grad_norm": 0.0023353216238319874, + "learning_rate": 4.226649879703009e-05, + "loss": 0.27238619327545166, + "step": 134900 + }, + { + "epoch": 0.5791968264599057, + "grad_norm": 0.0350453183054924, + "learning_rate": 4.226218707691246e-05, + "loss": 0.07492238283157349, + "step": 134910 + }, + { + "epoch": 0.5792397585499257, + "grad_norm": 0.006240634713321924, + "learning_rate": 4.225787535679484e-05, + "loss": 0.30922422409057615, + "step": 134920 + }, + { + "epoch": 0.5792826906399458, + "grad_norm": 2.5151445865631104, + "learning_rate": 4.225356363667722e-05, + "loss": 0.11893032789230347, + "step": 134930 + }, + { + "epoch": 0.5793256227299658, + "grad_norm": 0.21421582996845245, + "learning_rate": 4.2249251916559595e-05, + "loss": 0.37455198764801023, + "step": 134940 + }, + { + "epoch": 0.5793685548199857, + "grad_norm": 3.1594583988189697, + "learning_rate": 4.224494019644197e-05, + "loss": 0.27971127033233645, + "step": 134950 + }, + { + "epoch": 0.5794114869100058, + "grad_norm": 0.001456442754715681, + "learning_rate": 4.224062847632434e-05, + "loss": 0.3134411096572876, + "step": 134960 + }, + { + "epoch": 0.5794544190000258, + "grad_norm": 0.03771829977631569, + "learning_rate": 4.223631675620672e-05, + "loss": 0.13740975856781007, + "step": 134970 + }, + { + "epoch": 0.5794973510900457, + "grad_norm": 0.6887321472167969, + "learning_rate": 4.22320050360891e-05, + "loss": 0.27420816421508787, + "step": 134980 + }, + { + "epoch": 0.5795402831800658, + "grad_norm": 0.015955857932567596, + "learning_rate": 4.2227693315971475e-05, + "loss": 0.13951371908187865, + "step": 134990 + }, + { + "epoch": 0.5795832152700858, + "grad_norm": 3.5314512252807617, + "learning_rate": 4.222338159585385e-05, + "loss": 0.20095674991607665, + "step": 135000 + }, + { + "epoch": 0.5795832152700858, + "eval_loss": 0.39967551827430725, + "eval_runtime": 27.1223, + "eval_samples_per_second": 3.687, + "eval_steps_per_second": 3.687, + "step": 135000 + }, + { + "epoch": 0.5796261473601058, + "grad_norm": 0.18443207442760468, + "learning_rate": 4.221906987573623e-05, + "loss": 0.16879316568374633, + "step": 135010 + }, + { + "epoch": 0.5796690794501258, + "grad_norm": 0.12480297684669495, + "learning_rate": 4.221475815561861e-05, + "loss": 0.03386820554733276, + "step": 135020 + }, + { + "epoch": 0.5797120115401458, + "grad_norm": 0.09635353833436966, + "learning_rate": 4.2210446435500984e-05, + "loss": 0.2205876588821411, + "step": 135030 + }, + { + "epoch": 0.5797549436301658, + "grad_norm": 0.8061900734901428, + "learning_rate": 4.2206134715383355e-05, + "loss": 0.15954869985580444, + "step": 135040 + }, + { + "epoch": 0.5797978757201858, + "grad_norm": 0.03748125210404396, + "learning_rate": 4.220182299526573e-05, + "loss": 0.06913055181503296, + "step": 135050 + }, + { + "epoch": 0.5798408078102059, + "grad_norm": 2.611194610595703, + "learning_rate": 4.219751127514811e-05, + "loss": 0.06778600215911865, + "step": 135060 + }, + { + "epoch": 0.5798837399002258, + "grad_norm": 0.07131896167993546, + "learning_rate": 4.219319955503049e-05, + "loss": 0.18905138969421387, + "step": 135070 + }, + { + "epoch": 0.5799266719902458, + "grad_norm": 1.3885791301727295, + "learning_rate": 4.218888783491286e-05, + "loss": 0.2527278423309326, + "step": 135080 + }, + { + "epoch": 0.5799696040802659, + "grad_norm": 0.007560020312666893, + "learning_rate": 4.2184576114795235e-05, + "loss": 0.0921245813369751, + "step": 135090 + }, + { + "epoch": 0.5800125361702858, + "grad_norm": 0.008850879967212677, + "learning_rate": 4.218026439467761e-05, + "loss": 0.3411016702651978, + "step": 135100 + }, + { + "epoch": 0.5800554682603059, + "grad_norm": 0.023805882781744003, + "learning_rate": 4.2175952674559996e-05, + "loss": 0.17042380571365356, + "step": 135110 + }, + { + "epoch": 0.5800984003503259, + "grad_norm": 0.17893634736537933, + "learning_rate": 4.217164095444237e-05, + "loss": 0.3274564266204834, + "step": 135120 + }, + { + "epoch": 0.5801413324403458, + "grad_norm": 0.0015078054275363684, + "learning_rate": 4.2167329234324744e-05, + "loss": 0.4564652442932129, + "step": 135130 + }, + { + "epoch": 0.5801842645303659, + "grad_norm": 0.9611347317695618, + "learning_rate": 4.216301751420712e-05, + "loss": 0.183595871925354, + "step": 135140 + }, + { + "epoch": 0.5802271966203859, + "grad_norm": 0.0027880992274731398, + "learning_rate": 4.21587057940895e-05, + "loss": 0.2240854024887085, + "step": 135150 + }, + { + "epoch": 0.5802701287104058, + "grad_norm": 9.634644508361816, + "learning_rate": 4.215439407397187e-05, + "loss": 0.24422330856323243, + "step": 135160 + }, + { + "epoch": 0.5803130608004259, + "grad_norm": 0.005043448880314827, + "learning_rate": 4.215008235385425e-05, + "loss": 0.3264390230178833, + "step": 135170 + }, + { + "epoch": 0.5803559928904459, + "grad_norm": 0.02733713388442993, + "learning_rate": 4.2145770633736624e-05, + "loss": 0.3870868682861328, + "step": 135180 + }, + { + "epoch": 0.5803989249804659, + "grad_norm": 2.4984142780303955, + "learning_rate": 4.2141458913619e-05, + "loss": 0.2757649183273315, + "step": 135190 + }, + { + "epoch": 0.5804418570704859, + "grad_norm": 0.08463925123214722, + "learning_rate": 4.213714719350137e-05, + "loss": 0.07629125714302062, + "step": 135200 + }, + { + "epoch": 0.5804847891605059, + "grad_norm": 0.5990811586380005, + "learning_rate": 4.213283547338375e-05, + "loss": 0.12150636911392212, + "step": 135210 + }, + { + "epoch": 0.5805277212505259, + "grad_norm": 0.08332854509353638, + "learning_rate": 4.2128523753266134e-05, + "loss": 0.14046154022216797, + "step": 135220 + }, + { + "epoch": 0.5805706533405459, + "grad_norm": 0.011388557031750679, + "learning_rate": 4.212421203314851e-05, + "loss": 0.005377927795052528, + "step": 135230 + }, + { + "epoch": 0.580613585430566, + "grad_norm": 0.5718120336532593, + "learning_rate": 4.211990031303088e-05, + "loss": 0.1931094765663147, + "step": 135240 + }, + { + "epoch": 0.5806565175205859, + "grad_norm": 0.014416528865695, + "learning_rate": 4.211558859291326e-05, + "loss": 0.2976668357849121, + "step": 135250 + }, + { + "epoch": 0.5806994496106059, + "grad_norm": 0.021259639412164688, + "learning_rate": 4.2111276872795636e-05, + "loss": 0.11948213577270508, + "step": 135260 + }, + { + "epoch": 0.580742381700626, + "grad_norm": 2.3595197200775146, + "learning_rate": 4.2106965152678013e-05, + "loss": 0.2929967164993286, + "step": 135270 + }, + { + "epoch": 0.5807853137906459, + "grad_norm": 2.1110732555389404, + "learning_rate": 4.2102653432560384e-05, + "loss": 0.21944453716278076, + "step": 135280 + }, + { + "epoch": 0.580828245880666, + "grad_norm": 0.5368881225585938, + "learning_rate": 4.209834171244276e-05, + "loss": 0.13306208848953247, + "step": 135290 + }, + { + "epoch": 0.580871177970686, + "grad_norm": 0.3135150074958801, + "learning_rate": 4.209402999232514e-05, + "loss": 0.16415667533874512, + "step": 135300 + }, + { + "epoch": 0.5809141100607059, + "grad_norm": 1.6263655424118042, + "learning_rate": 4.2089718272207516e-05, + "loss": 0.3677335262298584, + "step": 135310 + }, + { + "epoch": 0.580957042150726, + "grad_norm": 1.274057149887085, + "learning_rate": 4.2085406552089893e-05, + "loss": 0.1566999673843384, + "step": 135320 + }, + { + "epoch": 0.580999974240746, + "grad_norm": 0.06209641322493553, + "learning_rate": 4.208109483197227e-05, + "loss": 0.22127158641815187, + "step": 135330 + }, + { + "epoch": 0.5810429063307659, + "grad_norm": 0.05129466578364372, + "learning_rate": 4.207678311185465e-05, + "loss": 0.2501818180084229, + "step": 135340 + }, + { + "epoch": 0.581085838420786, + "grad_norm": 0.02592042274773121, + "learning_rate": 4.2072471391737025e-05, + "loss": 0.3072038650512695, + "step": 135350 + }, + { + "epoch": 0.581128770510806, + "grad_norm": 0.02230093814432621, + "learning_rate": 4.20681596716194e-05, + "loss": 0.32774603366851807, + "step": 135360 + }, + { + "epoch": 0.5811717026008261, + "grad_norm": 0.9433190822601318, + "learning_rate": 4.206384795150177e-05, + "loss": 0.3484883546829224, + "step": 135370 + }, + { + "epoch": 0.581214634690846, + "grad_norm": 0.004100241232663393, + "learning_rate": 4.205953623138415e-05, + "loss": 0.25763185024261476, + "step": 135380 + }, + { + "epoch": 0.581257566780866, + "grad_norm": 0.16599683463573456, + "learning_rate": 4.205522451126653e-05, + "loss": 0.2505820274353027, + "step": 135390 + }, + { + "epoch": 0.5813004988708861, + "grad_norm": 0.03479776531457901, + "learning_rate": 4.2050912791148905e-05, + "loss": 0.16014200448989868, + "step": 135400 + }, + { + "epoch": 0.581343430960906, + "grad_norm": 2.5248851776123047, + "learning_rate": 4.2046601071031276e-05, + "loss": 0.3151650667190552, + "step": 135410 + }, + { + "epoch": 0.5813863630509261, + "grad_norm": 0.9223288893699646, + "learning_rate": 4.204228935091365e-05, + "loss": 0.2693711996078491, + "step": 135420 + }, + { + "epoch": 0.5814292951409461, + "grad_norm": 0.21809233725070953, + "learning_rate": 4.203797763079603e-05, + "loss": 0.0456573337316513, + "step": 135430 + }, + { + "epoch": 0.581472227230966, + "grad_norm": 1.4331225156784058, + "learning_rate": 4.203366591067841e-05, + "loss": 0.18495348691940308, + "step": 135440 + }, + { + "epoch": 0.5815151593209861, + "grad_norm": 5.573430061340332, + "learning_rate": 4.2029354190560785e-05, + "loss": 0.3089458465576172, + "step": 135450 + }, + { + "epoch": 0.5815580914110061, + "grad_norm": 0.005199179518967867, + "learning_rate": 4.202504247044316e-05, + "loss": 0.1684165120124817, + "step": 135460 + }, + { + "epoch": 0.581601023501026, + "grad_norm": 1.1721888780593872, + "learning_rate": 4.202073075032554e-05, + "loss": 0.30724520683288575, + "step": 135470 + }, + { + "epoch": 0.5816439555910461, + "grad_norm": 0.004575973842293024, + "learning_rate": 4.201641903020792e-05, + "loss": 0.15588613748550414, + "step": 135480 + }, + { + "epoch": 0.5816868876810661, + "grad_norm": 0.021832408383488655, + "learning_rate": 4.201210731009029e-05, + "loss": 0.04639666378498077, + "step": 135490 + }, + { + "epoch": 0.5817298197710861, + "grad_norm": 2.099874973297119, + "learning_rate": 4.2007795589972665e-05, + "loss": 0.23207998275756836, + "step": 135500 + }, + { + "epoch": 0.5817727518611061, + "grad_norm": 0.16797228157520294, + "learning_rate": 4.200348386985504e-05, + "loss": 0.18904383182525636, + "step": 135510 + }, + { + "epoch": 0.5818156839511262, + "grad_norm": 0.009826351888477802, + "learning_rate": 4.199917214973742e-05, + "loss": 0.13186975717544555, + "step": 135520 + }, + { + "epoch": 0.5818586160411461, + "grad_norm": 1.4313544034957886, + "learning_rate": 4.199486042961979e-05, + "loss": 0.191562283039093, + "step": 135530 + }, + { + "epoch": 0.5819015481311661, + "grad_norm": 0.16057445108890533, + "learning_rate": 4.199054870950217e-05, + "loss": 0.07290871739387512, + "step": 135540 + }, + { + "epoch": 0.5819444802211862, + "grad_norm": 1.3174405097961426, + "learning_rate": 4.1986236989384545e-05, + "loss": 0.20775175094604492, + "step": 135550 + }, + { + "epoch": 0.5819874123112061, + "grad_norm": 1.7092238664627075, + "learning_rate": 4.198192526926692e-05, + "loss": 0.22169027328491211, + "step": 135560 + }, + { + "epoch": 0.5820303444012261, + "grad_norm": 0.14638565480709076, + "learning_rate": 4.19776135491493e-05, + "loss": 0.11512167453765869, + "step": 135570 + }, + { + "epoch": 0.5820732764912462, + "grad_norm": 0.22287026047706604, + "learning_rate": 4.197330182903168e-05, + "loss": 0.31687591075897215, + "step": 135580 + }, + { + "epoch": 0.5821162085812661, + "grad_norm": 1.248632788658142, + "learning_rate": 4.1968990108914055e-05, + "loss": 0.16288487911224364, + "step": 135590 + }, + { + "epoch": 0.5821591406712862, + "grad_norm": 1.9880515336990356, + "learning_rate": 4.196467838879643e-05, + "loss": 0.07543573975563049, + "step": 135600 + }, + { + "epoch": 0.5822020727613062, + "grad_norm": 0.03087700717151165, + "learning_rate": 4.19603666686788e-05, + "loss": 0.23086144924163818, + "step": 135610 + }, + { + "epoch": 0.5822450048513261, + "grad_norm": 0.011071198619902134, + "learning_rate": 4.195605494856118e-05, + "loss": 0.3214784383773804, + "step": 135620 + }, + { + "epoch": 0.5822879369413462, + "grad_norm": 1.2527273893356323, + "learning_rate": 4.195174322844356e-05, + "loss": 0.5449256896972656, + "step": 135630 + }, + { + "epoch": 0.5823308690313662, + "grad_norm": 8.132579803466797, + "learning_rate": 4.1947431508325934e-05, + "loss": 0.29090328216552735, + "step": 135640 + }, + { + "epoch": 0.5823738011213861, + "grad_norm": 6.972388744354248, + "learning_rate": 4.1943119788208305e-05, + "loss": 0.3711055278778076, + "step": 135650 + }, + { + "epoch": 0.5824167332114062, + "grad_norm": 0.29669734835624695, + "learning_rate": 4.193880806809068e-05, + "loss": 0.27991471290588377, + "step": 135660 + }, + { + "epoch": 0.5824596653014262, + "grad_norm": 0.022353434935212135, + "learning_rate": 4.193449634797306e-05, + "loss": 0.07086479663848877, + "step": 135670 + }, + { + "epoch": 0.5825025973914462, + "grad_norm": 0.013063160702586174, + "learning_rate": 4.193018462785544e-05, + "loss": 0.2063464879989624, + "step": 135680 + }, + { + "epoch": 0.5825455294814662, + "grad_norm": 1.8125168085098267, + "learning_rate": 4.1925872907737814e-05, + "loss": 0.2858951330184937, + "step": 135690 + }, + { + "epoch": 0.5825884615714862, + "grad_norm": 0.05596992373466492, + "learning_rate": 4.192156118762019e-05, + "loss": 0.08188768625259399, + "step": 135700 + }, + { + "epoch": 0.5826313936615062, + "grad_norm": 0.027829289436340332, + "learning_rate": 4.191724946750257e-05, + "loss": 0.3078721761703491, + "step": 135710 + }, + { + "epoch": 0.5826743257515262, + "grad_norm": 0.04408969357609749, + "learning_rate": 4.1912937747384946e-05, + "loss": 0.26244316101074217, + "step": 135720 + }, + { + "epoch": 0.5827172578415463, + "grad_norm": 0.01925904117524624, + "learning_rate": 4.1908626027267324e-05, + "loss": 0.10048316717147827, + "step": 135730 + }, + { + "epoch": 0.5827601899315662, + "grad_norm": 0.034861356019973755, + "learning_rate": 4.1904314307149694e-05, + "loss": 0.266707706451416, + "step": 135740 + }, + { + "epoch": 0.5828031220215862, + "grad_norm": 0.011179475113749504, + "learning_rate": 4.190000258703207e-05, + "loss": 0.118767249584198, + "step": 135750 + }, + { + "epoch": 0.5828460541116063, + "grad_norm": 0.34233781695365906, + "learning_rate": 4.189569086691445e-05, + "loss": 0.21334223747253417, + "step": 135760 + }, + { + "epoch": 0.5828889862016262, + "grad_norm": 1.855711817741394, + "learning_rate": 4.1891379146796826e-05, + "loss": 0.2099367380142212, + "step": 135770 + }, + { + "epoch": 0.5829319182916463, + "grad_norm": 0.0024516810663044453, + "learning_rate": 4.18870674266792e-05, + "loss": 0.1486857533454895, + "step": 135780 + }, + { + "epoch": 0.5829748503816663, + "grad_norm": 0.004157458897680044, + "learning_rate": 4.1882755706561574e-05, + "loss": 0.2245945692062378, + "step": 135790 + }, + { + "epoch": 0.5830177824716863, + "grad_norm": 0.8604725003242493, + "learning_rate": 4.187844398644395e-05, + "loss": 0.21948962211608886, + "step": 135800 + }, + { + "epoch": 0.5830607145617063, + "grad_norm": 0.6217984557151794, + "learning_rate": 4.1874132266326336e-05, + "loss": 0.2490144968032837, + "step": 135810 + }, + { + "epoch": 0.5831036466517263, + "grad_norm": 0.030795378610491753, + "learning_rate": 4.1869820546208706e-05, + "loss": 0.3707392930984497, + "step": 135820 + }, + { + "epoch": 0.5831465787417464, + "grad_norm": 0.021570665761828423, + "learning_rate": 4.1865508826091084e-05, + "loss": 0.4674999237060547, + "step": 135830 + }, + { + "epoch": 0.5831895108317663, + "grad_norm": 0.17849326133728027, + "learning_rate": 4.186119710597346e-05, + "loss": 0.11789641380310059, + "step": 135840 + }, + { + "epoch": 0.5832324429217863, + "grad_norm": 0.024820350110530853, + "learning_rate": 4.185688538585584e-05, + "loss": 0.0743901014328003, + "step": 135850 + }, + { + "epoch": 0.5832753750118064, + "grad_norm": 0.003563225269317627, + "learning_rate": 4.185257366573821e-05, + "loss": 0.2821930408477783, + "step": 135860 + }, + { + "epoch": 0.5833183071018263, + "grad_norm": 0.21024373173713684, + "learning_rate": 4.1848261945620586e-05, + "loss": 0.04329926371574402, + "step": 135870 + }, + { + "epoch": 0.5833612391918463, + "grad_norm": 0.002765827113762498, + "learning_rate": 4.1843950225502964e-05, + "loss": 0.297293496131897, + "step": 135880 + }, + { + "epoch": 0.5834041712818664, + "grad_norm": 0.0005890359170734882, + "learning_rate": 4.183963850538534e-05, + "loss": 0.05487467646598816, + "step": 135890 + }, + { + "epoch": 0.5834471033718863, + "grad_norm": 0.00514583382755518, + "learning_rate": 4.183532678526771e-05, + "loss": 0.06383908390998841, + "step": 135900 + }, + { + "epoch": 0.5834900354619064, + "grad_norm": 0.1808479130268097, + "learning_rate": 4.183101506515009e-05, + "loss": 0.2124013900756836, + "step": 135910 + }, + { + "epoch": 0.5835329675519264, + "grad_norm": 0.027015380561351776, + "learning_rate": 4.182670334503247e-05, + "loss": 0.09643960595130921, + "step": 135920 + }, + { + "epoch": 0.5835758996419463, + "grad_norm": 2.4606871604919434, + "learning_rate": 4.182239162491485e-05, + "loss": 0.3026925325393677, + "step": 135930 + }, + { + "epoch": 0.5836188317319664, + "grad_norm": 0.0760890394449234, + "learning_rate": 4.181807990479722e-05, + "loss": 0.4359142303466797, + "step": 135940 + }, + { + "epoch": 0.5836617638219864, + "grad_norm": 0.12953132390975952, + "learning_rate": 4.18137681846796e-05, + "loss": 0.2932579040527344, + "step": 135950 + }, + { + "epoch": 0.5837046959120064, + "grad_norm": 1.3948577642440796, + "learning_rate": 4.1809456464561976e-05, + "loss": 0.25689287185668946, + "step": 135960 + }, + { + "epoch": 0.5837476280020264, + "grad_norm": 0.15391452610492706, + "learning_rate": 4.180514474444435e-05, + "loss": 0.3240983486175537, + "step": 135970 + }, + { + "epoch": 0.5837905600920464, + "grad_norm": 0.2480820119380951, + "learning_rate": 4.1800833024326723e-05, + "loss": 0.031171566247940062, + "step": 135980 + }, + { + "epoch": 0.5838334921820664, + "grad_norm": 0.11079661548137665, + "learning_rate": 4.17965213042091e-05, + "loss": 0.34694485664367675, + "step": 135990 + }, + { + "epoch": 0.5838764242720864, + "grad_norm": 1.795325517654419, + "learning_rate": 4.179220958409148e-05, + "loss": 0.08840734362602234, + "step": 136000 + }, + { + "epoch": 0.5838764242720864, + "eval_loss": 0.4013851284980774, + "eval_runtime": 27.0902, + "eval_samples_per_second": 3.691, + "eval_steps_per_second": 3.691, + "step": 136000 + }, + { + "epoch": 0.5839193563621065, + "grad_norm": 0.025133926421403885, + "learning_rate": 4.1787897863973856e-05, + "loss": 0.1564624071121216, + "step": 136010 + }, + { + "epoch": 0.5839622884521264, + "grad_norm": 0.009930574335157871, + "learning_rate": 4.1783586143856226e-05, + "loss": 0.1523146629333496, + "step": 136020 + }, + { + "epoch": 0.5840052205421464, + "grad_norm": 0.08300717920064926, + "learning_rate": 4.177927442373861e-05, + "loss": 0.09917604327201843, + "step": 136030 + }, + { + "epoch": 0.5840481526321665, + "grad_norm": 1.5571662187576294, + "learning_rate": 4.177496270362099e-05, + "loss": 0.2037580966949463, + "step": 136040 + }, + { + "epoch": 0.5840910847221864, + "grad_norm": 0.001757492427714169, + "learning_rate": 4.1770650983503365e-05, + "loss": 0.21353843212127685, + "step": 136050 + }, + { + "epoch": 0.5841340168122064, + "grad_norm": 0.0005412331083789468, + "learning_rate": 4.176633926338574e-05, + "loss": 0.09854020476341248, + "step": 136060 + }, + { + "epoch": 0.5841769489022265, + "grad_norm": 1.7572810649871826, + "learning_rate": 4.176202754326811e-05, + "loss": 0.15027815103530884, + "step": 136070 + }, + { + "epoch": 0.5842198809922464, + "grad_norm": 0.08901038765907288, + "learning_rate": 4.175771582315049e-05, + "loss": 0.036554208397865294, + "step": 136080 + }, + { + "epoch": 0.5842628130822665, + "grad_norm": 0.03874312341213226, + "learning_rate": 4.175340410303287e-05, + "loss": 0.08065502643585205, + "step": 136090 + }, + { + "epoch": 0.5843057451722865, + "grad_norm": 0.0010567232966423035, + "learning_rate": 4.1749092382915245e-05, + "loss": 0.11951396465301514, + "step": 136100 + }, + { + "epoch": 0.5843486772623064, + "grad_norm": 0.32772913575172424, + "learning_rate": 4.1744780662797615e-05, + "loss": 0.007878247648477554, + "step": 136110 + }, + { + "epoch": 0.5843916093523265, + "grad_norm": 10.457266807556152, + "learning_rate": 4.174046894267999e-05, + "loss": 0.3108761072158813, + "step": 136120 + }, + { + "epoch": 0.5844345414423465, + "grad_norm": 5.449347972869873, + "learning_rate": 4.173615722256237e-05, + "loss": 0.09093397259712219, + "step": 136130 + }, + { + "epoch": 0.5844774735323665, + "grad_norm": 0.06236616149544716, + "learning_rate": 4.173184550244475e-05, + "loss": 0.202095890045166, + "step": 136140 + }, + { + "epoch": 0.5845204056223865, + "grad_norm": 0.038584187626838684, + "learning_rate": 4.1727533782327125e-05, + "loss": 0.16632542610168458, + "step": 136150 + }, + { + "epoch": 0.5845633377124065, + "grad_norm": 0.006404801271855831, + "learning_rate": 4.17232220622095e-05, + "loss": 0.15790761709213258, + "step": 136160 + }, + { + "epoch": 0.5846062698024265, + "grad_norm": 1.5933825969696045, + "learning_rate": 4.171891034209188e-05, + "loss": 0.326295280456543, + "step": 136170 + }, + { + "epoch": 0.5846492018924465, + "grad_norm": 0.18334230780601501, + "learning_rate": 4.171459862197426e-05, + "loss": 0.16310806274414064, + "step": 136180 + }, + { + "epoch": 0.5846921339824666, + "grad_norm": 2.3053152561187744, + "learning_rate": 4.171028690185663e-05, + "loss": 0.13464561700820923, + "step": 136190 + }, + { + "epoch": 0.5847350660724865, + "grad_norm": 0.6618971228599548, + "learning_rate": 4.1705975181739005e-05, + "loss": 0.12008780241012573, + "step": 136200 + }, + { + "epoch": 0.5847779981625065, + "grad_norm": 0.05321823060512543, + "learning_rate": 4.170166346162138e-05, + "loss": 0.2755621671676636, + "step": 136210 + }, + { + "epoch": 0.5848209302525266, + "grad_norm": 0.3745076656341553, + "learning_rate": 4.169735174150376e-05, + "loss": 0.22431304454803466, + "step": 136220 + }, + { + "epoch": 0.5848638623425466, + "grad_norm": 0.003494755132123828, + "learning_rate": 4.169304002138613e-05, + "loss": 0.09709044694900512, + "step": 136230 + }, + { + "epoch": 0.5849067944325665, + "grad_norm": 0.010988358408212662, + "learning_rate": 4.168872830126851e-05, + "loss": 0.30329973697662355, + "step": 136240 + }, + { + "epoch": 0.5849497265225866, + "grad_norm": 0.014921767637133598, + "learning_rate": 4.1684416581150885e-05, + "loss": 0.171160888671875, + "step": 136250 + }, + { + "epoch": 0.5849926586126066, + "grad_norm": 0.07725328207015991, + "learning_rate": 4.168010486103326e-05, + "loss": 0.04647146165370941, + "step": 136260 + }, + { + "epoch": 0.5850355907026266, + "grad_norm": 0.023668555542826653, + "learning_rate": 4.167579314091564e-05, + "loss": 0.09584589004516601, + "step": 136270 + }, + { + "epoch": 0.5850785227926466, + "grad_norm": 0.9428960680961609, + "learning_rate": 4.167148142079802e-05, + "loss": 0.26876513957977294, + "step": 136280 + }, + { + "epoch": 0.5851214548826666, + "grad_norm": 2.354199171066284, + "learning_rate": 4.1667169700680394e-05, + "loss": 0.13666833639144899, + "step": 136290 + }, + { + "epoch": 0.5851643869726866, + "grad_norm": 0.027356013655662537, + "learning_rate": 4.166285798056277e-05, + "loss": 0.6024814128875733, + "step": 136300 + }, + { + "epoch": 0.5852073190627066, + "grad_norm": 3.293339252471924, + "learning_rate": 4.165854626044514e-05, + "loss": 0.29547069072723386, + "step": 136310 + }, + { + "epoch": 0.5852502511527267, + "grad_norm": 0.022677229717373848, + "learning_rate": 4.165423454032752e-05, + "loss": 0.22778058052062988, + "step": 136320 + }, + { + "epoch": 0.5852931832427466, + "grad_norm": 0.14646054804325104, + "learning_rate": 4.16499228202099e-05, + "loss": 0.3935666084289551, + "step": 136330 + }, + { + "epoch": 0.5853361153327666, + "grad_norm": 0.07201693207025528, + "learning_rate": 4.1645611100092274e-05, + "loss": 0.20393807888031007, + "step": 136340 + }, + { + "epoch": 0.5853790474227867, + "grad_norm": 0.009186267852783203, + "learning_rate": 4.1641299379974645e-05, + "loss": 0.2039719820022583, + "step": 136350 + }, + { + "epoch": 0.5854219795128066, + "grad_norm": 0.13840673863887787, + "learning_rate": 4.163698765985702e-05, + "loss": 0.27695910930633544, + "step": 136360 + }, + { + "epoch": 0.5854649116028267, + "grad_norm": 0.0026640286669135094, + "learning_rate": 4.16326759397394e-05, + "loss": 0.2907125949859619, + "step": 136370 + }, + { + "epoch": 0.5855078436928467, + "grad_norm": 5.028379917144775, + "learning_rate": 4.1628364219621777e-05, + "loss": 0.2319865942001343, + "step": 136380 + }, + { + "epoch": 0.5855507757828666, + "grad_norm": 0.854225218296051, + "learning_rate": 4.1624052499504154e-05, + "loss": 0.21682953834533691, + "step": 136390 + }, + { + "epoch": 0.5855937078728867, + "grad_norm": 0.0029383532237261534, + "learning_rate": 4.161974077938653e-05, + "loss": 0.10343109369277954, + "step": 136400 + }, + { + "epoch": 0.5856366399629067, + "grad_norm": 2.0140578746795654, + "learning_rate": 4.161542905926891e-05, + "loss": 0.47559642791748047, + "step": 136410 + }, + { + "epoch": 0.5856795720529266, + "grad_norm": 1.8081625699996948, + "learning_rate": 4.1611117339151286e-05, + "loss": 0.22019548416137696, + "step": 136420 + }, + { + "epoch": 0.5857225041429467, + "grad_norm": 0.3506382703781128, + "learning_rate": 4.160680561903366e-05, + "loss": 0.3239941358566284, + "step": 136430 + }, + { + "epoch": 0.5857654362329667, + "grad_norm": 0.23854027688503265, + "learning_rate": 4.1602493898916034e-05, + "loss": 0.2802767276763916, + "step": 136440 + }, + { + "epoch": 0.5858083683229867, + "grad_norm": 0.02299383655190468, + "learning_rate": 4.159818217879841e-05, + "loss": 0.1584943175315857, + "step": 136450 + }, + { + "epoch": 0.5858513004130067, + "grad_norm": 0.9188843369483948, + "learning_rate": 4.159387045868079e-05, + "loss": 0.10783770084381103, + "step": 136460 + }, + { + "epoch": 0.5858942325030267, + "grad_norm": 6.9701714515686035, + "learning_rate": 4.1589558738563166e-05, + "loss": 0.11004831790924072, + "step": 136470 + }, + { + "epoch": 0.5859371645930467, + "grad_norm": 0.01801559142768383, + "learning_rate": 4.1585247018445536e-05, + "loss": 0.14326504468917847, + "step": 136480 + }, + { + "epoch": 0.5859800966830667, + "grad_norm": 0.0658947303891182, + "learning_rate": 4.1580935298327914e-05, + "loss": 0.1374224305152893, + "step": 136490 + }, + { + "epoch": 0.5860230287730868, + "grad_norm": 0.2008223682641983, + "learning_rate": 4.157662357821029e-05, + "loss": 0.30278620719909666, + "step": 136500 + }, + { + "epoch": 0.5860659608631067, + "grad_norm": 0.3907320201396942, + "learning_rate": 4.157231185809267e-05, + "loss": 0.2772698163986206, + "step": 136510 + }, + { + "epoch": 0.5861088929531267, + "grad_norm": 0.04549934342503548, + "learning_rate": 4.1568000137975046e-05, + "loss": 0.10799868106842041, + "step": 136520 + }, + { + "epoch": 0.5861518250431468, + "grad_norm": 0.05788165330886841, + "learning_rate": 4.156368841785742e-05, + "loss": 0.22272298336029053, + "step": 136530 + }, + { + "epoch": 0.5861947571331667, + "grad_norm": 0.045943133533000946, + "learning_rate": 4.15593766977398e-05, + "loss": 0.04048386216163635, + "step": 136540 + }, + { + "epoch": 0.5862376892231868, + "grad_norm": 0.02736165188252926, + "learning_rate": 4.155506497762218e-05, + "loss": 0.21302416324615478, + "step": 136550 + }, + { + "epoch": 0.5862806213132068, + "grad_norm": 0.0015208977274596691, + "learning_rate": 4.155075325750455e-05, + "loss": 0.04716380536556244, + "step": 136560 + }, + { + "epoch": 0.5863235534032267, + "grad_norm": 0.8822836875915527, + "learning_rate": 4.1546441537386926e-05, + "loss": 0.10152556896209716, + "step": 136570 + }, + { + "epoch": 0.5863664854932468, + "grad_norm": 0.005391272716224194, + "learning_rate": 4.15421298172693e-05, + "loss": 0.38786365985870364, + "step": 136580 + }, + { + "epoch": 0.5864094175832668, + "grad_norm": 0.001887031365185976, + "learning_rate": 4.153781809715168e-05, + "loss": 0.16974475383758544, + "step": 136590 + }, + { + "epoch": 0.5864523496732867, + "grad_norm": 0.05451970919966698, + "learning_rate": 4.153350637703405e-05, + "loss": 0.0324835479259491, + "step": 136600 + }, + { + "epoch": 0.5864952817633068, + "grad_norm": 0.5006858706474304, + "learning_rate": 4.152919465691643e-05, + "loss": 0.01447709947824478, + "step": 136610 + }, + { + "epoch": 0.5865382138533268, + "grad_norm": 0.9552084803581238, + "learning_rate": 4.1524882936798806e-05, + "loss": 0.21060488224029542, + "step": 136620 + }, + { + "epoch": 0.5865811459433468, + "grad_norm": 12.546867370605469, + "learning_rate": 4.152057121668119e-05, + "loss": 0.28877010345458987, + "step": 136630 + }, + { + "epoch": 0.5866240780333668, + "grad_norm": 0.008536313660442829, + "learning_rate": 4.151625949656356e-05, + "loss": 0.25785582065582274, + "step": 136640 + }, + { + "epoch": 0.5866670101233868, + "grad_norm": 10.071556091308594, + "learning_rate": 4.151194777644594e-05, + "loss": 0.3655982494354248, + "step": 136650 + }, + { + "epoch": 0.5867099422134069, + "grad_norm": 0.36693134903907776, + "learning_rate": 4.1507636056328315e-05, + "loss": 0.12875553369522094, + "step": 136660 + }, + { + "epoch": 0.5867528743034268, + "grad_norm": 0.004068433307111263, + "learning_rate": 4.150332433621069e-05, + "loss": 0.19526032209396363, + "step": 136670 + }, + { + "epoch": 0.5867958063934469, + "grad_norm": 0.011420795693993568, + "learning_rate": 4.149901261609306e-05, + "loss": 0.047950607538223264, + "step": 136680 + }, + { + "epoch": 0.5868387384834669, + "grad_norm": 1.7081316709518433, + "learning_rate": 4.149470089597544e-05, + "loss": 0.14381338357925416, + "step": 136690 + }, + { + "epoch": 0.5868816705734868, + "grad_norm": 0.000932833063416183, + "learning_rate": 4.149038917585782e-05, + "loss": 0.09737264513969421, + "step": 136700 + }, + { + "epoch": 0.5869246026635069, + "grad_norm": 0.6827306747436523, + "learning_rate": 4.1486077455740195e-05, + "loss": 0.3292029857635498, + "step": 136710 + }, + { + "epoch": 0.5869675347535269, + "grad_norm": 0.247352734208107, + "learning_rate": 4.1481765735622566e-05, + "loss": 0.20491859912872315, + "step": 136720 + }, + { + "epoch": 0.5870104668435469, + "grad_norm": 2.4465835094451904, + "learning_rate": 4.147745401550494e-05, + "loss": 0.5619071960449219, + "step": 136730 + }, + { + "epoch": 0.5870533989335669, + "grad_norm": 4.127469062805176, + "learning_rate": 4.147314229538733e-05, + "loss": 0.30172004699707033, + "step": 136740 + }, + { + "epoch": 0.5870963310235869, + "grad_norm": 0.15143781900405884, + "learning_rate": 4.1468830575269704e-05, + "loss": 0.04340499341487884, + "step": 136750 + }, + { + "epoch": 0.5871392631136069, + "grad_norm": 1.3688448667526245, + "learning_rate": 4.146451885515208e-05, + "loss": 0.26014227867126466, + "step": 136760 + }, + { + "epoch": 0.5871821952036269, + "grad_norm": 0.8132941722869873, + "learning_rate": 4.146020713503445e-05, + "loss": 0.20901427268981934, + "step": 136770 + }, + { + "epoch": 0.587225127293647, + "grad_norm": 1.0680354833602905, + "learning_rate": 4.145589541491683e-05, + "loss": 0.13693716526031494, + "step": 136780 + }, + { + "epoch": 0.5872680593836669, + "grad_norm": 0.024545999243855476, + "learning_rate": 4.145158369479921e-05, + "loss": 0.17069380283355712, + "step": 136790 + }, + { + "epoch": 0.5873109914736869, + "grad_norm": 0.020502969622612, + "learning_rate": 4.1447271974681584e-05, + "loss": 0.09372333884239196, + "step": 136800 + }, + { + "epoch": 0.587353923563707, + "grad_norm": 1.3037524223327637, + "learning_rate": 4.1442960254563955e-05, + "loss": 0.2567019462585449, + "step": 136810 + }, + { + "epoch": 0.5873968556537269, + "grad_norm": 0.127951979637146, + "learning_rate": 4.143864853444633e-05, + "loss": 0.3536144018173218, + "step": 136820 + }, + { + "epoch": 0.5874397877437469, + "grad_norm": 0.030075104907155037, + "learning_rate": 4.143433681432871e-05, + "loss": 0.14973297119140624, + "step": 136830 + }, + { + "epoch": 0.587482719833767, + "grad_norm": 0.015654215589165688, + "learning_rate": 4.143002509421109e-05, + "loss": 0.3059299230575562, + "step": 136840 + }, + { + "epoch": 0.5875256519237869, + "grad_norm": 0.15828372538089752, + "learning_rate": 4.1425713374093464e-05, + "loss": 0.17411817312240602, + "step": 136850 + }, + { + "epoch": 0.587568584013807, + "grad_norm": 2.2743732929229736, + "learning_rate": 4.142140165397584e-05, + "loss": 0.32296817302703856, + "step": 136860 + }, + { + "epoch": 0.587611516103827, + "grad_norm": 0.006136009003967047, + "learning_rate": 4.141708993385822e-05, + "loss": 0.1016544222831726, + "step": 136870 + }, + { + "epoch": 0.5876544481938469, + "grad_norm": 0.22323651611804962, + "learning_rate": 4.1412778213740596e-05, + "loss": 0.13947161436080932, + "step": 136880 + }, + { + "epoch": 0.587697380283867, + "grad_norm": 5.105968952178955, + "learning_rate": 4.140846649362297e-05, + "loss": 0.22951793670654297, + "step": 136890 + }, + { + "epoch": 0.587740312373887, + "grad_norm": 1.0845637321472168, + "learning_rate": 4.1404154773505344e-05, + "loss": 0.28329808712005616, + "step": 136900 + }, + { + "epoch": 0.587783244463907, + "grad_norm": 1.287009835243225, + "learning_rate": 4.139984305338772e-05, + "loss": 0.3320283889770508, + "step": 136910 + }, + { + "epoch": 0.587826176553927, + "grad_norm": 0.004043887369334698, + "learning_rate": 4.13955313332701e-05, + "loss": 0.13691928386688232, + "step": 136920 + }, + { + "epoch": 0.587869108643947, + "grad_norm": 0.025284389033913612, + "learning_rate": 4.139121961315247e-05, + "loss": 0.2444899320602417, + "step": 136930 + }, + { + "epoch": 0.587912040733967, + "grad_norm": 1.5387986898422241, + "learning_rate": 4.138690789303485e-05, + "loss": 0.3062190055847168, + "step": 136940 + }, + { + "epoch": 0.587954972823987, + "grad_norm": 0.003443187801167369, + "learning_rate": 4.1382596172917224e-05, + "loss": 0.07713412046432495, + "step": 136950 + }, + { + "epoch": 0.587997904914007, + "grad_norm": 0.006569002289324999, + "learning_rate": 4.13782844527996e-05, + "loss": 0.05037772059440613, + "step": 136960 + }, + { + "epoch": 0.588040837004027, + "grad_norm": 0.034869421273469925, + "learning_rate": 4.137397273268198e-05, + "loss": 0.2030102491378784, + "step": 136970 + }, + { + "epoch": 0.588083769094047, + "grad_norm": 0.007195018697530031, + "learning_rate": 4.1369661012564356e-05, + "loss": 0.29705181121826174, + "step": 136980 + }, + { + "epoch": 0.5881267011840671, + "grad_norm": 0.0021977697033435106, + "learning_rate": 4.1365349292446733e-05, + "loss": 0.39129085540771485, + "step": 136990 + }, + { + "epoch": 0.588169633274087, + "grad_norm": 4.102361679077148, + "learning_rate": 4.136103757232911e-05, + "loss": 0.2191321849822998, + "step": 137000 + }, + { + "epoch": 0.588169633274087, + "eval_loss": 0.3964814841747284, + "eval_runtime": 27.2479, + "eval_samples_per_second": 3.67, + "eval_steps_per_second": 3.67, + "step": 137000 + }, + { + "epoch": 0.588212565364107, + "grad_norm": 0.01128480490297079, + "learning_rate": 4.135672585221148e-05, + "loss": 0.16608066558837892, + "step": 137010 + }, + { + "epoch": 0.5882554974541271, + "grad_norm": 2.656604051589966, + "learning_rate": 4.135241413209386e-05, + "loss": 0.27288265228271485, + "step": 137020 + }, + { + "epoch": 0.588298429544147, + "grad_norm": 0.0048962910659611225, + "learning_rate": 4.1348102411976236e-05, + "loss": 0.3048482656478882, + "step": 137030 + }, + { + "epoch": 0.5883413616341671, + "grad_norm": 1.3734676837921143, + "learning_rate": 4.1343790691858613e-05, + "loss": 0.1047094702720642, + "step": 137040 + }, + { + "epoch": 0.5883842937241871, + "grad_norm": 0.005558597389608622, + "learning_rate": 4.1339478971740984e-05, + "loss": 0.18236162662506103, + "step": 137050 + }, + { + "epoch": 0.588427225814207, + "grad_norm": 0.23292341828346252, + "learning_rate": 4.133516725162336e-05, + "loss": 0.3300944328308105, + "step": 137060 + }, + { + "epoch": 0.5884701579042271, + "grad_norm": 1.9232966899871826, + "learning_rate": 4.133085553150574e-05, + "loss": 0.2621130466461182, + "step": 137070 + }, + { + "epoch": 0.5885130899942471, + "grad_norm": 0.0006944802007637918, + "learning_rate": 4.1326543811388116e-05, + "loss": 0.26037561893463135, + "step": 137080 + }, + { + "epoch": 0.5885560220842672, + "grad_norm": 0.08561205863952637, + "learning_rate": 4.132223209127049e-05, + "loss": 0.2053438186645508, + "step": 137090 + }, + { + "epoch": 0.5885989541742871, + "grad_norm": 0.37227103114128113, + "learning_rate": 4.131792037115287e-05, + "loss": 0.22090797424316405, + "step": 137100 + }, + { + "epoch": 0.5886418862643071, + "grad_norm": 1.0453672409057617, + "learning_rate": 4.131360865103525e-05, + "loss": 0.3329441547393799, + "step": 137110 + }, + { + "epoch": 0.5886848183543272, + "grad_norm": 0.08478416502475739, + "learning_rate": 4.1309296930917625e-05, + "loss": 0.28282577991485597, + "step": 137120 + }, + { + "epoch": 0.5887277504443471, + "grad_norm": 0.0006766520091332495, + "learning_rate": 4.13049852108e-05, + "loss": 0.08505859375, + "step": 137130 + }, + { + "epoch": 0.5887706825343672, + "grad_norm": 1.1568154096603394, + "learning_rate": 4.130067349068237e-05, + "loss": 0.0769594967365265, + "step": 137140 + }, + { + "epoch": 0.5888136146243872, + "grad_norm": 0.004215045366436243, + "learning_rate": 4.129636177056475e-05, + "loss": 0.16643118858337402, + "step": 137150 + }, + { + "epoch": 0.5888565467144071, + "grad_norm": 12.162041664123535, + "learning_rate": 4.129205005044713e-05, + "loss": 0.15993156433105468, + "step": 137160 + }, + { + "epoch": 0.5888994788044272, + "grad_norm": 0.13196447491645813, + "learning_rate": 4.1287738330329505e-05, + "loss": 0.1578353762626648, + "step": 137170 + }, + { + "epoch": 0.5889424108944472, + "grad_norm": 0.6302490830421448, + "learning_rate": 4.1283426610211876e-05, + "loss": 0.1752261519432068, + "step": 137180 + }, + { + "epoch": 0.5889853429844671, + "grad_norm": 4.5182013511657715, + "learning_rate": 4.127911489009425e-05, + "loss": 0.3914973497390747, + "step": 137190 + }, + { + "epoch": 0.5890282750744872, + "grad_norm": 0.09273222088813782, + "learning_rate": 4.127480316997663e-05, + "loss": 0.0532687246799469, + "step": 137200 + }, + { + "epoch": 0.5890712071645072, + "grad_norm": 0.080787293612957, + "learning_rate": 4.127049144985901e-05, + "loss": 0.14222522974014282, + "step": 137210 + }, + { + "epoch": 0.5891141392545272, + "grad_norm": 1.3788886070251465, + "learning_rate": 4.1266179729741385e-05, + "loss": 0.22479968070983886, + "step": 137220 + }, + { + "epoch": 0.5891570713445472, + "grad_norm": 0.0028310460038483143, + "learning_rate": 4.126186800962376e-05, + "loss": 0.2262204885482788, + "step": 137230 + }, + { + "epoch": 0.5892000034345672, + "grad_norm": 3.5362157821655273, + "learning_rate": 4.125755628950614e-05, + "loss": 0.4202101230621338, + "step": 137240 + }, + { + "epoch": 0.5892429355245872, + "grad_norm": 0.7681101560592651, + "learning_rate": 4.125324456938852e-05, + "loss": 0.19651840925216674, + "step": 137250 + }, + { + "epoch": 0.5892858676146072, + "grad_norm": 0.762321949005127, + "learning_rate": 4.124893284927089e-05, + "loss": 0.362211275100708, + "step": 137260 + }, + { + "epoch": 0.5893287997046273, + "grad_norm": 0.074244923889637, + "learning_rate": 4.1244621129153265e-05, + "loss": 0.13443008661270142, + "step": 137270 + }, + { + "epoch": 0.5893717317946472, + "grad_norm": 0.258041650056839, + "learning_rate": 4.124030940903564e-05, + "loss": 0.1949497103691101, + "step": 137280 + }, + { + "epoch": 0.5894146638846672, + "grad_norm": 0.00303852092474699, + "learning_rate": 4.123599768891802e-05, + "loss": 0.056662237644195555, + "step": 137290 + }, + { + "epoch": 0.5894575959746873, + "grad_norm": 0.027162009850144386, + "learning_rate": 4.123168596880039e-05, + "loss": 0.2888782501220703, + "step": 137300 + }, + { + "epoch": 0.5895005280647072, + "grad_norm": 4.259036064147949, + "learning_rate": 4.122737424868277e-05, + "loss": 0.4366584300994873, + "step": 137310 + }, + { + "epoch": 0.5895434601547273, + "grad_norm": 0.08108695596456528, + "learning_rate": 4.1223062528565145e-05, + "loss": 0.02968863248825073, + "step": 137320 + }, + { + "epoch": 0.5895863922447473, + "grad_norm": 1.7827603816986084, + "learning_rate": 4.121875080844753e-05, + "loss": 0.3578782081604004, + "step": 137330 + }, + { + "epoch": 0.5896293243347672, + "grad_norm": 1.2169469594955444, + "learning_rate": 4.12144390883299e-05, + "loss": 0.074959796667099, + "step": 137340 + }, + { + "epoch": 0.5896722564247873, + "grad_norm": 1.509886384010315, + "learning_rate": 4.121012736821228e-05, + "loss": 0.3637458086013794, + "step": 137350 + }, + { + "epoch": 0.5897151885148073, + "grad_norm": 1.6214172840118408, + "learning_rate": 4.1205815648094655e-05, + "loss": 0.22257492542266846, + "step": 137360 + }, + { + "epoch": 0.5897581206048272, + "grad_norm": 0.003348621539771557, + "learning_rate": 4.120150392797703e-05, + "loss": 0.07453447580337524, + "step": 137370 + }, + { + "epoch": 0.5898010526948473, + "grad_norm": 0.012746520340442657, + "learning_rate": 4.11971922078594e-05, + "loss": 0.30892822742462156, + "step": 137380 + }, + { + "epoch": 0.5898439847848673, + "grad_norm": 4.19691276550293, + "learning_rate": 4.119288048774178e-05, + "loss": 0.2979278564453125, + "step": 137390 + }, + { + "epoch": 0.5898869168748873, + "grad_norm": 2.4384191036224365, + "learning_rate": 4.118856876762416e-05, + "loss": 0.21095747947692872, + "step": 137400 + }, + { + "epoch": 0.5899298489649073, + "grad_norm": 0.03696135804057121, + "learning_rate": 4.1184257047506534e-05, + "loss": 0.10608190298080444, + "step": 137410 + }, + { + "epoch": 0.5899727810549273, + "grad_norm": 0.05472942441701889, + "learning_rate": 4.1179945327388905e-05, + "loss": 0.04571995139122009, + "step": 137420 + }, + { + "epoch": 0.5900157131449473, + "grad_norm": 0.10192541033029556, + "learning_rate": 4.117563360727128e-05, + "loss": 0.10677586793899536, + "step": 137430 + }, + { + "epoch": 0.5900586452349673, + "grad_norm": 5.087360858917236, + "learning_rate": 4.1171321887153666e-05, + "loss": 0.3197747230529785, + "step": 137440 + }, + { + "epoch": 0.5901015773249874, + "grad_norm": 0.09760670363903046, + "learning_rate": 4.1167010167036044e-05, + "loss": 0.05229000449180603, + "step": 137450 + }, + { + "epoch": 0.5901445094150073, + "grad_norm": 0.377534419298172, + "learning_rate": 4.1162698446918414e-05, + "loss": 0.12750136852264404, + "step": 137460 + }, + { + "epoch": 0.5901874415050273, + "grad_norm": 0.02546530030667782, + "learning_rate": 4.115838672680079e-05, + "loss": 0.16499568223953248, + "step": 137470 + }, + { + "epoch": 0.5902303735950474, + "grad_norm": 3.4862186908721924, + "learning_rate": 4.115407500668317e-05, + "loss": 0.31398231983184816, + "step": 137480 + }, + { + "epoch": 0.5902733056850673, + "grad_norm": 7.320980072021484, + "learning_rate": 4.1149763286565546e-05, + "loss": 0.25650997161865235, + "step": 137490 + }, + { + "epoch": 0.5903162377750873, + "grad_norm": 15.762680053710938, + "learning_rate": 4.1145451566447924e-05, + "loss": 0.4242871284484863, + "step": 137500 + }, + { + "epoch": 0.5903591698651074, + "grad_norm": 0.008897298946976662, + "learning_rate": 4.1141139846330294e-05, + "loss": 0.11407696008682251, + "step": 137510 + }, + { + "epoch": 0.5904021019551274, + "grad_norm": 33.672706604003906, + "learning_rate": 4.113682812621267e-05, + "loss": 0.3351729869842529, + "step": 137520 + }, + { + "epoch": 0.5904450340451474, + "grad_norm": 8.253597259521484, + "learning_rate": 4.113251640609505e-05, + "loss": 0.5670902252197265, + "step": 137530 + }, + { + "epoch": 0.5904879661351674, + "grad_norm": 0.49867475032806396, + "learning_rate": 4.1128204685977426e-05, + "loss": 0.07423478364944458, + "step": 137540 + }, + { + "epoch": 0.5905308982251875, + "grad_norm": 0.0026952065527439117, + "learning_rate": 4.1123892965859804e-05, + "loss": 0.2668862819671631, + "step": 137550 + }, + { + "epoch": 0.5905738303152074, + "grad_norm": 0.010958666913211346, + "learning_rate": 4.111958124574218e-05, + "loss": 0.07709743976593017, + "step": 137560 + }, + { + "epoch": 0.5906167624052274, + "grad_norm": 0.01955607905983925, + "learning_rate": 4.111526952562456e-05, + "loss": 0.09136215448379517, + "step": 137570 + }, + { + "epoch": 0.5906596944952475, + "grad_norm": 0.005562023725360632, + "learning_rate": 4.1110957805506936e-05, + "loss": 0.1114089846611023, + "step": 137580 + }, + { + "epoch": 0.5907026265852674, + "grad_norm": 0.03999653458595276, + "learning_rate": 4.1106646085389306e-05, + "loss": 0.16414806842803956, + "step": 137590 + }, + { + "epoch": 0.5907455586752874, + "grad_norm": 0.008433864451944828, + "learning_rate": 4.1102334365271684e-05, + "loss": 0.3284775257110596, + "step": 137600 + }, + { + "epoch": 0.5907884907653075, + "grad_norm": 0.011860636994242668, + "learning_rate": 4.109802264515406e-05, + "loss": 0.2962829828262329, + "step": 137610 + }, + { + "epoch": 0.5908314228553274, + "grad_norm": 0.029396725818514824, + "learning_rate": 4.109371092503644e-05, + "loss": 0.20384597778320312, + "step": 137620 + }, + { + "epoch": 0.5908743549453475, + "grad_norm": 1.1890825033187866, + "learning_rate": 4.108939920491881e-05, + "loss": 0.12327685356140136, + "step": 137630 + }, + { + "epoch": 0.5909172870353675, + "grad_norm": 1.923099398612976, + "learning_rate": 4.1085087484801186e-05, + "loss": 0.18159937858581543, + "step": 137640 + }, + { + "epoch": 0.5909602191253874, + "grad_norm": 0.0022021017502993345, + "learning_rate": 4.1080775764683564e-05, + "loss": 0.2301816463470459, + "step": 137650 + }, + { + "epoch": 0.5910031512154075, + "grad_norm": 1.2682898044586182, + "learning_rate": 4.107646404456594e-05, + "loss": 0.2662432909011841, + "step": 137660 + }, + { + "epoch": 0.5910460833054275, + "grad_norm": 0.0558965802192688, + "learning_rate": 4.107215232444832e-05, + "loss": 0.24787378311157227, + "step": 137670 + }, + { + "epoch": 0.5910890153954474, + "grad_norm": 0.0006554294959641993, + "learning_rate": 4.1067840604330696e-05, + "loss": 0.08674585819244385, + "step": 137680 + }, + { + "epoch": 0.5911319474854675, + "grad_norm": 0.0752268061041832, + "learning_rate": 4.106352888421307e-05, + "loss": 0.3500924587249756, + "step": 137690 + }, + { + "epoch": 0.5911748795754875, + "grad_norm": 4.126988410949707, + "learning_rate": 4.105921716409545e-05, + "loss": 0.12794922590255736, + "step": 137700 + }, + { + "epoch": 0.5912178116655075, + "grad_norm": 2.2941765785217285, + "learning_rate": 4.105490544397782e-05, + "loss": 0.14108363389968873, + "step": 137710 + }, + { + "epoch": 0.5912607437555275, + "grad_norm": 0.008788630366325378, + "learning_rate": 4.10505937238602e-05, + "loss": 0.15378959178924562, + "step": 137720 + }, + { + "epoch": 0.5913036758455475, + "grad_norm": 0.009542332962155342, + "learning_rate": 4.1046282003742576e-05, + "loss": 0.16104387044906615, + "step": 137730 + }, + { + "epoch": 0.5913466079355675, + "grad_norm": 1.5980418920516968, + "learning_rate": 4.104197028362495e-05, + "loss": 0.3658533811569214, + "step": 137740 + }, + { + "epoch": 0.5913895400255875, + "grad_norm": 2.4138245582580566, + "learning_rate": 4.1037658563507323e-05, + "loss": 0.2696866035461426, + "step": 137750 + }, + { + "epoch": 0.5914324721156076, + "grad_norm": 0.588241696357727, + "learning_rate": 4.10333468433897e-05, + "loss": 0.21146905422210693, + "step": 137760 + }, + { + "epoch": 0.5914754042056275, + "grad_norm": 0.44645923376083374, + "learning_rate": 4.102903512327208e-05, + "loss": 0.17989760637283325, + "step": 137770 + }, + { + "epoch": 0.5915183362956475, + "grad_norm": 0.5778979063034058, + "learning_rate": 4.1024723403154455e-05, + "loss": 0.15354671478271484, + "step": 137780 + }, + { + "epoch": 0.5915612683856676, + "grad_norm": 0.0002853994374163449, + "learning_rate": 4.102041168303683e-05, + "loss": 0.27571873664855956, + "step": 137790 + }, + { + "epoch": 0.5916042004756875, + "grad_norm": 0.005918090231716633, + "learning_rate": 4.101609996291921e-05, + "loss": 0.2757784128189087, + "step": 137800 + }, + { + "epoch": 0.5916471325657076, + "grad_norm": 0.0009141897899098694, + "learning_rate": 4.101178824280159e-05, + "loss": 0.11418241262435913, + "step": 137810 + }, + { + "epoch": 0.5916900646557276, + "grad_norm": 0.02456584945321083, + "learning_rate": 4.1007476522683965e-05, + "loss": 0.15693914890289307, + "step": 137820 + }, + { + "epoch": 0.5917329967457475, + "grad_norm": 0.002220330759882927, + "learning_rate": 4.1003164802566335e-05, + "loss": 0.252367901802063, + "step": 137830 + }, + { + "epoch": 0.5917759288357676, + "grad_norm": 1.6629785299301147, + "learning_rate": 4.099885308244871e-05, + "loss": 0.17761930227279663, + "step": 137840 + }, + { + "epoch": 0.5918188609257876, + "grad_norm": 1.0520602464675903, + "learning_rate": 4.099454136233109e-05, + "loss": 0.11995333433151245, + "step": 137850 + }, + { + "epoch": 0.5918617930158075, + "grad_norm": 0.0027449713088572025, + "learning_rate": 4.099022964221347e-05, + "loss": 0.3772153854370117, + "step": 137860 + }, + { + "epoch": 0.5919047251058276, + "grad_norm": 0.0017124215373769403, + "learning_rate": 4.0985917922095845e-05, + "loss": 0.16851363182067872, + "step": 137870 + }, + { + "epoch": 0.5919476571958476, + "grad_norm": 0.55049067735672, + "learning_rate": 4.0981606201978215e-05, + "loss": 0.09786216020584107, + "step": 137880 + }, + { + "epoch": 0.5919905892858676, + "grad_norm": 1.6705586910247803, + "learning_rate": 4.097729448186059e-05, + "loss": 0.17155834436416625, + "step": 137890 + }, + { + "epoch": 0.5920335213758876, + "grad_norm": 0.9425643682479858, + "learning_rate": 4.097298276174297e-05, + "loss": 0.032494351267814636, + "step": 137900 + }, + { + "epoch": 0.5920764534659076, + "grad_norm": 3.071489095687866, + "learning_rate": 4.096867104162535e-05, + "loss": 0.09281741380691529, + "step": 137910 + }, + { + "epoch": 0.5921193855559276, + "grad_norm": 2.000361919403076, + "learning_rate": 4.0964359321507725e-05, + "loss": 0.26554825305938723, + "step": 137920 + }, + { + "epoch": 0.5921623176459476, + "grad_norm": 0.0017703615594655275, + "learning_rate": 4.09600476013901e-05, + "loss": 0.08092322945594788, + "step": 137930 + }, + { + "epoch": 0.5922052497359677, + "grad_norm": 0.1381872445344925, + "learning_rate": 4.095573588127248e-05, + "loss": 0.18154406547546387, + "step": 137940 + }, + { + "epoch": 0.5922481818259877, + "grad_norm": 10.791793823242188, + "learning_rate": 4.095142416115486e-05, + "loss": 0.3010018110275269, + "step": 137950 + }, + { + "epoch": 0.5922911139160076, + "grad_norm": 0.0003797747485805303, + "learning_rate": 4.094711244103723e-05, + "loss": 0.09593486785888672, + "step": 137960 + }, + { + "epoch": 0.5923340460060277, + "grad_norm": 0.6908388137817383, + "learning_rate": 4.0942800720919605e-05, + "loss": 0.16052920818328859, + "step": 137970 + }, + { + "epoch": 0.5923769780960477, + "grad_norm": 1.987916350364685, + "learning_rate": 4.093848900080198e-05, + "loss": 0.29268622398376465, + "step": 137980 + }, + { + "epoch": 0.5924199101860677, + "grad_norm": 0.0018756146309897304, + "learning_rate": 4.093417728068436e-05, + "loss": 0.18456127643585205, + "step": 137990 + }, + { + "epoch": 0.5924628422760877, + "grad_norm": 17.71635627746582, + "learning_rate": 4.092986556056673e-05, + "loss": 0.21318068504333496, + "step": 138000 + }, + { + "epoch": 0.5924628422760877, + "eval_loss": 0.3928622305393219, + "eval_runtime": 27.2123, + "eval_samples_per_second": 3.675, + "eval_steps_per_second": 3.675, + "step": 138000 + }, + { + "epoch": 0.5925057743661077, + "grad_norm": 2.7416698932647705, + "learning_rate": 4.092555384044911e-05, + "loss": 0.19002535343170165, + "step": 138010 + }, + { + "epoch": 0.5925487064561277, + "grad_norm": 0.0005497373058460653, + "learning_rate": 4.0921242120331485e-05, + "loss": 0.2710278987884521, + "step": 138020 + }, + { + "epoch": 0.5925916385461477, + "grad_norm": 0.013134874403476715, + "learning_rate": 4.091693040021387e-05, + "loss": 0.3821166753768921, + "step": 138030 + }, + { + "epoch": 0.5926345706361678, + "grad_norm": 0.16501188278198242, + "learning_rate": 4.091261868009624e-05, + "loss": 0.14334306716918946, + "step": 138040 + }, + { + "epoch": 0.5926775027261877, + "grad_norm": 0.011494440026581287, + "learning_rate": 4.090830695997862e-05, + "loss": 0.20109798908233642, + "step": 138050 + }, + { + "epoch": 0.5927204348162077, + "grad_norm": 0.027806028723716736, + "learning_rate": 4.0903995239860994e-05, + "loss": 0.11723495721817016, + "step": 138060 + }, + { + "epoch": 0.5927633669062278, + "grad_norm": 1.9640231132507324, + "learning_rate": 4.089968351974337e-05, + "loss": 0.32733218669891356, + "step": 138070 + }, + { + "epoch": 0.5928062989962477, + "grad_norm": 0.0590534470975399, + "learning_rate": 4.089537179962574e-05, + "loss": 0.034361538290977475, + "step": 138080 + }, + { + "epoch": 0.5928492310862677, + "grad_norm": 0.04286135733127594, + "learning_rate": 4.089106007950812e-05, + "loss": 0.06894341707229615, + "step": 138090 + }, + { + "epoch": 0.5928921631762878, + "grad_norm": 0.04913228377699852, + "learning_rate": 4.0886748359390497e-05, + "loss": 0.04862775206565857, + "step": 138100 + }, + { + "epoch": 0.5929350952663077, + "grad_norm": 0.0012994530843570828, + "learning_rate": 4.0882436639272874e-05, + "loss": 0.311983060836792, + "step": 138110 + }, + { + "epoch": 0.5929780273563278, + "grad_norm": 0.03957013785839081, + "learning_rate": 4.0878124919155244e-05, + "loss": 0.390786337852478, + "step": 138120 + }, + { + "epoch": 0.5930209594463478, + "grad_norm": 0.04898397624492645, + "learning_rate": 4.087381319903762e-05, + "loss": 0.21631765365600586, + "step": 138130 + }, + { + "epoch": 0.5930638915363677, + "grad_norm": 5.083647727966309, + "learning_rate": 4.0869501478920006e-05, + "loss": 0.2936803102493286, + "step": 138140 + }, + { + "epoch": 0.5931068236263878, + "grad_norm": 2.1491951942443848, + "learning_rate": 4.086518975880238e-05, + "loss": 0.14799619913101197, + "step": 138150 + }, + { + "epoch": 0.5931497557164078, + "grad_norm": 3.132175922393799, + "learning_rate": 4.0860878038684754e-05, + "loss": 0.2657571077346802, + "step": 138160 + }, + { + "epoch": 0.5931926878064278, + "grad_norm": 0.015707481652498245, + "learning_rate": 4.085656631856713e-05, + "loss": 0.12225933074951172, + "step": 138170 + }, + { + "epoch": 0.5932356198964478, + "grad_norm": 1.2788748741149902, + "learning_rate": 4.085225459844951e-05, + "loss": 0.1507628083229065, + "step": 138180 + }, + { + "epoch": 0.5932785519864678, + "grad_norm": 0.042883019894361496, + "learning_rate": 4.0847942878331886e-05, + "loss": 0.26679019927978515, + "step": 138190 + }, + { + "epoch": 0.5933214840764878, + "grad_norm": 0.10420157760381699, + "learning_rate": 4.0843631158214256e-05, + "loss": 0.07498223185539246, + "step": 138200 + }, + { + "epoch": 0.5933644161665078, + "grad_norm": 23.004911422729492, + "learning_rate": 4.0839319438096634e-05, + "loss": 0.3628678560256958, + "step": 138210 + }, + { + "epoch": 0.5934073482565279, + "grad_norm": 0.019570820033550262, + "learning_rate": 4.083500771797901e-05, + "loss": 0.14614968299865722, + "step": 138220 + }, + { + "epoch": 0.5934502803465478, + "grad_norm": 0.01614968292415142, + "learning_rate": 4.083069599786139e-05, + "loss": 0.1902191162109375, + "step": 138230 + }, + { + "epoch": 0.5934932124365678, + "grad_norm": 0.009563574567437172, + "learning_rate": 4.0826384277743766e-05, + "loss": 0.1708337903022766, + "step": 138240 + }, + { + "epoch": 0.5935361445265879, + "grad_norm": 0.003760694758966565, + "learning_rate": 4.082207255762614e-05, + "loss": 0.03917034566402435, + "step": 138250 + }, + { + "epoch": 0.5935790766166078, + "grad_norm": 0.029983574524521828, + "learning_rate": 4.081776083750852e-05, + "loss": 0.15364230871200563, + "step": 138260 + }, + { + "epoch": 0.5936220087066278, + "grad_norm": 0.0570920892059803, + "learning_rate": 4.08134491173909e-05, + "loss": 0.043169844150543216, + "step": 138270 + }, + { + "epoch": 0.5936649407966479, + "grad_norm": 0.0028020674362778664, + "learning_rate": 4.0809137397273275e-05, + "loss": 0.3166723966598511, + "step": 138280 + }, + { + "epoch": 0.5937078728866678, + "grad_norm": 0.026932211592793465, + "learning_rate": 4.0804825677155646e-05, + "loss": 0.24892525672912597, + "step": 138290 + }, + { + "epoch": 0.5937508049766879, + "grad_norm": 0.009527665562927723, + "learning_rate": 4.080051395703802e-05, + "loss": 0.11965830326080322, + "step": 138300 + }, + { + "epoch": 0.5937937370667079, + "grad_norm": 1.14619779586792, + "learning_rate": 4.07962022369204e-05, + "loss": 0.17031424045562743, + "step": 138310 + }, + { + "epoch": 0.5938366691567278, + "grad_norm": 0.0052214134484529495, + "learning_rate": 4.079189051680278e-05, + "loss": 0.2664251089096069, + "step": 138320 + }, + { + "epoch": 0.5938796012467479, + "grad_norm": 2.2635042667388916, + "learning_rate": 4.078757879668515e-05, + "loss": 0.1969763994216919, + "step": 138330 + }, + { + "epoch": 0.5939225333367679, + "grad_norm": 0.00652336934581399, + "learning_rate": 4.0783267076567526e-05, + "loss": 0.4026478290557861, + "step": 138340 + }, + { + "epoch": 0.5939654654267879, + "grad_norm": 1.975538730621338, + "learning_rate": 4.07789553564499e-05, + "loss": 0.4983255863189697, + "step": 138350 + }, + { + "epoch": 0.5940083975168079, + "grad_norm": 0.02280971221625805, + "learning_rate": 4.077464363633228e-05, + "loss": 0.27170419692993164, + "step": 138360 + }, + { + "epoch": 0.5940513296068279, + "grad_norm": 4.479470729827881, + "learning_rate": 4.077033191621466e-05, + "loss": 0.22238686084747314, + "step": 138370 + }, + { + "epoch": 0.594094261696848, + "grad_norm": 0.004820001777261496, + "learning_rate": 4.0766020196097035e-05, + "loss": 0.2552459478378296, + "step": 138380 + }, + { + "epoch": 0.5941371937868679, + "grad_norm": 0.20357632637023926, + "learning_rate": 4.076170847597941e-05, + "loss": 0.3800384759902954, + "step": 138390 + }, + { + "epoch": 0.594180125876888, + "grad_norm": 1.3113895654678345, + "learning_rate": 4.075739675586179e-05, + "loss": 0.19150335788726808, + "step": 138400 + }, + { + "epoch": 0.594223057966908, + "grad_norm": 0.018385406583547592, + "learning_rate": 4.075308503574416e-05, + "loss": 0.19509342908859253, + "step": 138410 + }, + { + "epoch": 0.5942659900569279, + "grad_norm": 5.595469951629639, + "learning_rate": 4.074877331562654e-05, + "loss": 0.1117366075515747, + "step": 138420 + }, + { + "epoch": 0.594308922146948, + "grad_norm": 0.20212681591510773, + "learning_rate": 4.0744461595508915e-05, + "loss": 0.3475062370300293, + "step": 138430 + }, + { + "epoch": 0.594351854236968, + "grad_norm": 0.028907861560583115, + "learning_rate": 4.074014987539129e-05, + "loss": 0.12352882623672486, + "step": 138440 + }, + { + "epoch": 0.594394786326988, + "grad_norm": 0.03655627369880676, + "learning_rate": 4.073583815527366e-05, + "loss": 0.29398033618927, + "step": 138450 + }, + { + "epoch": 0.594437718417008, + "grad_norm": 3.132746696472168, + "learning_rate": 4.073152643515604e-05, + "loss": 0.36105301380157473, + "step": 138460 + }, + { + "epoch": 0.594480650507028, + "grad_norm": 0.06194941699504852, + "learning_rate": 4.072721471503842e-05, + "loss": 0.23717799186706542, + "step": 138470 + }, + { + "epoch": 0.594523582597048, + "grad_norm": 0.014329830184578896, + "learning_rate": 4.0722902994920795e-05, + "loss": 0.16158952713012695, + "step": 138480 + }, + { + "epoch": 0.594566514687068, + "grad_norm": 2.4566140174865723, + "learning_rate": 4.071859127480317e-05, + "loss": 0.13322386741638184, + "step": 138490 + }, + { + "epoch": 0.594609446777088, + "grad_norm": 0.051132217049598694, + "learning_rate": 4.071427955468555e-05, + "loss": 0.15299041271209718, + "step": 138500 + }, + { + "epoch": 0.594652378867108, + "grad_norm": 0.057655222713947296, + "learning_rate": 4.070996783456793e-05, + "loss": 0.2507340669631958, + "step": 138510 + }, + { + "epoch": 0.594695310957128, + "grad_norm": 1.7659785747528076, + "learning_rate": 4.0705656114450304e-05, + "loss": 0.2532395839691162, + "step": 138520 + }, + { + "epoch": 0.5947382430471481, + "grad_norm": 0.2619655728340149, + "learning_rate": 4.0701344394332675e-05, + "loss": 0.025167009234428404, + "step": 138530 + }, + { + "epoch": 0.594781175137168, + "grad_norm": 0.259661465883255, + "learning_rate": 4.069703267421505e-05, + "loss": 0.14469207525253297, + "step": 138540 + }, + { + "epoch": 0.594824107227188, + "grad_norm": 0.0074020447209477425, + "learning_rate": 4.069272095409743e-05, + "loss": 0.1733548879623413, + "step": 138550 + }, + { + "epoch": 0.5948670393172081, + "grad_norm": 0.006281828507781029, + "learning_rate": 4.068840923397981e-05, + "loss": 0.30566720962524413, + "step": 138560 + }, + { + "epoch": 0.594909971407228, + "grad_norm": 3.7366669178009033, + "learning_rate": 4.068409751386218e-05, + "loss": 0.1994355797767639, + "step": 138570 + }, + { + "epoch": 0.594952903497248, + "grad_norm": 0.050392650067806244, + "learning_rate": 4.0679785793744555e-05, + "loss": 0.27683262825012206, + "step": 138580 + }, + { + "epoch": 0.5949958355872681, + "grad_norm": 0.030940018594264984, + "learning_rate": 4.067547407362693e-05, + "loss": 0.10335183143615723, + "step": 138590 + }, + { + "epoch": 0.595038767677288, + "grad_norm": 0.08072816580533981, + "learning_rate": 4.067116235350931e-05, + "loss": 0.17044700384140016, + "step": 138600 + }, + { + "epoch": 0.5950816997673081, + "grad_norm": 0.10144093632698059, + "learning_rate": 4.066685063339169e-05, + "loss": 0.18469510078430176, + "step": 138610 + }, + { + "epoch": 0.5951246318573281, + "grad_norm": 0.40955373644828796, + "learning_rate": 4.0662538913274064e-05, + "loss": 0.33158886432647705, + "step": 138620 + }, + { + "epoch": 0.595167563947348, + "grad_norm": 0.8902355432510376, + "learning_rate": 4.065822719315644e-05, + "loss": 0.1779848337173462, + "step": 138630 + }, + { + "epoch": 0.5952104960373681, + "grad_norm": 3.3796346187591553, + "learning_rate": 4.065391547303882e-05, + "loss": 0.16189804077148437, + "step": 138640 + }, + { + "epoch": 0.5952534281273881, + "grad_norm": 0.0013126464327797294, + "learning_rate": 4.0649603752921196e-05, + "loss": 0.1056704044342041, + "step": 138650 + }, + { + "epoch": 0.5952963602174081, + "grad_norm": 2.0929417610168457, + "learning_rate": 4.064529203280357e-05, + "loss": 0.12593846321105956, + "step": 138660 + }, + { + "epoch": 0.5953392923074281, + "grad_norm": 0.014701430685818195, + "learning_rate": 4.0640980312685944e-05, + "loss": 0.10860245227813721, + "step": 138670 + }, + { + "epoch": 0.5953822243974481, + "grad_norm": 0.00810084119439125, + "learning_rate": 4.063666859256832e-05, + "loss": 0.01611505150794983, + "step": 138680 + }, + { + "epoch": 0.5954251564874681, + "grad_norm": 0.4513009190559387, + "learning_rate": 4.06323568724507e-05, + "loss": 0.1624898910522461, + "step": 138690 + }, + { + "epoch": 0.5954680885774881, + "grad_norm": 0.0283687524497509, + "learning_rate": 4.062804515233307e-05, + "loss": 0.2079389810562134, + "step": 138700 + }, + { + "epoch": 0.5955110206675082, + "grad_norm": 1.9030858278274536, + "learning_rate": 4.062373343221545e-05, + "loss": 0.03300126194953919, + "step": 138710 + }, + { + "epoch": 0.5955539527575281, + "grad_norm": 0.001472063479013741, + "learning_rate": 4.0619421712097824e-05, + "loss": 0.26835658550262453, + "step": 138720 + }, + { + "epoch": 0.5955968848475481, + "grad_norm": 0.013345958665013313, + "learning_rate": 4.06151099919802e-05, + "loss": 0.24954426288604736, + "step": 138730 + }, + { + "epoch": 0.5956398169375682, + "grad_norm": 0.07247161865234375, + "learning_rate": 4.061079827186258e-05, + "loss": 0.13264119625091553, + "step": 138740 + }, + { + "epoch": 0.5956827490275881, + "grad_norm": 0.723473072052002, + "learning_rate": 4.0606486551744956e-05, + "loss": 0.22525534629821778, + "step": 138750 + }, + { + "epoch": 0.5957256811176082, + "grad_norm": 1.1872649192810059, + "learning_rate": 4.0602174831627333e-05, + "loss": 0.3446241617202759, + "step": 138760 + }, + { + "epoch": 0.5957686132076282, + "grad_norm": 0.7071019411087036, + "learning_rate": 4.059786311150971e-05, + "loss": 0.2649399995803833, + "step": 138770 + }, + { + "epoch": 0.5958115452976481, + "grad_norm": 0.024992501363158226, + "learning_rate": 4.059355139139208e-05, + "loss": 0.18880668878555298, + "step": 138780 + }, + { + "epoch": 0.5958544773876682, + "grad_norm": 3.2234370708465576, + "learning_rate": 4.058923967127446e-05, + "loss": 0.1728742837905884, + "step": 138790 + }, + { + "epoch": 0.5958974094776882, + "grad_norm": 13.19248104095459, + "learning_rate": 4.0584927951156836e-05, + "loss": 0.24392056465148926, + "step": 138800 + }, + { + "epoch": 0.5959403415677083, + "grad_norm": 2.747596025466919, + "learning_rate": 4.058061623103921e-05, + "loss": 0.28858864307403564, + "step": 138810 + }, + { + "epoch": 0.5959832736577282, + "grad_norm": 0.17838746309280396, + "learning_rate": 4.0576304510921584e-05, + "loss": 0.27601745128631594, + "step": 138820 + }, + { + "epoch": 0.5960262057477482, + "grad_norm": 0.02302752248942852, + "learning_rate": 4.057199279080396e-05, + "loss": 0.2467722177505493, + "step": 138830 + }, + { + "epoch": 0.5960691378377683, + "grad_norm": 0.04111037775874138, + "learning_rate": 4.056768107068634e-05, + "loss": 0.32037968635559083, + "step": 138840 + }, + { + "epoch": 0.5961120699277882, + "grad_norm": 0.021072717383503914, + "learning_rate": 4.056336935056872e-05, + "loss": 0.28800349235534667, + "step": 138850 + }, + { + "epoch": 0.5961550020178082, + "grad_norm": 5.909653663635254, + "learning_rate": 4.055905763045109e-05, + "loss": 0.3441181659698486, + "step": 138860 + }, + { + "epoch": 0.5961979341078283, + "grad_norm": 9.896764755249023, + "learning_rate": 4.055474591033347e-05, + "loss": 0.1869523286819458, + "step": 138870 + }, + { + "epoch": 0.5962408661978482, + "grad_norm": 8.402483940124512, + "learning_rate": 4.055043419021585e-05, + "loss": 0.2832054138183594, + "step": 138880 + }, + { + "epoch": 0.5962837982878683, + "grad_norm": 0.009975524619221687, + "learning_rate": 4.0546122470098225e-05, + "loss": 0.15514372587203978, + "step": 138890 + }, + { + "epoch": 0.5963267303778883, + "grad_norm": 0.10215646028518677, + "learning_rate": 4.0541810749980596e-05, + "loss": 0.3102398872375488, + "step": 138900 + }, + { + "epoch": 0.5963696624679082, + "grad_norm": 0.17041859030723572, + "learning_rate": 4.053749902986297e-05, + "loss": 0.27873940467834474, + "step": 138910 + }, + { + "epoch": 0.5964125945579283, + "grad_norm": 2.4230082035064697, + "learning_rate": 4.053318730974535e-05, + "loss": 0.21358671188354492, + "step": 138920 + }, + { + "epoch": 0.5964555266479483, + "grad_norm": 0.0803167000412941, + "learning_rate": 4.052887558962773e-05, + "loss": 0.07980765104293823, + "step": 138930 + }, + { + "epoch": 0.5964984587379683, + "grad_norm": 23.38479995727539, + "learning_rate": 4.05245638695101e-05, + "loss": 0.06409929990768433, + "step": 138940 + }, + { + "epoch": 0.5965413908279883, + "grad_norm": 0.002030240371823311, + "learning_rate": 4.0520252149392476e-05, + "loss": 0.17009581327438356, + "step": 138950 + }, + { + "epoch": 0.5965843229180083, + "grad_norm": 0.023227572441101074, + "learning_rate": 4.051594042927486e-05, + "loss": 0.34738569259643554, + "step": 138960 + }, + { + "epoch": 0.5966272550080283, + "grad_norm": 1.8141781091690063, + "learning_rate": 4.051162870915724e-05, + "loss": 0.24157049655914306, + "step": 138970 + }, + { + "epoch": 0.5966701870980483, + "grad_norm": 1.13673734664917, + "learning_rate": 4.0507316989039615e-05, + "loss": 0.13472883701324462, + "step": 138980 + }, + { + "epoch": 0.5967131191880684, + "grad_norm": 0.0068895393051207066, + "learning_rate": 4.0503005268921985e-05, + "loss": 0.24904513359069824, + "step": 138990 + }, + { + "epoch": 0.5967560512780883, + "grad_norm": 0.01904173754155636, + "learning_rate": 4.049869354880436e-05, + "loss": 0.3177989721298218, + "step": 139000 + }, + { + "epoch": 0.5967560512780883, + "eval_loss": 0.4010623097419739, + "eval_runtime": 27.1679, + "eval_samples_per_second": 3.681, + "eval_steps_per_second": 3.681, + "step": 139000 + }, + { + "epoch": 0.5967989833681083, + "grad_norm": 0.02827509678900242, + "learning_rate": 4.049438182868674e-05, + "loss": 0.1803138494491577, + "step": 139010 + }, + { + "epoch": 0.5968419154581284, + "grad_norm": 0.09278185665607452, + "learning_rate": 4.049007010856912e-05, + "loss": 0.08599947690963745, + "step": 139020 + }, + { + "epoch": 0.5968848475481483, + "grad_norm": 2.0457799434661865, + "learning_rate": 4.048575838845149e-05, + "loss": 0.1696843385696411, + "step": 139030 + }, + { + "epoch": 0.5969277796381683, + "grad_norm": 0.28158533573150635, + "learning_rate": 4.0481446668333865e-05, + "loss": 0.31057536602020264, + "step": 139040 + }, + { + "epoch": 0.5969707117281884, + "grad_norm": 1.8282661437988281, + "learning_rate": 4.047713494821624e-05, + "loss": 0.20569372177124023, + "step": 139050 + }, + { + "epoch": 0.5970136438182083, + "grad_norm": 0.027583178132772446, + "learning_rate": 4.047282322809862e-05, + "loss": 0.32853972911834717, + "step": 139060 + }, + { + "epoch": 0.5970565759082284, + "grad_norm": 0.039123062044382095, + "learning_rate": 4.0468511507981e-05, + "loss": 0.14497545957565308, + "step": 139070 + }, + { + "epoch": 0.5970995079982484, + "grad_norm": 0.0013616773067042232, + "learning_rate": 4.0464199787863375e-05, + "loss": 0.2893491268157959, + "step": 139080 + }, + { + "epoch": 0.5971424400882683, + "grad_norm": 0.20029796659946442, + "learning_rate": 4.045988806774575e-05, + "loss": 0.18861448764801025, + "step": 139090 + }, + { + "epoch": 0.5971853721782884, + "grad_norm": 0.01592946983873844, + "learning_rate": 4.045557634762813e-05, + "loss": 0.13244056701660156, + "step": 139100 + }, + { + "epoch": 0.5972283042683084, + "grad_norm": 4.180685520172119, + "learning_rate": 4.04512646275105e-05, + "loss": 0.4059516429901123, + "step": 139110 + }, + { + "epoch": 0.5972712363583284, + "grad_norm": 1.714571475982666, + "learning_rate": 4.044695290739288e-05, + "loss": 0.38700339794158933, + "step": 139120 + }, + { + "epoch": 0.5973141684483484, + "grad_norm": 8.81942081451416, + "learning_rate": 4.0442641187275254e-05, + "loss": 0.07162479162216187, + "step": 139130 + }, + { + "epoch": 0.5973571005383684, + "grad_norm": 4.870468616485596, + "learning_rate": 4.043832946715763e-05, + "loss": 0.0521313488483429, + "step": 139140 + }, + { + "epoch": 0.5974000326283884, + "grad_norm": 0.16712331771850586, + "learning_rate": 4.043401774704e-05, + "loss": 0.24226417541503906, + "step": 139150 + }, + { + "epoch": 0.5974429647184084, + "grad_norm": 3.7403082847595215, + "learning_rate": 4.042970602692238e-05, + "loss": 0.2585084676742554, + "step": 139160 + }, + { + "epoch": 0.5974858968084285, + "grad_norm": 1.1525099277496338, + "learning_rate": 4.042539430680476e-05, + "loss": 0.2858480453491211, + "step": 139170 + }, + { + "epoch": 0.5975288288984484, + "grad_norm": 0.0030668508261442184, + "learning_rate": 4.0421082586687134e-05, + "loss": 0.08854332566261292, + "step": 139180 + }, + { + "epoch": 0.5975717609884684, + "grad_norm": 8.544142723083496, + "learning_rate": 4.041677086656951e-05, + "loss": 0.20749473571777344, + "step": 139190 + }, + { + "epoch": 0.5976146930784885, + "grad_norm": 0.05123743787407875, + "learning_rate": 4.041245914645189e-05, + "loss": 0.28833374977111814, + "step": 139200 + }, + { + "epoch": 0.5976576251685084, + "grad_norm": 0.4078068137168884, + "learning_rate": 4.0408147426334266e-05, + "loss": 0.15883285999298097, + "step": 139210 + }, + { + "epoch": 0.5977005572585284, + "grad_norm": 0.5867086052894592, + "learning_rate": 4.0403835706216644e-05, + "loss": 0.0185800701379776, + "step": 139220 + }, + { + "epoch": 0.5977434893485485, + "grad_norm": 0.020372772589325905, + "learning_rate": 4.0399523986099014e-05, + "loss": 0.14691412448883057, + "step": 139230 + }, + { + "epoch": 0.5977864214385685, + "grad_norm": 2.4413959980010986, + "learning_rate": 4.039521226598139e-05, + "loss": 0.04680218100547791, + "step": 139240 + }, + { + "epoch": 0.5978293535285885, + "grad_norm": 0.1127878874540329, + "learning_rate": 4.039090054586377e-05, + "loss": 0.24358739852905273, + "step": 139250 + }, + { + "epoch": 0.5978722856186085, + "grad_norm": 0.0011141104623675346, + "learning_rate": 4.0386588825746146e-05, + "loss": 0.11816627979278564, + "step": 139260 + }, + { + "epoch": 0.5979152177086285, + "grad_norm": 0.07920151203870773, + "learning_rate": 4.038227710562852e-05, + "loss": 0.19519985914230348, + "step": 139270 + }, + { + "epoch": 0.5979581497986485, + "grad_norm": 3.432372808456421, + "learning_rate": 4.0377965385510894e-05, + "loss": 0.16987917423248292, + "step": 139280 + }, + { + "epoch": 0.5980010818886685, + "grad_norm": 0.006460071075707674, + "learning_rate": 4.037365366539327e-05, + "loss": 0.1509072184562683, + "step": 139290 + }, + { + "epoch": 0.5980440139786886, + "grad_norm": 10.475028991699219, + "learning_rate": 4.036934194527565e-05, + "loss": 0.16783927679061889, + "step": 139300 + }, + { + "epoch": 0.5980869460687085, + "grad_norm": 0.22108608484268188, + "learning_rate": 4.0365030225158026e-05, + "loss": 0.22498059272766113, + "step": 139310 + }, + { + "epoch": 0.5981298781587285, + "grad_norm": 1.1432753801345825, + "learning_rate": 4.0360718505040404e-05, + "loss": 0.2228973388671875, + "step": 139320 + }, + { + "epoch": 0.5981728102487486, + "grad_norm": 1.3360220193862915, + "learning_rate": 4.035640678492278e-05, + "loss": 0.1929041028022766, + "step": 139330 + }, + { + "epoch": 0.5982157423387685, + "grad_norm": 0.007678360678255558, + "learning_rate": 4.035209506480516e-05, + "loss": 0.11292246580123902, + "step": 139340 + }, + { + "epoch": 0.5982586744287886, + "grad_norm": 1.5282012224197388, + "learning_rate": 4.0347783344687536e-05, + "loss": 0.3063904523849487, + "step": 139350 + }, + { + "epoch": 0.5983016065188086, + "grad_norm": 0.27093306183815, + "learning_rate": 4.0343471624569906e-05, + "loss": 0.13120408058166505, + "step": 139360 + }, + { + "epoch": 0.5983445386088285, + "grad_norm": 1.001611351966858, + "learning_rate": 4.0339159904452284e-05, + "loss": 0.11028392314910888, + "step": 139370 + }, + { + "epoch": 0.5983874706988486, + "grad_norm": 0.1968022882938385, + "learning_rate": 4.033484818433466e-05, + "loss": 0.35245230197906496, + "step": 139380 + }, + { + "epoch": 0.5984304027888686, + "grad_norm": 0.031681448221206665, + "learning_rate": 4.033053646421704e-05, + "loss": 0.22298216819763184, + "step": 139390 + }, + { + "epoch": 0.5984733348788885, + "grad_norm": 1.4180032014846802, + "learning_rate": 4.032622474409941e-05, + "loss": 0.31283843517303467, + "step": 139400 + }, + { + "epoch": 0.5985162669689086, + "grad_norm": 0.010278967209160328, + "learning_rate": 4.0321913023981786e-05, + "loss": 0.31977884769439696, + "step": 139410 + }, + { + "epoch": 0.5985591990589286, + "grad_norm": 0.08360231667757034, + "learning_rate": 4.0317601303864164e-05, + "loss": 0.2159090518951416, + "step": 139420 + }, + { + "epoch": 0.5986021311489486, + "grad_norm": 0.27361536026000977, + "learning_rate": 4.031328958374654e-05, + "loss": 0.1017007827758789, + "step": 139430 + }, + { + "epoch": 0.5986450632389686, + "grad_norm": 0.026083212345838547, + "learning_rate": 4.030897786362892e-05, + "loss": 0.2089712142944336, + "step": 139440 + }, + { + "epoch": 0.5986879953289886, + "grad_norm": 1.9075127840042114, + "learning_rate": 4.0304666143511296e-05, + "loss": 0.2128284215927124, + "step": 139450 + }, + { + "epoch": 0.5987309274190086, + "grad_norm": 0.016359496861696243, + "learning_rate": 4.030035442339367e-05, + "loss": 0.09640793800354004, + "step": 139460 + }, + { + "epoch": 0.5987738595090286, + "grad_norm": 0.010604800656437874, + "learning_rate": 4.029604270327605e-05, + "loss": 0.3107742786407471, + "step": 139470 + }, + { + "epoch": 0.5988167915990487, + "grad_norm": 4.734330177307129, + "learning_rate": 4.029173098315842e-05, + "loss": 0.3754775047302246, + "step": 139480 + }, + { + "epoch": 0.5988597236890686, + "grad_norm": 4.360969066619873, + "learning_rate": 4.02874192630408e-05, + "loss": 0.45675835609436033, + "step": 139490 + }, + { + "epoch": 0.5989026557790886, + "grad_norm": 0.01934775337576866, + "learning_rate": 4.0283107542923176e-05, + "loss": 0.2801548957824707, + "step": 139500 + }, + { + "epoch": 0.5989455878691087, + "grad_norm": 1.2892874479293823, + "learning_rate": 4.027879582280555e-05, + "loss": 0.11840264797210694, + "step": 139510 + }, + { + "epoch": 0.5989885199591286, + "grad_norm": 0.0935051441192627, + "learning_rate": 4.0274484102687923e-05, + "loss": 0.11873539686203002, + "step": 139520 + }, + { + "epoch": 0.5990314520491486, + "grad_norm": 0.010609208606183529, + "learning_rate": 4.02701723825703e-05, + "loss": 0.12016826868057251, + "step": 139530 + }, + { + "epoch": 0.5990743841391687, + "grad_norm": 0.0024884147569537163, + "learning_rate": 4.026586066245268e-05, + "loss": 0.13507968187332153, + "step": 139540 + }, + { + "epoch": 0.5991173162291886, + "grad_norm": 0.003081399481743574, + "learning_rate": 4.026154894233506e-05, + "loss": 0.07944802641868591, + "step": 139550 + }, + { + "epoch": 0.5991602483192087, + "grad_norm": 1.4031720161437988, + "learning_rate": 4.025723722221743e-05, + "loss": 0.28442862033843996, + "step": 139560 + }, + { + "epoch": 0.5992031804092287, + "grad_norm": 0.6752078533172607, + "learning_rate": 4.025292550209981e-05, + "loss": 0.26588103771209715, + "step": 139570 + }, + { + "epoch": 0.5992461124992486, + "grad_norm": 0.8555001616477966, + "learning_rate": 4.024861378198219e-05, + "loss": 0.11324497461318969, + "step": 139580 + }, + { + "epoch": 0.5992890445892687, + "grad_norm": 0.08063609153032303, + "learning_rate": 4.0244302061864565e-05, + "loss": 0.20769956111907958, + "step": 139590 + }, + { + "epoch": 0.5993319766792887, + "grad_norm": 0.008143313229084015, + "learning_rate": 4.0239990341746935e-05, + "loss": 0.07645671367645264, + "step": 139600 + }, + { + "epoch": 0.5993749087693087, + "grad_norm": 0.04053580388426781, + "learning_rate": 4.023567862162931e-05, + "loss": 0.13342757225036622, + "step": 139610 + }, + { + "epoch": 0.5994178408593287, + "grad_norm": 2.0824387073516846, + "learning_rate": 4.023136690151169e-05, + "loss": 0.24516801834106444, + "step": 139620 + }, + { + "epoch": 0.5994607729493487, + "grad_norm": 0.0006679339567199349, + "learning_rate": 4.022705518139407e-05, + "loss": 0.1988581895828247, + "step": 139630 + }, + { + "epoch": 0.5995037050393687, + "grad_norm": 0.024627642706036568, + "learning_rate": 4.022274346127644e-05, + "loss": 0.2887857437133789, + "step": 139640 + }, + { + "epoch": 0.5995466371293887, + "grad_norm": 0.0028000574093312025, + "learning_rate": 4.0218431741158815e-05, + "loss": 0.30062689781188967, + "step": 139650 + }, + { + "epoch": 0.5995895692194088, + "grad_norm": 0.7627494931221008, + "learning_rate": 4.02141200210412e-05, + "loss": 0.425076961517334, + "step": 139660 + }, + { + "epoch": 0.5996325013094288, + "grad_norm": 0.004241509363055229, + "learning_rate": 4.020980830092358e-05, + "loss": 0.044894880056381224, + "step": 139670 + }, + { + "epoch": 0.5996754333994487, + "grad_norm": 0.020708652213215828, + "learning_rate": 4.0205496580805954e-05, + "loss": 0.2606109619140625, + "step": 139680 + }, + { + "epoch": 0.5997183654894688, + "grad_norm": 3.3840079307556152, + "learning_rate": 4.0201184860688325e-05, + "loss": 0.28748269081115724, + "step": 139690 + }, + { + "epoch": 0.5997612975794888, + "grad_norm": 0.025514988228678703, + "learning_rate": 4.01968731405707e-05, + "loss": 0.1521053433418274, + "step": 139700 + }, + { + "epoch": 0.5998042296695087, + "grad_norm": 0.3507746458053589, + "learning_rate": 4.019256142045308e-05, + "loss": 0.183748197555542, + "step": 139710 + }, + { + "epoch": 0.5998471617595288, + "grad_norm": 1.4777569770812988, + "learning_rate": 4.018824970033546e-05, + "loss": 0.20436084270477295, + "step": 139720 + }, + { + "epoch": 0.5998900938495488, + "grad_norm": 2.184016227722168, + "learning_rate": 4.018393798021783e-05, + "loss": 0.32288227081298826, + "step": 139730 + }, + { + "epoch": 0.5999330259395688, + "grad_norm": 3.9362363815307617, + "learning_rate": 4.0179626260100205e-05, + "loss": 0.18843557834625244, + "step": 139740 + }, + { + "epoch": 0.5999759580295888, + "grad_norm": 2.1692709922790527, + "learning_rate": 4.017531453998258e-05, + "loss": 0.41624040603637696, + "step": 139750 + }, + { + "epoch": 0.6000188901196088, + "grad_norm": 0.14851386845111847, + "learning_rate": 4.017100281986496e-05, + "loss": 0.21869902610778807, + "step": 139760 + }, + { + "epoch": 0.6000618222096288, + "grad_norm": 0.011462419293820858, + "learning_rate": 4.016669109974734e-05, + "loss": 0.14370282888412475, + "step": 139770 + }, + { + "epoch": 0.6001047542996488, + "grad_norm": 0.0862903892993927, + "learning_rate": 4.0162379379629714e-05, + "loss": 0.2030428409576416, + "step": 139780 + }, + { + "epoch": 0.6001476863896689, + "grad_norm": 0.006694111507385969, + "learning_rate": 4.015806765951209e-05, + "loss": 0.10970304012298585, + "step": 139790 + }, + { + "epoch": 0.6001906184796888, + "grad_norm": 0.04753762483596802, + "learning_rate": 4.015375593939447e-05, + "loss": 0.20441656112670897, + "step": 139800 + }, + { + "epoch": 0.6002335505697088, + "grad_norm": 0.1337500959634781, + "learning_rate": 4.014944421927684e-05, + "loss": 0.18382371664047242, + "step": 139810 + }, + { + "epoch": 0.6002764826597289, + "grad_norm": 8.118321418762207, + "learning_rate": 4.0145132499159217e-05, + "loss": 0.4093769073486328, + "step": 139820 + }, + { + "epoch": 0.6003194147497488, + "grad_norm": 0.02916407212615013, + "learning_rate": 4.0140820779041594e-05, + "loss": 0.24807960987091066, + "step": 139830 + }, + { + "epoch": 0.6003623468397689, + "grad_norm": 0.6688809990882874, + "learning_rate": 4.013650905892397e-05, + "loss": 0.1378745436668396, + "step": 139840 + }, + { + "epoch": 0.6004052789297889, + "grad_norm": 0.007759707979857922, + "learning_rate": 4.013219733880634e-05, + "loss": 0.17140055894851686, + "step": 139850 + }, + { + "epoch": 0.6004482110198088, + "grad_norm": 0.008196969516575336, + "learning_rate": 4.012788561868872e-05, + "loss": 0.29869420528411866, + "step": 139860 + }, + { + "epoch": 0.6004911431098289, + "grad_norm": 0.00213833199813962, + "learning_rate": 4.0123573898571097e-05, + "loss": 0.2550304889678955, + "step": 139870 + }, + { + "epoch": 0.6005340751998489, + "grad_norm": 1.2392281293869019, + "learning_rate": 4.0119262178453474e-05, + "loss": 0.31737627983093264, + "step": 139880 + }, + { + "epoch": 0.6005770072898688, + "grad_norm": 1.2636436223983765, + "learning_rate": 4.011495045833585e-05, + "loss": 0.3847317695617676, + "step": 139890 + }, + { + "epoch": 0.6006199393798889, + "grad_norm": 0.0047286092303693295, + "learning_rate": 4.011063873821823e-05, + "loss": 0.17933709621429444, + "step": 139900 + }, + { + "epoch": 0.6006628714699089, + "grad_norm": 0.007108923979103565, + "learning_rate": 4.0106327018100606e-05, + "loss": 0.16458051204681395, + "step": 139910 + }, + { + "epoch": 0.6007058035599289, + "grad_norm": 0.6757831573486328, + "learning_rate": 4.010201529798298e-05, + "loss": 0.2534809589385986, + "step": 139920 + }, + { + "epoch": 0.6007487356499489, + "grad_norm": 0.1733238697052002, + "learning_rate": 4.0097703577865354e-05, + "loss": 0.24300312995910645, + "step": 139930 + }, + { + "epoch": 0.600791667739969, + "grad_norm": 0.0027367551811039448, + "learning_rate": 4.009339185774773e-05, + "loss": 0.14856563806533812, + "step": 139940 + }, + { + "epoch": 0.6008345998299889, + "grad_norm": 0.24011345207691193, + "learning_rate": 4.008908013763011e-05, + "loss": 0.057327890396118165, + "step": 139950 + }, + { + "epoch": 0.6008775319200089, + "grad_norm": 0.11384332180023193, + "learning_rate": 4.0084768417512486e-05, + "loss": 0.18769536018371583, + "step": 139960 + }, + { + "epoch": 0.600920464010029, + "grad_norm": 0.014897527173161507, + "learning_rate": 4.0080456697394856e-05, + "loss": 0.18856242895126343, + "step": 139970 + }, + { + "epoch": 0.6009633961000489, + "grad_norm": 0.5122405290603638, + "learning_rate": 4.0076144977277234e-05, + "loss": 0.12947138547897338, + "step": 139980 + }, + { + "epoch": 0.6010063281900689, + "grad_norm": 0.085267573595047, + "learning_rate": 4.007183325715961e-05, + "loss": 0.23959081172943114, + "step": 139990 + }, + { + "epoch": 0.601049260280089, + "grad_norm": 1.2138646841049194, + "learning_rate": 4.006752153704199e-05, + "loss": 0.24218153953552246, + "step": 140000 + }, + { + "epoch": 0.601049260280089, + "eval_loss": 0.40640848875045776, + "eval_runtime": 27.2521, + "eval_samples_per_second": 3.669, + "eval_steps_per_second": 3.669, + "step": 140000 + }, + { + "epoch": 0.6010921923701089, + "grad_norm": 2.4744691848754883, + "learning_rate": 4.0063209816924366e-05, + "loss": 0.21239218711853028, + "step": 140010 + }, + { + "epoch": 0.601135124460129, + "grad_norm": 0.18298260867595673, + "learning_rate": 4.005889809680674e-05, + "loss": 0.1979839324951172, + "step": 140020 + }, + { + "epoch": 0.601178056550149, + "grad_norm": 0.24607233703136444, + "learning_rate": 4.005458637668912e-05, + "loss": 0.14506577253341674, + "step": 140030 + }, + { + "epoch": 0.6012209886401689, + "grad_norm": 1.946406602859497, + "learning_rate": 4.00502746565715e-05, + "loss": 0.19754066467285156, + "step": 140040 + }, + { + "epoch": 0.601263920730189, + "grad_norm": 0.16587841510772705, + "learning_rate": 4.0045962936453875e-05, + "loss": 0.20553255081176758, + "step": 140050 + }, + { + "epoch": 0.601306852820209, + "grad_norm": 0.08923459053039551, + "learning_rate": 4.0041651216336246e-05, + "loss": 0.2600035429000854, + "step": 140060 + }, + { + "epoch": 0.601349784910229, + "grad_norm": 0.17558036744594574, + "learning_rate": 4.003733949621862e-05, + "loss": 0.29572343826293945, + "step": 140070 + }, + { + "epoch": 0.601392717000249, + "grad_norm": 0.0013027727836742997, + "learning_rate": 4.0033027776101e-05, + "loss": 0.058405238389968875, + "step": 140080 + }, + { + "epoch": 0.601435649090269, + "grad_norm": 0.0022462320048362017, + "learning_rate": 4.002871605598338e-05, + "loss": 0.28202013969421386, + "step": 140090 + }, + { + "epoch": 0.6014785811802891, + "grad_norm": 0.0036439145915210247, + "learning_rate": 4.002440433586575e-05, + "loss": 0.160456120967865, + "step": 140100 + }, + { + "epoch": 0.601521513270309, + "grad_norm": 1.2931004762649536, + "learning_rate": 4.0020092615748126e-05, + "loss": 0.2139185905456543, + "step": 140110 + }, + { + "epoch": 0.601564445360329, + "grad_norm": 4.611878395080566, + "learning_rate": 4.00157808956305e-05, + "loss": 0.3115818738937378, + "step": 140120 + }, + { + "epoch": 0.6016073774503491, + "grad_norm": 0.5753462314605713, + "learning_rate": 4.001146917551288e-05, + "loss": 0.21315619945526124, + "step": 140130 + }, + { + "epoch": 0.601650309540369, + "grad_norm": 0.18079976737499237, + "learning_rate": 4.000715745539526e-05, + "loss": 0.09467348456382751, + "step": 140140 + }, + { + "epoch": 0.6016932416303891, + "grad_norm": 0.18340753018856049, + "learning_rate": 4.0002845735277635e-05, + "loss": 0.20837154388427734, + "step": 140150 + }, + { + "epoch": 0.6017361737204091, + "grad_norm": 1.1413424015045166, + "learning_rate": 3.999853401516001e-05, + "loss": 0.11949852705001832, + "step": 140160 + }, + { + "epoch": 0.601779105810429, + "grad_norm": 5.983482360839844, + "learning_rate": 3.999422229504239e-05, + "loss": 0.3660741329193115, + "step": 140170 + }, + { + "epoch": 0.6018220379004491, + "grad_norm": 0.005374810192734003, + "learning_rate": 3.998991057492476e-05, + "loss": 0.2321415901184082, + "step": 140180 + }, + { + "epoch": 0.6018649699904691, + "grad_norm": 0.2554090917110443, + "learning_rate": 3.998559885480714e-05, + "loss": 0.19533088207244872, + "step": 140190 + }, + { + "epoch": 0.601907902080489, + "grad_norm": 0.40389445424079895, + "learning_rate": 3.9981287134689515e-05, + "loss": 0.17982652187347412, + "step": 140200 + }, + { + "epoch": 0.6019508341705091, + "grad_norm": 0.14264102280139923, + "learning_rate": 3.997697541457189e-05, + "loss": 0.11623998880386352, + "step": 140210 + }, + { + "epoch": 0.6019937662605291, + "grad_norm": 11.018896102905273, + "learning_rate": 3.997266369445426e-05, + "loss": 0.29056406021118164, + "step": 140220 + }, + { + "epoch": 0.6020366983505491, + "grad_norm": 0.019281527027487755, + "learning_rate": 3.996835197433664e-05, + "loss": 0.13616764545440674, + "step": 140230 + }, + { + "epoch": 0.6020796304405691, + "grad_norm": 6.079987049102783, + "learning_rate": 3.996404025421902e-05, + "loss": 0.2820961236953735, + "step": 140240 + }, + { + "epoch": 0.6021225625305892, + "grad_norm": 0.09342295676469803, + "learning_rate": 3.9959728534101395e-05, + "loss": 0.393782639503479, + "step": 140250 + }, + { + "epoch": 0.6021654946206091, + "grad_norm": 0.13038615882396698, + "learning_rate": 3.995541681398377e-05, + "loss": 0.08735232949256896, + "step": 140260 + }, + { + "epoch": 0.6022084267106291, + "grad_norm": 0.2240753024816513, + "learning_rate": 3.995110509386615e-05, + "loss": 0.5003287315368652, + "step": 140270 + }, + { + "epoch": 0.6022513588006492, + "grad_norm": 0.0016490390989929438, + "learning_rate": 3.994679337374853e-05, + "loss": 0.23796720504760743, + "step": 140280 + }, + { + "epoch": 0.6022942908906691, + "grad_norm": 0.2137441635131836, + "learning_rate": 3.9942481653630904e-05, + "loss": 0.06123405694961548, + "step": 140290 + }, + { + "epoch": 0.6023372229806891, + "grad_norm": 2.8598108291625977, + "learning_rate": 3.9938169933513275e-05, + "loss": 0.14074127674102782, + "step": 140300 + }, + { + "epoch": 0.6023801550707092, + "grad_norm": 2.1544981002807617, + "learning_rate": 3.993385821339565e-05, + "loss": 0.21434509754180908, + "step": 140310 + }, + { + "epoch": 0.6024230871607291, + "grad_norm": 9.761442184448242, + "learning_rate": 3.992954649327803e-05, + "loss": 0.43427672386169436, + "step": 140320 + }, + { + "epoch": 0.6024660192507492, + "grad_norm": 1.8055157661437988, + "learning_rate": 3.992523477316041e-05, + "loss": 0.2403231143951416, + "step": 140330 + }, + { + "epoch": 0.6025089513407692, + "grad_norm": 0.7270634770393372, + "learning_rate": 3.992092305304278e-05, + "loss": 0.11149526834487915, + "step": 140340 + }, + { + "epoch": 0.6025518834307891, + "grad_norm": 8.909162521362305, + "learning_rate": 3.9916611332925155e-05, + "loss": 0.2438430070877075, + "step": 140350 + }, + { + "epoch": 0.6025948155208092, + "grad_norm": 1.5331367254257202, + "learning_rate": 3.991229961280753e-05, + "loss": 0.4124717712402344, + "step": 140360 + }, + { + "epoch": 0.6026377476108292, + "grad_norm": 3.782731771469116, + "learning_rate": 3.9907987892689916e-05, + "loss": 0.3205994129180908, + "step": 140370 + }, + { + "epoch": 0.6026806797008492, + "grad_norm": 0.04977316036820412, + "learning_rate": 3.990367617257229e-05, + "loss": 0.26922576427459716, + "step": 140380 + }, + { + "epoch": 0.6027236117908692, + "grad_norm": 0.2401779443025589, + "learning_rate": 3.9899364452454664e-05, + "loss": 0.4137077331542969, + "step": 140390 + }, + { + "epoch": 0.6027665438808892, + "grad_norm": 0.03330448642373085, + "learning_rate": 3.989505273233704e-05, + "loss": 0.19204211235046387, + "step": 140400 + }, + { + "epoch": 0.6028094759709092, + "grad_norm": 0.19992871582508087, + "learning_rate": 3.989074101221942e-05, + "loss": 0.11128789186477661, + "step": 140410 + }, + { + "epoch": 0.6028524080609292, + "grad_norm": 0.02906624972820282, + "learning_rate": 3.9886429292101796e-05, + "loss": 0.08270058035850525, + "step": 140420 + }, + { + "epoch": 0.6028953401509493, + "grad_norm": 1.2285542488098145, + "learning_rate": 3.988211757198417e-05, + "loss": 0.3234747886657715, + "step": 140430 + }, + { + "epoch": 0.6029382722409692, + "grad_norm": 0.488831102848053, + "learning_rate": 3.9877805851866544e-05, + "loss": 0.25629940032958987, + "step": 140440 + }, + { + "epoch": 0.6029812043309892, + "grad_norm": 0.21581482887268066, + "learning_rate": 3.987349413174892e-05, + "loss": 0.1724982500076294, + "step": 140450 + }, + { + "epoch": 0.6030241364210093, + "grad_norm": 0.7537409663200378, + "learning_rate": 3.98691824116313e-05, + "loss": 0.25681922435760496, + "step": 140460 + }, + { + "epoch": 0.6030670685110292, + "grad_norm": 0.03951547294855118, + "learning_rate": 3.986487069151367e-05, + "loss": 0.06932097673416138, + "step": 140470 + }, + { + "epoch": 0.6031100006010492, + "grad_norm": 0.018945086747407913, + "learning_rate": 3.9860558971396053e-05, + "loss": 0.22218098640441894, + "step": 140480 + }, + { + "epoch": 0.6031529326910693, + "grad_norm": 0.23054622113704681, + "learning_rate": 3.985624725127843e-05, + "loss": 0.215470552444458, + "step": 140490 + }, + { + "epoch": 0.6031958647810892, + "grad_norm": 0.20627227425575256, + "learning_rate": 3.985193553116081e-05, + "loss": 0.3484283208847046, + "step": 140500 + }, + { + "epoch": 0.6032387968711093, + "grad_norm": 0.10565587133169174, + "learning_rate": 3.984762381104318e-05, + "loss": 0.21985170841217042, + "step": 140510 + }, + { + "epoch": 0.6032817289611293, + "grad_norm": 3.1740968227386475, + "learning_rate": 3.9843312090925556e-05, + "loss": 0.40097837448120116, + "step": 140520 + }, + { + "epoch": 0.6033246610511493, + "grad_norm": 0.024558668956160545, + "learning_rate": 3.9839000370807933e-05, + "loss": 0.19059033393859864, + "step": 140530 + }, + { + "epoch": 0.6033675931411693, + "grad_norm": 0.9281611442565918, + "learning_rate": 3.983468865069031e-05, + "loss": 0.14230889081954956, + "step": 140540 + }, + { + "epoch": 0.6034105252311893, + "grad_norm": 0.009123001247644424, + "learning_rate": 3.983037693057268e-05, + "loss": 0.2806967496871948, + "step": 140550 + }, + { + "epoch": 0.6034534573212094, + "grad_norm": 1.394128680229187, + "learning_rate": 3.982606521045506e-05, + "loss": 0.15292155742645264, + "step": 140560 + }, + { + "epoch": 0.6034963894112293, + "grad_norm": 3.4160406589508057, + "learning_rate": 3.9821753490337436e-05, + "loss": 0.3051342248916626, + "step": 140570 + }, + { + "epoch": 0.6035393215012493, + "grad_norm": 0.4787931740283966, + "learning_rate": 3.981744177021981e-05, + "loss": 0.19603606462478637, + "step": 140580 + }, + { + "epoch": 0.6035822535912694, + "grad_norm": 0.012746613472700119, + "learning_rate": 3.981313005010219e-05, + "loss": 0.14620459079742432, + "step": 140590 + }, + { + "epoch": 0.6036251856812893, + "grad_norm": 0.11777313798666, + "learning_rate": 3.980881832998457e-05, + "loss": 0.43863778114318847, + "step": 140600 + }, + { + "epoch": 0.6036681177713094, + "grad_norm": 0.0012925468618050218, + "learning_rate": 3.9804506609866945e-05, + "loss": 0.27455117702484133, + "step": 140610 + }, + { + "epoch": 0.6037110498613294, + "grad_norm": 0.625106930732727, + "learning_rate": 3.980019488974932e-05, + "loss": 0.250502610206604, + "step": 140620 + }, + { + "epoch": 0.6037539819513493, + "grad_norm": 0.003046454396098852, + "learning_rate": 3.979588316963169e-05, + "loss": 0.021387167274951935, + "step": 140630 + }, + { + "epoch": 0.6037969140413694, + "grad_norm": 0.2070937603712082, + "learning_rate": 3.979157144951407e-05, + "loss": 0.2071608781814575, + "step": 140640 + }, + { + "epoch": 0.6038398461313894, + "grad_norm": 0.05185168609023094, + "learning_rate": 3.978725972939645e-05, + "loss": 0.0429812103509903, + "step": 140650 + }, + { + "epoch": 0.6038827782214093, + "grad_norm": 0.05144479125738144, + "learning_rate": 3.9782948009278825e-05, + "loss": 0.3867565870285034, + "step": 140660 + }, + { + "epoch": 0.6039257103114294, + "grad_norm": 1.467725396156311, + "learning_rate": 3.9778636289161196e-05, + "loss": 0.09524292349815369, + "step": 140670 + }, + { + "epoch": 0.6039686424014494, + "grad_norm": 0.16519929468631744, + "learning_rate": 3.977432456904357e-05, + "loss": 0.3241549015045166, + "step": 140680 + }, + { + "epoch": 0.6040115744914694, + "grad_norm": 16.647319793701172, + "learning_rate": 3.977001284892595e-05, + "loss": 0.3598466396331787, + "step": 140690 + }, + { + "epoch": 0.6040545065814894, + "grad_norm": 0.0008348033879883587, + "learning_rate": 3.976570112880833e-05, + "loss": 0.22508018016815184, + "step": 140700 + }, + { + "epoch": 0.6040974386715094, + "grad_norm": 0.00723852077499032, + "learning_rate": 3.9761389408690705e-05, + "loss": 0.23433120250701905, + "step": 140710 + }, + { + "epoch": 0.6041403707615294, + "grad_norm": 0.5035370588302612, + "learning_rate": 3.975707768857308e-05, + "loss": 0.15878416299819947, + "step": 140720 + }, + { + "epoch": 0.6041833028515494, + "grad_norm": 0.12092899531126022, + "learning_rate": 3.975276596845546e-05, + "loss": 0.16906187534332276, + "step": 140730 + }, + { + "epoch": 0.6042262349415695, + "grad_norm": 0.40479961037635803, + "learning_rate": 3.974845424833784e-05, + "loss": 0.20884625911712645, + "step": 140740 + }, + { + "epoch": 0.6042691670315894, + "grad_norm": 0.9106727838516235, + "learning_rate": 3.974414252822021e-05, + "loss": 0.23875579833984376, + "step": 140750 + }, + { + "epoch": 0.6043120991216094, + "grad_norm": 0.01491815596818924, + "learning_rate": 3.9739830808102585e-05, + "loss": 0.19347492456436158, + "step": 140760 + }, + { + "epoch": 0.6043550312116295, + "grad_norm": 1.5120779275894165, + "learning_rate": 3.973551908798496e-05, + "loss": 0.1883600115776062, + "step": 140770 + }, + { + "epoch": 0.6043979633016494, + "grad_norm": 1.720261573791504, + "learning_rate": 3.973120736786734e-05, + "loss": 0.41421709060668943, + "step": 140780 + }, + { + "epoch": 0.6044408953916695, + "grad_norm": 0.0006754127098247409, + "learning_rate": 3.972689564774972e-05, + "loss": 0.07793102860450744, + "step": 140790 + }, + { + "epoch": 0.6044838274816895, + "grad_norm": 0.01753504015505314, + "learning_rate": 3.972258392763209e-05, + "loss": 0.13585551977157592, + "step": 140800 + }, + { + "epoch": 0.6045267595717094, + "grad_norm": 1.807799220085144, + "learning_rate": 3.9718272207514465e-05, + "loss": 0.2476651668548584, + "step": 140810 + }, + { + "epoch": 0.6045696916617295, + "grad_norm": 0.3717762231826782, + "learning_rate": 3.971396048739684e-05, + "loss": 0.1306217670440674, + "step": 140820 + }, + { + "epoch": 0.6046126237517495, + "grad_norm": 0.0009543145424686372, + "learning_rate": 3.970964876727922e-05, + "loss": 0.3181750774383545, + "step": 140830 + }, + { + "epoch": 0.6046555558417694, + "grad_norm": 1.5006979703903198, + "learning_rate": 3.97053370471616e-05, + "loss": 0.1982407569885254, + "step": 140840 + }, + { + "epoch": 0.6046984879317895, + "grad_norm": 0.015589235350489616, + "learning_rate": 3.9701025327043975e-05, + "loss": 0.026549032330513, + "step": 140850 + }, + { + "epoch": 0.6047414200218095, + "grad_norm": 0.0038561576511710882, + "learning_rate": 3.969671360692635e-05, + "loss": 0.2876553773880005, + "step": 140860 + }, + { + "epoch": 0.6047843521118295, + "grad_norm": 0.00041575246723368764, + "learning_rate": 3.969240188680873e-05, + "loss": 0.08002186417579651, + "step": 140870 + }, + { + "epoch": 0.6048272842018495, + "grad_norm": 0.47469818592071533, + "learning_rate": 3.96880901666911e-05, + "loss": 0.22481439113616944, + "step": 140880 + }, + { + "epoch": 0.6048702162918695, + "grad_norm": 15.686685562133789, + "learning_rate": 3.968377844657348e-05, + "loss": 0.25566079616546633, + "step": 140890 + }, + { + "epoch": 0.6049131483818895, + "grad_norm": 1.3537100553512573, + "learning_rate": 3.9679466726455854e-05, + "loss": 0.2686317920684814, + "step": 140900 + }, + { + "epoch": 0.6049560804719095, + "grad_norm": 0.0857565775513649, + "learning_rate": 3.967515500633823e-05, + "loss": 0.21077220439910888, + "step": 140910 + }, + { + "epoch": 0.6049990125619296, + "grad_norm": 20.2806339263916, + "learning_rate": 3.96708432862206e-05, + "loss": 0.3465421199798584, + "step": 140920 + }, + { + "epoch": 0.6050419446519495, + "grad_norm": 0.06577757745981216, + "learning_rate": 3.966653156610298e-05, + "loss": 0.10634375810623169, + "step": 140930 + }, + { + "epoch": 0.6050848767419695, + "grad_norm": 0.6883453726768494, + "learning_rate": 3.966221984598536e-05, + "loss": 0.1529320001602173, + "step": 140940 + }, + { + "epoch": 0.6051278088319896, + "grad_norm": 0.33841758966445923, + "learning_rate": 3.9657908125867734e-05, + "loss": 0.12229356765747071, + "step": 140950 + }, + { + "epoch": 0.6051707409220096, + "grad_norm": 0.032311972230672836, + "learning_rate": 3.965359640575011e-05, + "loss": 0.20672781467437745, + "step": 140960 + }, + { + "epoch": 0.6052136730120296, + "grad_norm": 0.029374683275818825, + "learning_rate": 3.964928468563249e-05, + "loss": 0.27740681171417236, + "step": 140970 + }, + { + "epoch": 0.6052566051020496, + "grad_norm": 1.1965817213058472, + "learning_rate": 3.9644972965514866e-05, + "loss": 0.49901623725891114, + "step": 140980 + }, + { + "epoch": 0.6052995371920696, + "grad_norm": 1.5310200452804565, + "learning_rate": 3.9640661245397244e-05, + "loss": 0.13384013175964354, + "step": 140990 + }, + { + "epoch": 0.6053424692820896, + "grad_norm": 2.602834939956665, + "learning_rate": 3.9636349525279614e-05, + "loss": 0.12581024169921876, + "step": 141000 + }, + { + "epoch": 0.6053424692820896, + "eval_loss": 0.4089200496673584, + "eval_runtime": 27.1954, + "eval_samples_per_second": 3.677, + "eval_steps_per_second": 3.677, + "step": 141000 + }, + { + "epoch": 0.6053854013721096, + "grad_norm": 0.05095406249165535, + "learning_rate": 3.963203780516199e-05, + "loss": 0.11216226816177369, + "step": 141010 + }, + { + "epoch": 0.6054283334621297, + "grad_norm": 0.002075101248919964, + "learning_rate": 3.962772608504437e-05, + "loss": 0.15308403968811035, + "step": 141020 + }, + { + "epoch": 0.6054712655521496, + "grad_norm": 2.099597454071045, + "learning_rate": 3.9623414364926746e-05, + "loss": 0.33690292835235597, + "step": 141030 + }, + { + "epoch": 0.6055141976421696, + "grad_norm": 0.05643483251333237, + "learning_rate": 3.961910264480912e-05, + "loss": 0.1339421510696411, + "step": 141040 + }, + { + "epoch": 0.6055571297321897, + "grad_norm": 0.0012016056571155787, + "learning_rate": 3.9614790924691494e-05, + "loss": 0.1170223593711853, + "step": 141050 + }, + { + "epoch": 0.6056000618222096, + "grad_norm": 0.9147559404373169, + "learning_rate": 3.961047920457387e-05, + "loss": 0.4862982749938965, + "step": 141060 + }, + { + "epoch": 0.6056429939122296, + "grad_norm": 0.005203854292631149, + "learning_rate": 3.9606167484456256e-05, + "loss": 0.26890594959259034, + "step": 141070 + }, + { + "epoch": 0.6056859260022497, + "grad_norm": 1.590675950050354, + "learning_rate": 3.9601855764338626e-05, + "loss": 0.26728928089141846, + "step": 141080 + }, + { + "epoch": 0.6057288580922696, + "grad_norm": 0.009610733948647976, + "learning_rate": 3.9597544044221004e-05, + "loss": 0.17799413204193115, + "step": 141090 + }, + { + "epoch": 0.6057717901822897, + "grad_norm": 0.0018764605047181249, + "learning_rate": 3.959323232410338e-05, + "loss": 0.1783639073371887, + "step": 141100 + }, + { + "epoch": 0.6058147222723097, + "grad_norm": 2.180757761001587, + "learning_rate": 3.958892060398576e-05, + "loss": 0.3086270332336426, + "step": 141110 + }, + { + "epoch": 0.6058576543623296, + "grad_norm": 0.050399571657180786, + "learning_rate": 3.958460888386813e-05, + "loss": 0.13583248853683472, + "step": 141120 + }, + { + "epoch": 0.6059005864523497, + "grad_norm": 0.2559893727302551, + "learning_rate": 3.9580297163750506e-05, + "loss": 0.23775179386138917, + "step": 141130 + }, + { + "epoch": 0.6059435185423697, + "grad_norm": 0.005343136377632618, + "learning_rate": 3.9575985443632884e-05, + "loss": 0.1065142273902893, + "step": 141140 + }, + { + "epoch": 0.6059864506323897, + "grad_norm": 0.011462339200079441, + "learning_rate": 3.957167372351526e-05, + "loss": 0.2745601892471313, + "step": 141150 + }, + { + "epoch": 0.6060293827224097, + "grad_norm": 2.439439535140991, + "learning_rate": 3.956736200339764e-05, + "loss": 0.18180274963378906, + "step": 141160 + }, + { + "epoch": 0.6060723148124297, + "grad_norm": 0.012357541359961033, + "learning_rate": 3.956305028328001e-05, + "loss": 0.18113479614257813, + "step": 141170 + }, + { + "epoch": 0.6061152469024497, + "grad_norm": 0.0018625404918566346, + "learning_rate": 3.955873856316239e-05, + "loss": 0.07990905046463012, + "step": 141180 + }, + { + "epoch": 0.6061581789924697, + "grad_norm": 0.032597437500953674, + "learning_rate": 3.955442684304477e-05, + "loss": 0.1915552258491516, + "step": 141190 + }, + { + "epoch": 0.6062011110824898, + "grad_norm": 0.004464549943804741, + "learning_rate": 3.955011512292715e-05, + "loss": 0.1384279489517212, + "step": 141200 + }, + { + "epoch": 0.6062440431725097, + "grad_norm": 0.2348497062921524, + "learning_rate": 3.954580340280952e-05, + "loss": 0.13170111179351807, + "step": 141210 + }, + { + "epoch": 0.6062869752625297, + "grad_norm": 0.0035776502918452024, + "learning_rate": 3.9541491682691896e-05, + "loss": 0.2760654926300049, + "step": 141220 + }, + { + "epoch": 0.6063299073525498, + "grad_norm": 1.5733288526535034, + "learning_rate": 3.953717996257427e-05, + "loss": 0.22948665618896485, + "step": 141230 + }, + { + "epoch": 0.6063728394425697, + "grad_norm": 1.8536306619644165, + "learning_rate": 3.953286824245665e-05, + "loss": 0.33787682056427004, + "step": 141240 + }, + { + "epoch": 0.6064157715325897, + "grad_norm": 1.747023344039917, + "learning_rate": 3.952855652233902e-05, + "loss": 0.14881832599639894, + "step": 141250 + }, + { + "epoch": 0.6064587036226098, + "grad_norm": 0.2886177897453308, + "learning_rate": 3.95242448022214e-05, + "loss": 0.338626503944397, + "step": 141260 + }, + { + "epoch": 0.6065016357126297, + "grad_norm": 0.0006161421770229936, + "learning_rate": 3.9519933082103775e-05, + "loss": 0.11174997091293334, + "step": 141270 + }, + { + "epoch": 0.6065445678026498, + "grad_norm": 3.2979769706726074, + "learning_rate": 3.951562136198615e-05, + "loss": 0.18851048946380616, + "step": 141280 + }, + { + "epoch": 0.6065874998926698, + "grad_norm": 0.11197972297668457, + "learning_rate": 3.951130964186853e-05, + "loss": 0.10484591722488404, + "step": 141290 + }, + { + "epoch": 0.6066304319826897, + "grad_norm": 0.16652366518974304, + "learning_rate": 3.950699792175091e-05, + "loss": 0.0662794828414917, + "step": 141300 + }, + { + "epoch": 0.6066733640727098, + "grad_norm": 0.09492175281047821, + "learning_rate": 3.9502686201633285e-05, + "loss": 0.18237814903259278, + "step": 141310 + }, + { + "epoch": 0.6067162961627298, + "grad_norm": 0.8800640106201172, + "learning_rate": 3.949837448151566e-05, + "loss": 0.3102026700973511, + "step": 141320 + }, + { + "epoch": 0.6067592282527497, + "grad_norm": 0.0023318880703300238, + "learning_rate": 3.949406276139803e-05, + "loss": 0.030414551496505737, + "step": 141330 + }, + { + "epoch": 0.6068021603427698, + "grad_norm": 0.007045125123113394, + "learning_rate": 3.948975104128041e-05, + "loss": 0.1279890775680542, + "step": 141340 + }, + { + "epoch": 0.6068450924327898, + "grad_norm": 1.4796124696731567, + "learning_rate": 3.948543932116279e-05, + "loss": 0.21116414070129394, + "step": 141350 + }, + { + "epoch": 0.6068880245228098, + "grad_norm": 0.23528429865837097, + "learning_rate": 3.9481127601045165e-05, + "loss": 0.12599529027938844, + "step": 141360 + }, + { + "epoch": 0.6069309566128298, + "grad_norm": 1.651747703552246, + "learning_rate": 3.9476815880927535e-05, + "loss": 0.37949261665344236, + "step": 141370 + }, + { + "epoch": 0.6069738887028499, + "grad_norm": 0.061352405697107315, + "learning_rate": 3.947250416080991e-05, + "loss": 0.17497749328613282, + "step": 141380 + }, + { + "epoch": 0.6070168207928699, + "grad_norm": 0.0331333689391613, + "learning_rate": 3.946819244069229e-05, + "loss": 0.11293892860412598, + "step": 141390 + }, + { + "epoch": 0.6070597528828898, + "grad_norm": 5.993484020233154, + "learning_rate": 3.946388072057467e-05, + "loss": 0.40219316482543943, + "step": 141400 + }, + { + "epoch": 0.6071026849729099, + "grad_norm": 0.0046135191805660725, + "learning_rate": 3.9459569000457045e-05, + "loss": 0.10798419713973999, + "step": 141410 + }, + { + "epoch": 0.6071456170629299, + "grad_norm": 3.0338857173919678, + "learning_rate": 3.945525728033942e-05, + "loss": 0.20181164741516114, + "step": 141420 + }, + { + "epoch": 0.6071885491529498, + "grad_norm": 1.8571960926055908, + "learning_rate": 3.94509455602218e-05, + "loss": 0.214121413230896, + "step": 141430 + }, + { + "epoch": 0.6072314812429699, + "grad_norm": 2.187960147857666, + "learning_rate": 3.944663384010418e-05, + "loss": 0.403093957901001, + "step": 141440 + }, + { + "epoch": 0.6072744133329899, + "grad_norm": 0.41850295662879944, + "learning_rate": 3.944232211998655e-05, + "loss": 0.18242256641387938, + "step": 141450 + }, + { + "epoch": 0.6073173454230099, + "grad_norm": 0.011669473722577095, + "learning_rate": 3.9438010399868925e-05, + "loss": 0.0749740481376648, + "step": 141460 + }, + { + "epoch": 0.6073602775130299, + "grad_norm": 0.7934962511062622, + "learning_rate": 3.94336986797513e-05, + "loss": 0.2681394338607788, + "step": 141470 + }, + { + "epoch": 0.6074032096030499, + "grad_norm": 0.0006156415329314768, + "learning_rate": 3.942938695963368e-05, + "loss": 0.16833350658416749, + "step": 141480 + }, + { + "epoch": 0.6074461416930699, + "grad_norm": 0.021375132724642754, + "learning_rate": 3.942507523951606e-05, + "loss": 0.23237297534942628, + "step": 141490 + }, + { + "epoch": 0.6074890737830899, + "grad_norm": 0.5006760358810425, + "learning_rate": 3.942076351939843e-05, + "loss": 0.26445937156677246, + "step": 141500 + }, + { + "epoch": 0.60753200587311, + "grad_norm": 0.8343283534049988, + "learning_rate": 3.9416451799280805e-05, + "loss": 0.3213693380355835, + "step": 141510 + }, + { + "epoch": 0.6075749379631299, + "grad_norm": 3.1894664764404297, + "learning_rate": 3.941214007916318e-05, + "loss": 0.2887598514556885, + "step": 141520 + }, + { + "epoch": 0.6076178700531499, + "grad_norm": 0.023277558386325836, + "learning_rate": 3.940782835904556e-05, + "loss": 0.2421741008758545, + "step": 141530 + }, + { + "epoch": 0.60766080214317, + "grad_norm": 0.29840824007987976, + "learning_rate": 3.940351663892794e-05, + "loss": 0.17128605842590333, + "step": 141540 + }, + { + "epoch": 0.6077037342331899, + "grad_norm": 0.03273274376988411, + "learning_rate": 3.9399204918810314e-05, + "loss": 0.5258442878723144, + "step": 141550 + }, + { + "epoch": 0.60774666632321, + "grad_norm": 0.3960159718990326, + "learning_rate": 3.939489319869269e-05, + "loss": 0.12907603979110718, + "step": 141560 + }, + { + "epoch": 0.60778959841323, + "grad_norm": 0.0017227198695763946, + "learning_rate": 3.939058147857507e-05, + "loss": 0.177092981338501, + "step": 141570 + }, + { + "epoch": 0.6078325305032499, + "grad_norm": 0.002443228615447879, + "learning_rate": 3.938626975845744e-05, + "loss": 0.3996154308319092, + "step": 141580 + }, + { + "epoch": 0.60787546259327, + "grad_norm": 0.15868256986141205, + "learning_rate": 3.9381958038339817e-05, + "loss": 0.5284997463226319, + "step": 141590 + }, + { + "epoch": 0.60791839468329, + "grad_norm": 0.03645209223031998, + "learning_rate": 3.9377646318222194e-05, + "loss": 0.303369140625, + "step": 141600 + }, + { + "epoch": 0.6079613267733099, + "grad_norm": 1.6641205549240112, + "learning_rate": 3.937333459810457e-05, + "loss": 0.13876266479492189, + "step": 141610 + }, + { + "epoch": 0.60800425886333, + "grad_norm": 0.05909671634435654, + "learning_rate": 3.936902287798694e-05, + "loss": 0.21023335456848144, + "step": 141620 + }, + { + "epoch": 0.60804719095335, + "grad_norm": 0.0023271851241588593, + "learning_rate": 3.936471115786932e-05, + "loss": 0.028406143188476562, + "step": 141630 + }, + { + "epoch": 0.60809012304337, + "grad_norm": 0.004466942045837641, + "learning_rate": 3.9360399437751697e-05, + "loss": 0.19075484275817872, + "step": 141640 + }, + { + "epoch": 0.60813305513339, + "grad_norm": 1.219118595123291, + "learning_rate": 3.9356087717634074e-05, + "loss": 0.07533459663391114, + "step": 141650 + }, + { + "epoch": 0.60817598722341, + "grad_norm": 0.23978543281555176, + "learning_rate": 3.935177599751645e-05, + "loss": 0.16630566120147705, + "step": 141660 + }, + { + "epoch": 0.60821891931343, + "grad_norm": 97.8420639038086, + "learning_rate": 3.934746427739883e-05, + "loss": 0.40468668937683105, + "step": 141670 + }, + { + "epoch": 0.60826185140345, + "grad_norm": 0.0013681944692507386, + "learning_rate": 3.9343152557281206e-05, + "loss": 0.1450944185256958, + "step": 141680 + }, + { + "epoch": 0.6083047834934701, + "grad_norm": 0.004667914938181639, + "learning_rate": 3.933884083716358e-05, + "loss": 0.21857268810272218, + "step": 141690 + }, + { + "epoch": 0.60834771558349, + "grad_norm": 1.4839063882827759, + "learning_rate": 3.9334529117045954e-05, + "loss": 0.04907590448856354, + "step": 141700 + }, + { + "epoch": 0.60839064767351, + "grad_norm": 0.12947788834571838, + "learning_rate": 3.933021739692833e-05, + "loss": 0.36401753425598143, + "step": 141710 + }, + { + "epoch": 0.6084335797635301, + "grad_norm": 1.1010395288467407, + "learning_rate": 3.932590567681071e-05, + "loss": 0.36334273815155027, + "step": 141720 + }, + { + "epoch": 0.60847651185355, + "grad_norm": 1.1198389530181885, + "learning_rate": 3.9321593956693086e-05, + "loss": 0.12767027616500853, + "step": 141730 + }, + { + "epoch": 0.60851944394357, + "grad_norm": 6.048811435699463, + "learning_rate": 3.9317282236575456e-05, + "loss": 0.2986024856567383, + "step": 141740 + }, + { + "epoch": 0.6085623760335901, + "grad_norm": 0.14294730126857758, + "learning_rate": 3.9312970516457834e-05, + "loss": 0.2689626455307007, + "step": 141750 + }, + { + "epoch": 0.60860530812361, + "grad_norm": 19.958438873291016, + "learning_rate": 3.930865879634021e-05, + "loss": 0.3430215120315552, + "step": 141760 + }, + { + "epoch": 0.6086482402136301, + "grad_norm": 0.4098569452762604, + "learning_rate": 3.9304347076222595e-05, + "loss": 0.1653934359550476, + "step": 141770 + }, + { + "epoch": 0.6086911723036501, + "grad_norm": 0.013435311615467072, + "learning_rate": 3.9300035356104966e-05, + "loss": 0.2686276912689209, + "step": 141780 + }, + { + "epoch": 0.60873410439367, + "grad_norm": 1.2856467962265015, + "learning_rate": 3.929572363598734e-05, + "loss": 0.4310938358306885, + "step": 141790 + }, + { + "epoch": 0.6087770364836901, + "grad_norm": 0.027985993772745132, + "learning_rate": 3.929141191586972e-05, + "loss": 0.2021188497543335, + "step": 141800 + }, + { + "epoch": 0.6088199685737101, + "grad_norm": 0.003142294241115451, + "learning_rate": 3.92871001957521e-05, + "loss": 0.09466840624809265, + "step": 141810 + }, + { + "epoch": 0.6088629006637302, + "grad_norm": 1.6350462436676025, + "learning_rate": 3.928278847563447e-05, + "loss": 0.14503989219665528, + "step": 141820 + }, + { + "epoch": 0.6089058327537501, + "grad_norm": 0.10769698023796082, + "learning_rate": 3.9278476755516846e-05, + "loss": 0.1177414059638977, + "step": 141830 + }, + { + "epoch": 0.6089487648437701, + "grad_norm": 2.373969554901123, + "learning_rate": 3.927416503539922e-05, + "loss": 0.20538089275360108, + "step": 141840 + }, + { + "epoch": 0.6089916969337902, + "grad_norm": 0.05289050564169884, + "learning_rate": 3.92698533152816e-05, + "loss": 0.04121732711791992, + "step": 141850 + }, + { + "epoch": 0.6090346290238101, + "grad_norm": 2.833883047103882, + "learning_rate": 3.926554159516398e-05, + "loss": 0.2589694023132324, + "step": 141860 + }, + { + "epoch": 0.6090775611138302, + "grad_norm": 0.02419205754995346, + "learning_rate": 3.926122987504635e-05, + "loss": 0.255047082901001, + "step": 141870 + }, + { + "epoch": 0.6091204932038502, + "grad_norm": 0.03955872729420662, + "learning_rate": 3.925691815492873e-05, + "loss": 0.19322144985198975, + "step": 141880 + }, + { + "epoch": 0.6091634252938701, + "grad_norm": 3.4558498859405518, + "learning_rate": 3.925260643481111e-05, + "loss": 0.1118241548538208, + "step": 141890 + }, + { + "epoch": 0.6092063573838902, + "grad_norm": 1.9339896440505981, + "learning_rate": 3.924829471469349e-05, + "loss": 0.30376248359680175, + "step": 141900 + }, + { + "epoch": 0.6092492894739102, + "grad_norm": 1.3876532316207886, + "learning_rate": 3.924398299457586e-05, + "loss": 0.29599320888519287, + "step": 141910 + }, + { + "epoch": 0.6092922215639301, + "grad_norm": 0.10916638374328613, + "learning_rate": 3.9239671274458235e-05, + "loss": 0.2979604244232178, + "step": 141920 + }, + { + "epoch": 0.6093351536539502, + "grad_norm": 0.005515061318874359, + "learning_rate": 3.923535955434061e-05, + "loss": 0.2690037965774536, + "step": 141930 + }, + { + "epoch": 0.6093780857439702, + "grad_norm": 0.38121163845062256, + "learning_rate": 3.923104783422299e-05, + "loss": 0.22012245655059814, + "step": 141940 + }, + { + "epoch": 0.6094210178339902, + "grad_norm": 2.7628302574157715, + "learning_rate": 3.922673611410536e-05, + "loss": 0.14492123126983641, + "step": 141950 + }, + { + "epoch": 0.6094639499240102, + "grad_norm": 0.08726619929075241, + "learning_rate": 3.922242439398774e-05, + "loss": 0.10085601806640625, + "step": 141960 + }, + { + "epoch": 0.6095068820140302, + "grad_norm": 11.465222358703613, + "learning_rate": 3.9218112673870115e-05, + "loss": 0.3472140312194824, + "step": 141970 + }, + { + "epoch": 0.6095498141040502, + "grad_norm": 1.9945697784423828, + "learning_rate": 3.921380095375249e-05, + "loss": 0.10087604522705078, + "step": 141980 + }, + { + "epoch": 0.6095927461940702, + "grad_norm": 2.085869312286377, + "learning_rate": 3.920948923363487e-05, + "loss": 0.2568961620330811, + "step": 141990 + }, + { + "epoch": 0.6096356782840903, + "grad_norm": 0.0020460772793740034, + "learning_rate": 3.920517751351725e-05, + "loss": 0.3749083042144775, + "step": 142000 + }, + { + "epoch": 0.6096356782840903, + "eval_loss": 0.39280468225479126, + "eval_runtime": 27.3058, + "eval_samples_per_second": 3.662, + "eval_steps_per_second": 3.662, + "step": 142000 + }, + { + "epoch": 0.6096786103741102, + "grad_norm": 7.264312744140625, + "learning_rate": 3.9200865793399624e-05, + "loss": 0.13132262229919434, + "step": 142010 + }, + { + "epoch": 0.6097215424641302, + "grad_norm": 0.07945489883422852, + "learning_rate": 3.9196554073282e-05, + "loss": 0.24379079341888427, + "step": 142020 + }, + { + "epoch": 0.6097644745541503, + "grad_norm": 1.0327321290969849, + "learning_rate": 3.919224235316437e-05, + "loss": 0.35031702518463137, + "step": 142030 + }, + { + "epoch": 0.6098074066441702, + "grad_norm": 4.742274761199951, + "learning_rate": 3.918793063304675e-05, + "loss": 0.30789053440093994, + "step": 142040 + }, + { + "epoch": 0.6098503387341903, + "grad_norm": 1.298659324645996, + "learning_rate": 3.918361891292913e-05, + "loss": 0.3997587919235229, + "step": 142050 + }, + { + "epoch": 0.6098932708242103, + "grad_norm": 0.18839246034622192, + "learning_rate": 3.9179307192811504e-05, + "loss": 0.26359546184539795, + "step": 142060 + }, + { + "epoch": 0.6099362029142302, + "grad_norm": 0.2514694333076477, + "learning_rate": 3.9174995472693875e-05, + "loss": 0.08809687495231629, + "step": 142070 + }, + { + "epoch": 0.6099791350042503, + "grad_norm": 0.8888358473777771, + "learning_rate": 3.917068375257625e-05, + "loss": 0.23953211307525635, + "step": 142080 + }, + { + "epoch": 0.6100220670942703, + "grad_norm": 0.018821191042661667, + "learning_rate": 3.916637203245863e-05, + "loss": 0.18834341764450074, + "step": 142090 + }, + { + "epoch": 0.6100649991842902, + "grad_norm": 0.03325490280985832, + "learning_rate": 3.916206031234101e-05, + "loss": 0.2581282377243042, + "step": 142100 + }, + { + "epoch": 0.6101079312743103, + "grad_norm": 5.251251697540283, + "learning_rate": 3.9157748592223384e-05, + "loss": 0.5048004627227783, + "step": 142110 + }, + { + "epoch": 0.6101508633643303, + "grad_norm": 0.020616553723812103, + "learning_rate": 3.915343687210576e-05, + "loss": 0.07277900576591492, + "step": 142120 + }, + { + "epoch": 0.6101937954543503, + "grad_norm": 0.04141886159777641, + "learning_rate": 3.914912515198814e-05, + "loss": 0.2258600950241089, + "step": 142130 + }, + { + "epoch": 0.6102367275443703, + "grad_norm": 4.039946556091309, + "learning_rate": 3.9144813431870516e-05, + "loss": 0.38728866577148435, + "step": 142140 + }, + { + "epoch": 0.6102796596343903, + "grad_norm": 2.5496718883514404, + "learning_rate": 3.914050171175289e-05, + "loss": 0.3502103328704834, + "step": 142150 + }, + { + "epoch": 0.6103225917244103, + "grad_norm": 0.0676613301038742, + "learning_rate": 3.9136189991635264e-05, + "loss": 0.17965009212493896, + "step": 142160 + }, + { + "epoch": 0.6103655238144303, + "grad_norm": 1.1712474822998047, + "learning_rate": 3.913187827151764e-05, + "loss": 0.21342895030975342, + "step": 142170 + }, + { + "epoch": 0.6104084559044504, + "grad_norm": 1.3096929788589478, + "learning_rate": 3.912756655140002e-05, + "loss": 0.23940718173980713, + "step": 142180 + }, + { + "epoch": 0.6104513879944703, + "grad_norm": 0.05468279868364334, + "learning_rate": 3.912325483128239e-05, + "loss": 0.20399603843688965, + "step": 142190 + }, + { + "epoch": 0.6104943200844903, + "grad_norm": 2.2957608699798584, + "learning_rate": 3.911894311116477e-05, + "loss": 0.19245316982269287, + "step": 142200 + }, + { + "epoch": 0.6105372521745104, + "grad_norm": 0.005418090615421534, + "learning_rate": 3.9114631391047144e-05, + "loss": 0.14810930490493773, + "step": 142210 + }, + { + "epoch": 0.6105801842645303, + "grad_norm": 5.267305850982666, + "learning_rate": 3.911031967092952e-05, + "loss": 0.3291645526885986, + "step": 142220 + }, + { + "epoch": 0.6106231163545504, + "grad_norm": 1.2653834819793701, + "learning_rate": 3.91060079508119e-05, + "loss": 0.17146894931793213, + "step": 142230 + }, + { + "epoch": 0.6106660484445704, + "grad_norm": 0.12628792226314545, + "learning_rate": 3.9101696230694276e-05, + "loss": 0.16329647302627565, + "step": 142240 + }, + { + "epoch": 0.6107089805345904, + "grad_norm": 0.09851165860891342, + "learning_rate": 3.9097384510576653e-05, + "loss": 0.1806574583053589, + "step": 142250 + }, + { + "epoch": 0.6107519126246104, + "grad_norm": 4.4101972579956055, + "learning_rate": 3.909307279045903e-05, + "loss": 0.15007705688476564, + "step": 142260 + }, + { + "epoch": 0.6107948447146304, + "grad_norm": 0.046127188950777054, + "learning_rate": 3.908876107034141e-05, + "loss": 0.15361984968185424, + "step": 142270 + }, + { + "epoch": 0.6108377768046505, + "grad_norm": 0.008575751446187496, + "learning_rate": 3.908444935022378e-05, + "loss": 0.18621805906295777, + "step": 142280 + }, + { + "epoch": 0.6108807088946704, + "grad_norm": 0.1520209163427353, + "learning_rate": 3.9080137630106156e-05, + "loss": 0.2147531270980835, + "step": 142290 + }, + { + "epoch": 0.6109236409846904, + "grad_norm": 0.02115943655371666, + "learning_rate": 3.907582590998853e-05, + "loss": 0.2546267032623291, + "step": 142300 + }, + { + "epoch": 0.6109665730747105, + "grad_norm": 4.71589469909668, + "learning_rate": 3.907151418987091e-05, + "loss": 0.1917663335800171, + "step": 142310 + }, + { + "epoch": 0.6110095051647304, + "grad_norm": 0.17938746511936188, + "learning_rate": 3.906720246975328e-05, + "loss": 0.33880517482757566, + "step": 142320 + }, + { + "epoch": 0.6110524372547504, + "grad_norm": 1.8493738174438477, + "learning_rate": 3.906289074963566e-05, + "loss": 0.2039203405380249, + "step": 142330 + }, + { + "epoch": 0.6110953693447705, + "grad_norm": 69.82435607910156, + "learning_rate": 3.9058579029518036e-05, + "loss": 0.14799585342407226, + "step": 142340 + }, + { + "epoch": 0.6111383014347904, + "grad_norm": 0.15120545029640198, + "learning_rate": 3.905426730940041e-05, + "loss": 0.2303483009338379, + "step": 142350 + }, + { + "epoch": 0.6111812335248105, + "grad_norm": 1.926548957824707, + "learning_rate": 3.904995558928279e-05, + "loss": 0.4089049816131592, + "step": 142360 + }, + { + "epoch": 0.6112241656148305, + "grad_norm": 0.0761425718665123, + "learning_rate": 3.904564386916517e-05, + "loss": 0.24638357162475585, + "step": 142370 + }, + { + "epoch": 0.6112670977048504, + "grad_norm": 0.027300817891955376, + "learning_rate": 3.9041332149047545e-05, + "loss": 0.18842120170593263, + "step": 142380 + }, + { + "epoch": 0.6113100297948705, + "grad_norm": 0.06413847208023071, + "learning_rate": 3.903702042892992e-05, + "loss": 0.23671393394470214, + "step": 142390 + }, + { + "epoch": 0.6113529618848905, + "grad_norm": 0.11595964431762695, + "learning_rate": 3.903270870881229e-05, + "loss": 0.1801469087600708, + "step": 142400 + }, + { + "epoch": 0.6113958939749105, + "grad_norm": 5.405128002166748, + "learning_rate": 3.902839698869467e-05, + "loss": 0.1529282808303833, + "step": 142410 + }, + { + "epoch": 0.6114388260649305, + "grad_norm": 1.0243592262268066, + "learning_rate": 3.902408526857705e-05, + "loss": 0.21499390602111818, + "step": 142420 + }, + { + "epoch": 0.6114817581549505, + "grad_norm": 0.0854133814573288, + "learning_rate": 3.9019773548459425e-05, + "loss": 0.11400735378265381, + "step": 142430 + }, + { + "epoch": 0.6115246902449705, + "grad_norm": 2.3615777492523193, + "learning_rate": 3.9015461828341796e-05, + "loss": 0.4467350959777832, + "step": 142440 + }, + { + "epoch": 0.6115676223349905, + "grad_norm": 0.018909169360995293, + "learning_rate": 3.901115010822417e-05, + "loss": 0.4240890026092529, + "step": 142450 + }, + { + "epoch": 0.6116105544250106, + "grad_norm": 0.8031514883041382, + "learning_rate": 3.900683838810655e-05, + "loss": 0.21565210819244385, + "step": 142460 + }, + { + "epoch": 0.6116534865150305, + "grad_norm": 0.15044718980789185, + "learning_rate": 3.900252666798893e-05, + "loss": 0.11793738603591919, + "step": 142470 + }, + { + "epoch": 0.6116964186050505, + "grad_norm": 0.17571763694286346, + "learning_rate": 3.8998214947871305e-05, + "loss": 0.157479989528656, + "step": 142480 + }, + { + "epoch": 0.6117393506950706, + "grad_norm": 1.6672226190567017, + "learning_rate": 3.899390322775368e-05, + "loss": 0.10348098278045655, + "step": 142490 + }, + { + "epoch": 0.6117822827850905, + "grad_norm": 9.708442687988281, + "learning_rate": 3.898959150763606e-05, + "loss": 0.262995171546936, + "step": 142500 + }, + { + "epoch": 0.6118252148751105, + "grad_norm": 0.09473815560340881, + "learning_rate": 3.898527978751844e-05, + "loss": 0.4231609344482422, + "step": 142510 + }, + { + "epoch": 0.6118681469651306, + "grad_norm": 0.07334093749523163, + "learning_rate": 3.898096806740081e-05, + "loss": 0.10099009275436402, + "step": 142520 + }, + { + "epoch": 0.6119110790551505, + "grad_norm": 0.014422636479139328, + "learning_rate": 3.8976656347283185e-05, + "loss": 0.05562713146209717, + "step": 142530 + }, + { + "epoch": 0.6119540111451706, + "grad_norm": 5.1654253005981445, + "learning_rate": 3.897234462716556e-05, + "loss": 0.37032556533813477, + "step": 142540 + }, + { + "epoch": 0.6119969432351906, + "grad_norm": 0.010609438642859459, + "learning_rate": 3.896803290704794e-05, + "loss": 0.2431696653366089, + "step": 142550 + }, + { + "epoch": 0.6120398753252105, + "grad_norm": 0.1465774029493332, + "learning_rate": 3.896372118693031e-05, + "loss": 0.03999505937099457, + "step": 142560 + }, + { + "epoch": 0.6120828074152306, + "grad_norm": 0.014877895824611187, + "learning_rate": 3.895940946681269e-05, + "loss": 0.1237523078918457, + "step": 142570 + }, + { + "epoch": 0.6121257395052506, + "grad_norm": 1.9582277536392212, + "learning_rate": 3.8955097746695065e-05, + "loss": 0.14951646327972412, + "step": 142580 + }, + { + "epoch": 0.6121686715952706, + "grad_norm": 5.586146831512451, + "learning_rate": 3.895078602657745e-05, + "loss": 0.39240322113037107, + "step": 142590 + }, + { + "epoch": 0.6122116036852906, + "grad_norm": 0.027195017784833908, + "learning_rate": 3.8946474306459827e-05, + "loss": 0.3054402589797974, + "step": 142600 + }, + { + "epoch": 0.6122545357753106, + "grad_norm": 0.0076742833480238914, + "learning_rate": 3.89421625863422e-05, + "loss": 0.29276094436645506, + "step": 142610 + }, + { + "epoch": 0.6122974678653306, + "grad_norm": 1.3209526538848877, + "learning_rate": 3.8937850866224574e-05, + "loss": 0.18471418619155883, + "step": 142620 + }, + { + "epoch": 0.6123403999553506, + "grad_norm": 0.16654518246650696, + "learning_rate": 3.893353914610695e-05, + "loss": 0.2270416498184204, + "step": 142630 + }, + { + "epoch": 0.6123833320453707, + "grad_norm": 0.20178063213825226, + "learning_rate": 3.892922742598933e-05, + "loss": 0.22354300022125245, + "step": 142640 + }, + { + "epoch": 0.6124262641353906, + "grad_norm": 0.03456205502152443, + "learning_rate": 3.89249157058717e-05, + "loss": 0.20727345943450928, + "step": 142650 + }, + { + "epoch": 0.6124691962254106, + "grad_norm": 3.280965805053711, + "learning_rate": 3.892060398575408e-05, + "loss": 0.24969584941864015, + "step": 142660 + }, + { + "epoch": 0.6125121283154307, + "grad_norm": 0.42193174362182617, + "learning_rate": 3.8916292265636454e-05, + "loss": 0.05913207530975342, + "step": 142670 + }, + { + "epoch": 0.6125550604054507, + "grad_norm": 0.016481934115290642, + "learning_rate": 3.891198054551883e-05, + "loss": 0.14308911561965942, + "step": 142680 + }, + { + "epoch": 0.6125979924954706, + "grad_norm": 0.6766266822814941, + "learning_rate": 3.89076688254012e-05, + "loss": 0.009996140748262406, + "step": 142690 + }, + { + "epoch": 0.6126409245854907, + "grad_norm": 2.5119316577911377, + "learning_rate": 3.8903357105283586e-05, + "loss": 0.09946958422660827, + "step": 142700 + }, + { + "epoch": 0.6126838566755107, + "grad_norm": 0.35976850986480713, + "learning_rate": 3.8899045385165964e-05, + "loss": 0.18059909343719482, + "step": 142710 + }, + { + "epoch": 0.6127267887655307, + "grad_norm": 0.004872175864875317, + "learning_rate": 3.889473366504834e-05, + "loss": 0.07150285840034484, + "step": 142720 + }, + { + "epoch": 0.6127697208555507, + "grad_norm": 0.5548312067985535, + "learning_rate": 3.889042194493071e-05, + "loss": 0.17871575355529784, + "step": 142730 + }, + { + "epoch": 0.6128126529455707, + "grad_norm": 0.24687838554382324, + "learning_rate": 3.888611022481309e-05, + "loss": 0.13710942268371581, + "step": 142740 + }, + { + "epoch": 0.6128555850355907, + "grad_norm": 0.01010800525546074, + "learning_rate": 3.8881798504695466e-05, + "loss": 0.21247475147247313, + "step": 142750 + }, + { + "epoch": 0.6128985171256107, + "grad_norm": 0.0031628634314984083, + "learning_rate": 3.8877486784577844e-05, + "loss": 0.3880982160568237, + "step": 142760 + }, + { + "epoch": 0.6129414492156308, + "grad_norm": 0.043660301715135574, + "learning_rate": 3.8873175064460214e-05, + "loss": 0.3001490592956543, + "step": 142770 + }, + { + "epoch": 0.6129843813056507, + "grad_norm": 0.03375304117798805, + "learning_rate": 3.886886334434259e-05, + "loss": 0.24639558792114258, + "step": 142780 + }, + { + "epoch": 0.6130273133956707, + "grad_norm": 0.09769673645496368, + "learning_rate": 3.886455162422497e-05, + "loss": 0.2595521926879883, + "step": 142790 + }, + { + "epoch": 0.6130702454856908, + "grad_norm": 3.6060831546783447, + "learning_rate": 3.8860239904107346e-05, + "loss": 0.12040402889251708, + "step": 142800 + }, + { + "epoch": 0.6131131775757107, + "grad_norm": 0.007844222709536552, + "learning_rate": 3.8855928183989724e-05, + "loss": 0.23807475566864014, + "step": 142810 + }, + { + "epoch": 0.6131561096657308, + "grad_norm": 1.8894518613815308, + "learning_rate": 3.88516164638721e-05, + "loss": 0.2573350429534912, + "step": 142820 + }, + { + "epoch": 0.6131990417557508, + "grad_norm": 0.003673046361654997, + "learning_rate": 3.884730474375448e-05, + "loss": 0.17068955898284913, + "step": 142830 + }, + { + "epoch": 0.6132419738457707, + "grad_norm": 1.4258671998977661, + "learning_rate": 3.8842993023636856e-05, + "loss": 0.1882157564163208, + "step": 142840 + }, + { + "epoch": 0.6132849059357908, + "grad_norm": 2.0518877506256104, + "learning_rate": 3.8838681303519226e-05, + "loss": 0.12432767152786255, + "step": 142850 + }, + { + "epoch": 0.6133278380258108, + "grad_norm": 0.9698437452316284, + "learning_rate": 3.8834369583401604e-05, + "loss": 0.29598345756530764, + "step": 142860 + }, + { + "epoch": 0.6133707701158307, + "grad_norm": 0.05316584184765816, + "learning_rate": 3.883005786328398e-05, + "loss": 0.10486148595809937, + "step": 142870 + }, + { + "epoch": 0.6134137022058508, + "grad_norm": 1.7184795141220093, + "learning_rate": 3.882574614316636e-05, + "loss": 0.33672361373901366, + "step": 142880 + }, + { + "epoch": 0.6134566342958708, + "grad_norm": 0.24001701176166534, + "learning_rate": 3.882143442304873e-05, + "loss": 0.20460429191589355, + "step": 142890 + }, + { + "epoch": 0.6134995663858908, + "grad_norm": 0.5448141098022461, + "learning_rate": 3.8817122702931106e-05, + "loss": 0.1534830689430237, + "step": 142900 + }, + { + "epoch": 0.6135424984759108, + "grad_norm": 7.385931491851807, + "learning_rate": 3.8812810982813484e-05, + "loss": 0.19792778491973878, + "step": 142910 + }, + { + "epoch": 0.6135854305659308, + "grad_norm": 0.0035218922421336174, + "learning_rate": 3.880849926269586e-05, + "loss": 0.11510688066482544, + "step": 142920 + }, + { + "epoch": 0.6136283626559508, + "grad_norm": 1.7946984767913818, + "learning_rate": 3.880418754257824e-05, + "loss": 0.4070457458496094, + "step": 142930 + }, + { + "epoch": 0.6136712947459708, + "grad_norm": 8.210290908813477, + "learning_rate": 3.8799875822460616e-05, + "loss": 0.32269439697265623, + "step": 142940 + }, + { + "epoch": 0.6137142268359909, + "grad_norm": 4.748100757598877, + "learning_rate": 3.879556410234299e-05, + "loss": 0.13236244916915893, + "step": 142950 + }, + { + "epoch": 0.6137571589260108, + "grad_norm": 0.0008868540753610432, + "learning_rate": 3.879125238222537e-05, + "loss": 0.18349295854568481, + "step": 142960 + }, + { + "epoch": 0.6138000910160308, + "grad_norm": 1.8099790811538696, + "learning_rate": 3.878694066210775e-05, + "loss": 0.302026629447937, + "step": 142970 + }, + { + "epoch": 0.6138430231060509, + "grad_norm": 0.024444634094834328, + "learning_rate": 3.878262894199012e-05, + "loss": 0.053810220956802365, + "step": 142980 + }, + { + "epoch": 0.6138859551960708, + "grad_norm": 0.14125758409500122, + "learning_rate": 3.8778317221872496e-05, + "loss": 0.22748353481292724, + "step": 142990 + }, + { + "epoch": 0.6139288872860909, + "grad_norm": 1.6531537771224976, + "learning_rate": 3.877400550175487e-05, + "loss": 0.362827730178833, + "step": 143000 + }, + { + "epoch": 0.6139288872860909, + "eval_loss": 0.3971373736858368, + "eval_runtime": 27.343, + "eval_samples_per_second": 3.657, + "eval_steps_per_second": 3.657, + "step": 143000 + }, + { + "epoch": 0.6139718193761109, + "grad_norm": 0.00045808134018443525, + "learning_rate": 3.876969378163725e-05, + "loss": 0.10025212764739991, + "step": 143010 + }, + { + "epoch": 0.6140147514661308, + "grad_norm": 0.14784424006938934, + "learning_rate": 3.876538206151962e-05, + "loss": 0.11128251552581787, + "step": 143020 + }, + { + "epoch": 0.6140576835561509, + "grad_norm": 0.003462345339357853, + "learning_rate": 3.8761070341402e-05, + "loss": 0.17818193435668944, + "step": 143030 + }, + { + "epoch": 0.6141006156461709, + "grad_norm": 0.0004410098772495985, + "learning_rate": 3.8756758621284375e-05, + "loss": 0.11047359704971313, + "step": 143040 + }, + { + "epoch": 0.6141435477361908, + "grad_norm": 0.009439370594918728, + "learning_rate": 3.875244690116675e-05, + "loss": 0.12234883308410645, + "step": 143050 + }, + { + "epoch": 0.6141864798262109, + "grad_norm": 0.09685683995485306, + "learning_rate": 3.874813518104913e-05, + "loss": 0.14245309829711914, + "step": 143060 + }, + { + "epoch": 0.6142294119162309, + "grad_norm": 0.1996997743844986, + "learning_rate": 3.874382346093151e-05, + "loss": 0.20710551738739014, + "step": 143070 + }, + { + "epoch": 0.6142723440062509, + "grad_norm": 0.0035963598638772964, + "learning_rate": 3.8739511740813885e-05, + "loss": 0.31514573097229004, + "step": 143080 + }, + { + "epoch": 0.6143152760962709, + "grad_norm": 0.0031128383707255125, + "learning_rate": 3.873520002069626e-05, + "loss": 0.10121103525161743, + "step": 143090 + }, + { + "epoch": 0.6143582081862909, + "grad_norm": 0.004904964007437229, + "learning_rate": 3.873088830057863e-05, + "loss": 0.32856738567352295, + "step": 143100 + }, + { + "epoch": 0.614401140276311, + "grad_norm": 0.7072336673736572, + "learning_rate": 3.872657658046101e-05, + "loss": 0.03897510170936584, + "step": 143110 + }, + { + "epoch": 0.6144440723663309, + "grad_norm": 0.01982838474214077, + "learning_rate": 3.872226486034339e-05, + "loss": 0.24557275772094728, + "step": 143120 + }, + { + "epoch": 0.614487004456351, + "grad_norm": 0.00509743532165885, + "learning_rate": 3.8717953140225765e-05, + "loss": 0.1875273823738098, + "step": 143130 + }, + { + "epoch": 0.614529936546371, + "grad_norm": 0.0008203135221265256, + "learning_rate": 3.8713641420108135e-05, + "loss": 0.21112377643585206, + "step": 143140 + }, + { + "epoch": 0.6145728686363909, + "grad_norm": 0.007646126672625542, + "learning_rate": 3.870932969999051e-05, + "loss": 0.13793174028396607, + "step": 143150 + }, + { + "epoch": 0.614615800726411, + "grad_norm": 0.0029893110040575266, + "learning_rate": 3.870501797987289e-05, + "loss": 0.1003826379776001, + "step": 143160 + }, + { + "epoch": 0.614658732816431, + "grad_norm": 0.0017700279131531715, + "learning_rate": 3.870070625975527e-05, + "loss": 0.12775927782058716, + "step": 143170 + }, + { + "epoch": 0.614701664906451, + "grad_norm": 0.017108073458075523, + "learning_rate": 3.8696394539637645e-05, + "loss": 0.12151342630386353, + "step": 143180 + }, + { + "epoch": 0.614744596996471, + "grad_norm": 0.08749516308307648, + "learning_rate": 3.869208281952002e-05, + "loss": 0.08751832246780396, + "step": 143190 + }, + { + "epoch": 0.614787529086491, + "grad_norm": 0.14753209054470062, + "learning_rate": 3.86877710994024e-05, + "loss": 0.220550799369812, + "step": 143200 + }, + { + "epoch": 0.614830461176511, + "grad_norm": 7.232389450073242, + "learning_rate": 3.868345937928478e-05, + "loss": 0.2188883066177368, + "step": 143210 + }, + { + "epoch": 0.614873393266531, + "grad_norm": 0.18595461547374725, + "learning_rate": 3.867914765916715e-05, + "loss": 0.263457727432251, + "step": 143220 + }, + { + "epoch": 0.614916325356551, + "grad_norm": 0.011233457364141941, + "learning_rate": 3.8674835939049525e-05, + "loss": 0.22754213809967042, + "step": 143230 + }, + { + "epoch": 0.614959257446571, + "grad_norm": 2.888434886932373, + "learning_rate": 3.86705242189319e-05, + "loss": 0.29729480743408204, + "step": 143240 + }, + { + "epoch": 0.615002189536591, + "grad_norm": 0.039356231689453125, + "learning_rate": 3.866621249881428e-05, + "loss": 0.1607893705368042, + "step": 143250 + }, + { + "epoch": 0.6150451216266111, + "grad_norm": 1.696097731590271, + "learning_rate": 3.866190077869665e-05, + "loss": 0.05476242899894714, + "step": 143260 + }, + { + "epoch": 0.615088053716631, + "grad_norm": 0.11227918416261673, + "learning_rate": 3.865758905857903e-05, + "loss": 0.3089680433273315, + "step": 143270 + }, + { + "epoch": 0.615130985806651, + "grad_norm": 20.85646629333496, + "learning_rate": 3.8653277338461405e-05, + "loss": 0.1662601947784424, + "step": 143280 + }, + { + "epoch": 0.6151739178966711, + "grad_norm": 0.04491984099149704, + "learning_rate": 3.864896561834379e-05, + "loss": 0.27137112617492676, + "step": 143290 + }, + { + "epoch": 0.615216849986691, + "grad_norm": 0.0005721793859265745, + "learning_rate": 3.864465389822616e-05, + "loss": 0.2376784563064575, + "step": 143300 + }, + { + "epoch": 0.6152597820767111, + "grad_norm": 0.027530111372470856, + "learning_rate": 3.8640342178108537e-05, + "loss": 0.26707456111907957, + "step": 143310 + }, + { + "epoch": 0.6153027141667311, + "grad_norm": 0.008097376674413681, + "learning_rate": 3.8636030457990914e-05, + "loss": 0.15281684398651124, + "step": 143320 + }, + { + "epoch": 0.615345646256751, + "grad_norm": 18.04004669189453, + "learning_rate": 3.863171873787329e-05, + "loss": 0.1304473400115967, + "step": 143330 + }, + { + "epoch": 0.6153885783467711, + "grad_norm": 6.95366096496582, + "learning_rate": 3.862740701775567e-05, + "loss": 0.2441573143005371, + "step": 143340 + }, + { + "epoch": 0.6154315104367911, + "grad_norm": 10.125812530517578, + "learning_rate": 3.862309529763804e-05, + "loss": 0.13682811260223388, + "step": 143350 + }, + { + "epoch": 0.615474442526811, + "grad_norm": 7.338260650634766, + "learning_rate": 3.8618783577520417e-05, + "loss": 0.21427173614501954, + "step": 143360 + }, + { + "epoch": 0.6155173746168311, + "grad_norm": 0.017023073509335518, + "learning_rate": 3.8614471857402794e-05, + "loss": 0.16689443588256836, + "step": 143370 + }, + { + "epoch": 0.6155603067068511, + "grad_norm": 0.33701053261756897, + "learning_rate": 3.861016013728517e-05, + "loss": 0.25445683002471925, + "step": 143380 + }, + { + "epoch": 0.6156032387968711, + "grad_norm": 5.320155143737793, + "learning_rate": 3.860584841716754e-05, + "loss": 0.23360719680786132, + "step": 143390 + }, + { + "epoch": 0.6156461708868911, + "grad_norm": 1.3005390167236328, + "learning_rate": 3.8601536697049926e-05, + "loss": 0.34432909488677976, + "step": 143400 + }, + { + "epoch": 0.6156891029769112, + "grad_norm": 0.03673168644309044, + "learning_rate": 3.85972249769323e-05, + "loss": 0.1609882354736328, + "step": 143410 + }, + { + "epoch": 0.6157320350669311, + "grad_norm": 34.08330154418945, + "learning_rate": 3.859291325681468e-05, + "loss": 0.2362835168838501, + "step": 143420 + }, + { + "epoch": 0.6157749671569511, + "grad_norm": 0.06983064115047455, + "learning_rate": 3.858860153669705e-05, + "loss": 0.2938406229019165, + "step": 143430 + }, + { + "epoch": 0.6158178992469712, + "grad_norm": 0.055000558495521545, + "learning_rate": 3.858428981657943e-05, + "loss": 0.39708521366119387, + "step": 143440 + }, + { + "epoch": 0.6158608313369911, + "grad_norm": 0.011660448275506496, + "learning_rate": 3.8579978096461806e-05, + "loss": 0.15488802194595336, + "step": 143450 + }, + { + "epoch": 0.6159037634270111, + "grad_norm": 0.36187925934791565, + "learning_rate": 3.857566637634418e-05, + "loss": 0.11263597011566162, + "step": 143460 + }, + { + "epoch": 0.6159466955170312, + "grad_norm": 4.819761276245117, + "learning_rate": 3.8571354656226554e-05, + "loss": 0.2282076358795166, + "step": 143470 + }, + { + "epoch": 0.6159896276070511, + "grad_norm": 0.590923547744751, + "learning_rate": 3.856704293610893e-05, + "loss": 0.15642644166946412, + "step": 143480 + }, + { + "epoch": 0.6160325596970712, + "grad_norm": 0.0005970151396468282, + "learning_rate": 3.856273121599131e-05, + "loss": 0.21688811779022216, + "step": 143490 + }, + { + "epoch": 0.6160754917870912, + "grad_norm": 1.820595622062683, + "learning_rate": 3.8558419495873686e-05, + "loss": 0.22994556427001953, + "step": 143500 + }, + { + "epoch": 0.6161184238771111, + "grad_norm": 0.025294188410043716, + "learning_rate": 3.855410777575606e-05, + "loss": 0.14468964338302612, + "step": 143510 + }, + { + "epoch": 0.6161613559671312, + "grad_norm": 1.5548747777938843, + "learning_rate": 3.854979605563844e-05, + "loss": 0.16419532299041747, + "step": 143520 + }, + { + "epoch": 0.6162042880571512, + "grad_norm": 0.6060371398925781, + "learning_rate": 3.854548433552082e-05, + "loss": 0.2152492046356201, + "step": 143530 + }, + { + "epoch": 0.6162472201471713, + "grad_norm": 0.6445576548576355, + "learning_rate": 3.8541172615403195e-05, + "loss": 0.2481471061706543, + "step": 143540 + }, + { + "epoch": 0.6162901522371912, + "grad_norm": 13.752565383911133, + "learning_rate": 3.8536860895285566e-05, + "loss": 0.19086780548095703, + "step": 143550 + }, + { + "epoch": 0.6163330843272112, + "grad_norm": 0.0020951321348547935, + "learning_rate": 3.853254917516794e-05, + "loss": 0.27932713031768797, + "step": 143560 + }, + { + "epoch": 0.6163760164172313, + "grad_norm": 0.6050500273704529, + "learning_rate": 3.852823745505032e-05, + "loss": 0.29224677085876466, + "step": 143570 + }, + { + "epoch": 0.6164189485072512, + "grad_norm": 0.04540263116359711, + "learning_rate": 3.85239257349327e-05, + "loss": 0.3663180828094482, + "step": 143580 + }, + { + "epoch": 0.6164618805972713, + "grad_norm": 0.041277043521404266, + "learning_rate": 3.851961401481507e-05, + "loss": 0.14575035572052003, + "step": 143590 + }, + { + "epoch": 0.6165048126872913, + "grad_norm": 0.7824796438217163, + "learning_rate": 3.8515302294697446e-05, + "loss": 0.2100764751434326, + "step": 143600 + }, + { + "epoch": 0.6165477447773112, + "grad_norm": 0.005907750688493252, + "learning_rate": 3.851099057457982e-05, + "loss": 0.11084829568862915, + "step": 143610 + }, + { + "epoch": 0.6165906768673313, + "grad_norm": 0.744647741317749, + "learning_rate": 3.85066788544622e-05, + "loss": 0.26470563411712644, + "step": 143620 + }, + { + "epoch": 0.6166336089573513, + "grad_norm": 2.1389269828796387, + "learning_rate": 3.850236713434458e-05, + "loss": 0.21109626293182374, + "step": 143630 + }, + { + "epoch": 0.6166765410473712, + "grad_norm": 5.421672821044922, + "learning_rate": 3.8498055414226955e-05, + "loss": 0.28545246124267576, + "step": 143640 + }, + { + "epoch": 0.6167194731373913, + "grad_norm": 2.370431900024414, + "learning_rate": 3.849374369410933e-05, + "loss": 0.06665264964103698, + "step": 143650 + }, + { + "epoch": 0.6167624052274113, + "grad_norm": 0.0015999609604477882, + "learning_rate": 3.848943197399171e-05, + "loss": 0.3594416618347168, + "step": 143660 + }, + { + "epoch": 0.6168053373174313, + "grad_norm": 0.9188343286514282, + "learning_rate": 3.848512025387408e-05, + "loss": 0.10321264266967774, + "step": 143670 + }, + { + "epoch": 0.6168482694074513, + "grad_norm": 3.1624345779418945, + "learning_rate": 3.848080853375646e-05, + "loss": 0.28934388160705565, + "step": 143680 + }, + { + "epoch": 0.6168912014974713, + "grad_norm": 1.9765626192092896, + "learning_rate": 3.8476496813638835e-05, + "loss": 0.35861854553222655, + "step": 143690 + }, + { + "epoch": 0.6169341335874913, + "grad_norm": 5.686208248138428, + "learning_rate": 3.847218509352121e-05, + "loss": 0.17112207412719727, + "step": 143700 + }, + { + "epoch": 0.6169770656775113, + "grad_norm": 0.000594784040004015, + "learning_rate": 3.846787337340359e-05, + "loss": 0.1451743721961975, + "step": 143710 + }, + { + "epoch": 0.6170199977675314, + "grad_norm": 1.208206295967102, + "learning_rate": 3.846356165328596e-05, + "loss": 0.19009263515472413, + "step": 143720 + }, + { + "epoch": 0.6170629298575513, + "grad_norm": 1.7347217798233032, + "learning_rate": 3.845924993316834e-05, + "loss": 0.4419961452484131, + "step": 143730 + }, + { + "epoch": 0.6171058619475713, + "grad_norm": 0.16214624047279358, + "learning_rate": 3.8454938213050715e-05, + "loss": 0.26940882205963135, + "step": 143740 + }, + { + "epoch": 0.6171487940375914, + "grad_norm": 0.03651418536901474, + "learning_rate": 3.845062649293309e-05, + "loss": 0.39347841739654543, + "step": 143750 + }, + { + "epoch": 0.6171917261276113, + "grad_norm": 43.0539436340332, + "learning_rate": 3.844631477281547e-05, + "loss": 0.14974191188812255, + "step": 143760 + }, + { + "epoch": 0.6172346582176313, + "grad_norm": 0.0016830979147925973, + "learning_rate": 3.844200305269785e-05, + "loss": 0.2485567808151245, + "step": 143770 + }, + { + "epoch": 0.6172775903076514, + "grad_norm": 12.779747009277344, + "learning_rate": 3.8437691332580224e-05, + "loss": 0.16967169046401978, + "step": 143780 + }, + { + "epoch": 0.6173205223976713, + "grad_norm": 3.1827280521392822, + "learning_rate": 3.84333796124626e-05, + "loss": 0.3932061672210693, + "step": 143790 + }, + { + "epoch": 0.6173634544876914, + "grad_norm": 0.009968830272555351, + "learning_rate": 3.842906789234497e-05, + "loss": 0.10252895355224609, + "step": 143800 + }, + { + "epoch": 0.6174063865777114, + "grad_norm": 0.02346714586019516, + "learning_rate": 3.842475617222735e-05, + "loss": 0.45845975875854494, + "step": 143810 + }, + { + "epoch": 0.6174493186677313, + "grad_norm": 0.45332077145576477, + "learning_rate": 3.842044445210973e-05, + "loss": 0.19608376026153565, + "step": 143820 + }, + { + "epoch": 0.6174922507577514, + "grad_norm": 0.005778151098638773, + "learning_rate": 3.8416132731992104e-05, + "loss": 0.013501368463039398, + "step": 143830 + }, + { + "epoch": 0.6175351828477714, + "grad_norm": 0.03759386017918587, + "learning_rate": 3.8411821011874475e-05, + "loss": 0.12773208618164061, + "step": 143840 + }, + { + "epoch": 0.6175781149377914, + "grad_norm": 0.18952840566635132, + "learning_rate": 3.840750929175685e-05, + "loss": 0.2644594669342041, + "step": 143850 + }, + { + "epoch": 0.6176210470278114, + "grad_norm": 0.5408098101615906, + "learning_rate": 3.840319757163923e-05, + "loss": 0.10454925298690795, + "step": 143860 + }, + { + "epoch": 0.6176639791178314, + "grad_norm": 0.0044852206483483315, + "learning_rate": 3.839888585152161e-05, + "loss": 0.19259582757949828, + "step": 143870 + }, + { + "epoch": 0.6177069112078514, + "grad_norm": 1.8733210563659668, + "learning_rate": 3.8394574131403984e-05, + "loss": 0.1476469397544861, + "step": 143880 + }, + { + "epoch": 0.6177498432978714, + "grad_norm": 1.6489322185516357, + "learning_rate": 3.839026241128636e-05, + "loss": 0.44382553100585936, + "step": 143890 + }, + { + "epoch": 0.6177927753878915, + "grad_norm": 1.372307538986206, + "learning_rate": 3.838595069116874e-05, + "loss": 0.15867252349853517, + "step": 143900 + }, + { + "epoch": 0.6178357074779114, + "grad_norm": 0.27712729573249817, + "learning_rate": 3.8381638971051116e-05, + "loss": 0.3588222026824951, + "step": 143910 + }, + { + "epoch": 0.6178786395679314, + "grad_norm": 1.5185275077819824, + "learning_rate": 3.837732725093349e-05, + "loss": 0.24487559795379638, + "step": 143920 + }, + { + "epoch": 0.6179215716579515, + "grad_norm": 2.4732537269592285, + "learning_rate": 3.8373015530815864e-05, + "loss": 0.23896138668060302, + "step": 143930 + }, + { + "epoch": 0.6179645037479714, + "grad_norm": 4.313063621520996, + "learning_rate": 3.836870381069824e-05, + "loss": 0.23469119071960448, + "step": 143940 + }, + { + "epoch": 0.6180074358379914, + "grad_norm": 0.010658573359251022, + "learning_rate": 3.836439209058062e-05, + "loss": 0.061583518981933594, + "step": 143950 + }, + { + "epoch": 0.6180503679280115, + "grad_norm": 0.14485430717468262, + "learning_rate": 3.836008037046299e-05, + "loss": 0.20397756099700928, + "step": 143960 + }, + { + "epoch": 0.6180933000180315, + "grad_norm": 1.583567500114441, + "learning_rate": 3.835576865034537e-05, + "loss": 0.3420498609542847, + "step": 143970 + }, + { + "epoch": 0.6181362321080515, + "grad_norm": 0.0020854922477155924, + "learning_rate": 3.8351456930227744e-05, + "loss": 0.14520049095153809, + "step": 143980 + }, + { + "epoch": 0.6181791641980715, + "grad_norm": 0.9290441870689392, + "learning_rate": 3.834714521011012e-05, + "loss": 0.20561542510986328, + "step": 143990 + }, + { + "epoch": 0.6182220962880915, + "grad_norm": 0.0007875105366110802, + "learning_rate": 3.83428334899925e-05, + "loss": 0.27081058025360105, + "step": 144000 + }, + { + "epoch": 0.6182220962880915, + "eval_loss": 0.4025852084159851, + "eval_runtime": 27.3094, + "eval_samples_per_second": 3.662, + "eval_steps_per_second": 3.662, + "step": 144000 + }, + { + "epoch": 0.6182650283781115, + "grad_norm": 0.1286296844482422, + "learning_rate": 3.8338521769874876e-05, + "loss": 0.1636170744895935, + "step": 144010 + }, + { + "epoch": 0.6183079604681315, + "grad_norm": 0.0414075031876564, + "learning_rate": 3.8334210049757253e-05, + "loss": 0.19566885232925416, + "step": 144020 + }, + { + "epoch": 0.6183508925581516, + "grad_norm": 3.047715425491333, + "learning_rate": 3.832989832963963e-05, + "loss": 0.35408968925476075, + "step": 144030 + }, + { + "epoch": 0.6183938246481715, + "grad_norm": 0.16599392890930176, + "learning_rate": 3.832558660952201e-05, + "loss": 0.1797704815864563, + "step": 144040 + }, + { + "epoch": 0.6184367567381915, + "grad_norm": 0.008573425933718681, + "learning_rate": 3.832127488940438e-05, + "loss": 0.28909251689910886, + "step": 144050 + }, + { + "epoch": 0.6184796888282116, + "grad_norm": 1.5337269306182861, + "learning_rate": 3.8316963169286756e-05, + "loss": 0.19386523962020874, + "step": 144060 + }, + { + "epoch": 0.6185226209182315, + "grad_norm": 0.03713132068514824, + "learning_rate": 3.831265144916913e-05, + "loss": 0.16312804222106933, + "step": 144070 + }, + { + "epoch": 0.6185655530082516, + "grad_norm": 0.3644580543041229, + "learning_rate": 3.830833972905151e-05, + "loss": 0.2069899320602417, + "step": 144080 + }, + { + "epoch": 0.6186084850982716, + "grad_norm": 0.4827655553817749, + "learning_rate": 3.830402800893388e-05, + "loss": 0.11150888204574586, + "step": 144090 + }, + { + "epoch": 0.6186514171882915, + "grad_norm": 2.00545072555542, + "learning_rate": 3.829971628881626e-05, + "loss": 0.04419018030166626, + "step": 144100 + }, + { + "epoch": 0.6186943492783116, + "grad_norm": 1.2583513259887695, + "learning_rate": 3.829540456869864e-05, + "loss": 0.15232290029525758, + "step": 144110 + }, + { + "epoch": 0.6187372813683316, + "grad_norm": 0.6249735951423645, + "learning_rate": 3.829109284858102e-05, + "loss": 0.22450728416442872, + "step": 144120 + }, + { + "epoch": 0.6187802134583515, + "grad_norm": 3.7458887100219727, + "learning_rate": 3.828678112846339e-05, + "loss": 0.2486107349395752, + "step": 144130 + }, + { + "epoch": 0.6188231455483716, + "grad_norm": 0.007021801546216011, + "learning_rate": 3.828246940834577e-05, + "loss": 0.11499238014221191, + "step": 144140 + }, + { + "epoch": 0.6188660776383916, + "grad_norm": 3.2239348888397217, + "learning_rate": 3.8278157688228145e-05, + "loss": 0.19586840867996216, + "step": 144150 + }, + { + "epoch": 0.6189090097284116, + "grad_norm": 1.8541079759597778, + "learning_rate": 3.827384596811052e-05, + "loss": 0.04876286089420319, + "step": 144160 + }, + { + "epoch": 0.6189519418184316, + "grad_norm": 0.020189929753541946, + "learning_rate": 3.826953424799289e-05, + "loss": 0.18910752534866332, + "step": 144170 + }, + { + "epoch": 0.6189948739084516, + "grad_norm": 0.11692792177200317, + "learning_rate": 3.826522252787527e-05, + "loss": 0.06171929240226746, + "step": 144180 + }, + { + "epoch": 0.6190378059984716, + "grad_norm": 0.5069569945335388, + "learning_rate": 3.826091080775765e-05, + "loss": 0.10949817895889283, + "step": 144190 + }, + { + "epoch": 0.6190807380884916, + "grad_norm": 0.0024032050278037786, + "learning_rate": 3.8256599087640025e-05, + "loss": 0.3095686435699463, + "step": 144200 + }, + { + "epoch": 0.6191236701785117, + "grad_norm": 1.413897156715393, + "learning_rate": 3.8252287367522396e-05, + "loss": 0.5190535545349121, + "step": 144210 + }, + { + "epoch": 0.6191666022685316, + "grad_norm": 0.0012539130402728915, + "learning_rate": 3.824797564740478e-05, + "loss": 0.08090531826019287, + "step": 144220 + }, + { + "epoch": 0.6192095343585516, + "grad_norm": 0.18290802836418152, + "learning_rate": 3.824366392728716e-05, + "loss": 0.20504300594329833, + "step": 144230 + }, + { + "epoch": 0.6192524664485717, + "grad_norm": 0.0021957934368401766, + "learning_rate": 3.8239352207169535e-05, + "loss": 0.2403766393661499, + "step": 144240 + }, + { + "epoch": 0.6192953985385916, + "grad_norm": 0.006666228640824556, + "learning_rate": 3.8235040487051905e-05, + "loss": 0.3289052963256836, + "step": 144250 + }, + { + "epoch": 0.6193383306286117, + "grad_norm": 0.10093910992145538, + "learning_rate": 3.823072876693428e-05, + "loss": 0.07276955842971802, + "step": 144260 + }, + { + "epoch": 0.6193812627186317, + "grad_norm": 0.022097958251833916, + "learning_rate": 3.822641704681666e-05, + "loss": 0.21653876304626465, + "step": 144270 + }, + { + "epoch": 0.6194241948086516, + "grad_norm": 1.6240912675857544, + "learning_rate": 3.822210532669904e-05, + "loss": 0.15800448656082153, + "step": 144280 + }, + { + "epoch": 0.6194671268986717, + "grad_norm": 2.436265468597412, + "learning_rate": 3.821779360658141e-05, + "loss": 0.21124651432037353, + "step": 144290 + }, + { + "epoch": 0.6195100589886917, + "grad_norm": 1.256800651550293, + "learning_rate": 3.8213481886463785e-05, + "loss": 0.21225075721740722, + "step": 144300 + }, + { + "epoch": 0.6195529910787116, + "grad_norm": 10.628231048583984, + "learning_rate": 3.820917016634616e-05, + "loss": 0.22397346496582032, + "step": 144310 + }, + { + "epoch": 0.6195959231687317, + "grad_norm": 0.10220340639352798, + "learning_rate": 3.820485844622854e-05, + "loss": 0.2670298099517822, + "step": 144320 + }, + { + "epoch": 0.6196388552587517, + "grad_norm": 0.010059397667646408, + "learning_rate": 3.820054672611092e-05, + "loss": 0.17682617902755737, + "step": 144330 + }, + { + "epoch": 0.6196817873487717, + "grad_norm": 0.3097505271434784, + "learning_rate": 3.8196235005993294e-05, + "loss": 0.16717482805252076, + "step": 144340 + }, + { + "epoch": 0.6197247194387917, + "grad_norm": 0.4097038805484772, + "learning_rate": 3.819192328587567e-05, + "loss": 0.3388197422027588, + "step": 144350 + }, + { + "epoch": 0.6197676515288117, + "grad_norm": 10.982796669006348, + "learning_rate": 3.818761156575805e-05, + "loss": 0.04242531061172485, + "step": 144360 + }, + { + "epoch": 0.6198105836188317, + "grad_norm": 0.06076984852552414, + "learning_rate": 3.818329984564042e-05, + "loss": 0.00288843959569931, + "step": 144370 + }, + { + "epoch": 0.6198535157088517, + "grad_norm": 0.003789462149143219, + "learning_rate": 3.81789881255228e-05, + "loss": 0.15669078826904298, + "step": 144380 + }, + { + "epoch": 0.6198964477988718, + "grad_norm": 0.7226512432098389, + "learning_rate": 3.8174676405405174e-05, + "loss": 0.1839470624923706, + "step": 144390 + }, + { + "epoch": 0.6199393798888918, + "grad_norm": 3.0010428428649902, + "learning_rate": 3.817036468528755e-05, + "loss": 0.057888460159301755, + "step": 144400 + }, + { + "epoch": 0.6199823119789117, + "grad_norm": 2.917448043823242, + "learning_rate": 3.816605296516993e-05, + "loss": 0.28990559577941893, + "step": 144410 + }, + { + "epoch": 0.6200252440689318, + "grad_norm": 0.002920327242463827, + "learning_rate": 3.81617412450523e-05, + "loss": 0.07854058742523193, + "step": 144420 + }, + { + "epoch": 0.6200681761589518, + "grad_norm": 3.3465566635131836, + "learning_rate": 3.815742952493468e-05, + "loss": 0.20734431743621826, + "step": 144430 + }, + { + "epoch": 0.6201111082489718, + "grad_norm": 10.82240104675293, + "learning_rate": 3.8153117804817054e-05, + "loss": 0.12018462419509887, + "step": 144440 + }, + { + "epoch": 0.6201540403389918, + "grad_norm": 0.002988159190863371, + "learning_rate": 3.814880608469943e-05, + "loss": 0.20601401329040528, + "step": 144450 + }, + { + "epoch": 0.6201969724290118, + "grad_norm": 1.9121832847595215, + "learning_rate": 3.814449436458181e-05, + "loss": 0.2810624122619629, + "step": 144460 + }, + { + "epoch": 0.6202399045190318, + "grad_norm": 0.005302377510815859, + "learning_rate": 3.8140182644464186e-05, + "loss": 0.3187835931777954, + "step": 144470 + }, + { + "epoch": 0.6202828366090518, + "grad_norm": 1.602386236190796, + "learning_rate": 3.8135870924346564e-05, + "loss": 0.07800792455673218, + "step": 144480 + }, + { + "epoch": 0.6203257686990719, + "grad_norm": 0.007031735498458147, + "learning_rate": 3.813155920422894e-05, + "loss": 0.006209475174546242, + "step": 144490 + }, + { + "epoch": 0.6203687007890918, + "grad_norm": 1.0654966831207275, + "learning_rate": 3.812724748411131e-05, + "loss": 0.1540340781211853, + "step": 144500 + }, + { + "epoch": 0.6204116328791118, + "grad_norm": 4.626923561096191, + "learning_rate": 3.812293576399369e-05, + "loss": 0.2106240510940552, + "step": 144510 + }, + { + "epoch": 0.6204545649691319, + "grad_norm": 0.20285534858703613, + "learning_rate": 3.8118624043876066e-05, + "loss": 0.3320075273513794, + "step": 144520 + }, + { + "epoch": 0.6204974970591518, + "grad_norm": 0.0030227135866880417, + "learning_rate": 3.8114312323758444e-05, + "loss": 0.13404037952423095, + "step": 144530 + }, + { + "epoch": 0.6205404291491718, + "grad_norm": 0.22634758055210114, + "learning_rate": 3.8110000603640814e-05, + "loss": 0.0774927020072937, + "step": 144540 + }, + { + "epoch": 0.6205833612391919, + "grad_norm": 2.493731737136841, + "learning_rate": 3.810568888352319e-05, + "loss": 0.09815502762794495, + "step": 144550 + }, + { + "epoch": 0.6206262933292118, + "grad_norm": 0.0005603001336567104, + "learning_rate": 3.810137716340557e-05, + "loss": 0.12804954051971434, + "step": 144560 + }, + { + "epoch": 0.6206692254192319, + "grad_norm": 0.03851265087723732, + "learning_rate": 3.8097065443287946e-05, + "loss": 0.11833794116973877, + "step": 144570 + }, + { + "epoch": 0.6207121575092519, + "grad_norm": 0.00021021152497269213, + "learning_rate": 3.8092753723170324e-05, + "loss": 0.11247782707214356, + "step": 144580 + }, + { + "epoch": 0.6207550895992718, + "grad_norm": 6.465201377868652, + "learning_rate": 3.80884420030527e-05, + "loss": 0.3022731304168701, + "step": 144590 + }, + { + "epoch": 0.6207980216892919, + "grad_norm": 0.001970832934603095, + "learning_rate": 3.808413028293508e-05, + "loss": 0.26380870342254636, + "step": 144600 + }, + { + "epoch": 0.6208409537793119, + "grad_norm": 0.009534573182463646, + "learning_rate": 3.8079818562817456e-05, + "loss": 0.0802575707435608, + "step": 144610 + }, + { + "epoch": 0.6208838858693319, + "grad_norm": 0.6005143523216248, + "learning_rate": 3.8075506842699826e-05, + "loss": 0.0741170346736908, + "step": 144620 + }, + { + "epoch": 0.6209268179593519, + "grad_norm": 0.0028213628102093935, + "learning_rate": 3.8071195122582204e-05, + "loss": 0.307177472114563, + "step": 144630 + }, + { + "epoch": 0.6209697500493719, + "grad_norm": 0.0034725700970739126, + "learning_rate": 3.806688340246458e-05, + "loss": 0.0965348243713379, + "step": 144640 + }, + { + "epoch": 0.6210126821393919, + "grad_norm": 1.9992482662200928, + "learning_rate": 3.806257168234696e-05, + "loss": 0.1916975736618042, + "step": 144650 + }, + { + "epoch": 0.6210556142294119, + "grad_norm": 0.4276776611804962, + "learning_rate": 3.805825996222933e-05, + "loss": 0.34029054641723633, + "step": 144660 + }, + { + "epoch": 0.621098546319432, + "grad_norm": 1.4472237825393677, + "learning_rate": 3.8053948242111706e-05, + "loss": 0.08610463738441468, + "step": 144670 + }, + { + "epoch": 0.6211414784094519, + "grad_norm": 0.8272998332977295, + "learning_rate": 3.8049636521994083e-05, + "loss": 0.387483549118042, + "step": 144680 + }, + { + "epoch": 0.6211844104994719, + "grad_norm": 0.0029743912164121866, + "learning_rate": 3.804532480187646e-05, + "loss": 0.22995834350585936, + "step": 144690 + }, + { + "epoch": 0.621227342589492, + "grad_norm": 3.1174747943878174, + "learning_rate": 3.804101308175884e-05, + "loss": 0.13578439950942994, + "step": 144700 + }, + { + "epoch": 0.6212702746795119, + "grad_norm": 0.009533915668725967, + "learning_rate": 3.8036701361641216e-05, + "loss": 0.22146253585815429, + "step": 144710 + }, + { + "epoch": 0.621313206769532, + "grad_norm": 0.03149215504527092, + "learning_rate": 3.803238964152359e-05, + "loss": 0.2577746152877808, + "step": 144720 + }, + { + "epoch": 0.621356138859552, + "grad_norm": 1.5252588987350464, + "learning_rate": 3.802807792140597e-05, + "loss": 0.30984480381011964, + "step": 144730 + }, + { + "epoch": 0.6213990709495719, + "grad_norm": 0.17621318995952606, + "learning_rate": 3.802376620128834e-05, + "loss": 0.16381561756134033, + "step": 144740 + }, + { + "epoch": 0.621442003039592, + "grad_norm": 0.004886439070105553, + "learning_rate": 3.801945448117072e-05, + "loss": 0.11476737260818481, + "step": 144750 + }, + { + "epoch": 0.621484935129612, + "grad_norm": 1.7159773111343384, + "learning_rate": 3.8015142761053095e-05, + "loss": 0.13346294164657593, + "step": 144760 + }, + { + "epoch": 0.6215278672196319, + "grad_norm": 0.4524969756603241, + "learning_rate": 3.801083104093547e-05, + "loss": 0.28767459392547606, + "step": 144770 + }, + { + "epoch": 0.621570799309652, + "grad_norm": 0.0026604910381138325, + "learning_rate": 3.800651932081785e-05, + "loss": 0.25750012397766114, + "step": 144780 + }, + { + "epoch": 0.621613731399672, + "grad_norm": 5.111704349517822, + "learning_rate": 3.800220760070022e-05, + "loss": 0.3053690195083618, + "step": 144790 + }, + { + "epoch": 0.621656663489692, + "grad_norm": 1.1439964771270752, + "learning_rate": 3.79978958805826e-05, + "loss": 0.21157233715057372, + "step": 144800 + }, + { + "epoch": 0.621699595579712, + "grad_norm": 0.010473281145095825, + "learning_rate": 3.799358416046498e-05, + "loss": 0.32777354717254636, + "step": 144810 + }, + { + "epoch": 0.621742527669732, + "grad_norm": 1.7217943668365479, + "learning_rate": 3.798927244034736e-05, + "loss": 0.09053964614868164, + "step": 144820 + }, + { + "epoch": 0.6217854597597521, + "grad_norm": 0.032146602869033813, + "learning_rate": 3.798496072022973e-05, + "loss": 0.33871643543243407, + "step": 144830 + }, + { + "epoch": 0.621828391849772, + "grad_norm": 1.690449595451355, + "learning_rate": 3.798064900011211e-05, + "loss": 0.22544546127319337, + "step": 144840 + }, + { + "epoch": 0.621871323939792, + "grad_norm": 0.03286479413509369, + "learning_rate": 3.7976337279994485e-05, + "loss": 0.2782609224319458, + "step": 144850 + }, + { + "epoch": 0.6219142560298121, + "grad_norm": 0.22668150067329407, + "learning_rate": 3.797202555987686e-05, + "loss": 0.09379579424858094, + "step": 144860 + }, + { + "epoch": 0.621957188119832, + "grad_norm": 1.2573636770248413, + "learning_rate": 3.796771383975923e-05, + "loss": 0.26519174575805665, + "step": 144870 + }, + { + "epoch": 0.6220001202098521, + "grad_norm": 0.35459011793136597, + "learning_rate": 3.796340211964161e-05, + "loss": 0.2879498958587646, + "step": 144880 + }, + { + "epoch": 0.6220430522998721, + "grad_norm": 1.8405866622924805, + "learning_rate": 3.795909039952399e-05, + "loss": 0.42113757133483887, + "step": 144890 + }, + { + "epoch": 0.622085984389892, + "grad_norm": 0.3268384635448456, + "learning_rate": 3.7954778679406365e-05, + "loss": 0.21574442386627196, + "step": 144900 + }, + { + "epoch": 0.6221289164799121, + "grad_norm": 0.0016373234102502465, + "learning_rate": 3.7950466959288735e-05, + "loss": 0.26897358894348145, + "step": 144910 + }, + { + "epoch": 0.6221718485699321, + "grad_norm": 0.0009527649381197989, + "learning_rate": 3.794615523917112e-05, + "loss": 0.29565980434417727, + "step": 144920 + }, + { + "epoch": 0.6222147806599521, + "grad_norm": 1.7517390251159668, + "learning_rate": 3.79418435190535e-05, + "loss": 0.07204756736755372, + "step": 144930 + }, + { + "epoch": 0.6222577127499721, + "grad_norm": 2.711318016052246, + "learning_rate": 3.7937531798935874e-05, + "loss": 0.4752767562866211, + "step": 144940 + }, + { + "epoch": 0.6223006448399921, + "grad_norm": 1.5553025007247925, + "learning_rate": 3.7933220078818245e-05, + "loss": 0.21367301940917968, + "step": 144950 + }, + { + "epoch": 0.6223435769300121, + "grad_norm": 0.0016237174859270453, + "learning_rate": 3.792890835870062e-05, + "loss": 0.08536785244941711, + "step": 144960 + }, + { + "epoch": 0.6223865090200321, + "grad_norm": 0.015023069456219673, + "learning_rate": 3.7924596638583e-05, + "loss": 0.26001734733581544, + "step": 144970 + }, + { + "epoch": 0.6224294411100522, + "grad_norm": 0.001799068064428866, + "learning_rate": 3.792028491846538e-05, + "loss": 0.193844997882843, + "step": 144980 + }, + { + "epoch": 0.6224723732000721, + "grad_norm": 0.15669679641723633, + "learning_rate": 3.791597319834775e-05, + "loss": 0.1980237603187561, + "step": 144990 + }, + { + "epoch": 0.6225153052900921, + "grad_norm": 0.05634564161300659, + "learning_rate": 3.7911661478230125e-05, + "loss": 0.2074127435684204, + "step": 145000 + }, + { + "epoch": 0.6225153052900921, + "eval_loss": 0.40026023983955383, + "eval_runtime": 27.2668, + "eval_samples_per_second": 3.667, + "eval_steps_per_second": 3.667, + "step": 145000 + }, + { + "epoch": 0.6225582373801122, + "grad_norm": 0.06716686487197876, + "learning_rate": 3.79073497581125e-05, + "loss": 0.21744098663330078, + "step": 145010 + }, + { + "epoch": 0.6226011694701321, + "grad_norm": 0.043255776166915894, + "learning_rate": 3.790303803799488e-05, + "loss": 0.13981914520263672, + "step": 145020 + }, + { + "epoch": 0.6226441015601522, + "grad_norm": 0.4962061047554016, + "learning_rate": 3.789872631787726e-05, + "loss": 0.19934669733047486, + "step": 145030 + }, + { + "epoch": 0.6226870336501722, + "grad_norm": 0.00197092373855412, + "learning_rate": 3.7894414597759634e-05, + "loss": 0.22532355785369873, + "step": 145040 + }, + { + "epoch": 0.6227299657401921, + "grad_norm": 0.6518476009368896, + "learning_rate": 3.789010287764201e-05, + "loss": 0.2599307060241699, + "step": 145050 + }, + { + "epoch": 0.6227728978302122, + "grad_norm": 1.8860397338867188, + "learning_rate": 3.788579115752439e-05, + "loss": 0.13637256622314453, + "step": 145060 + }, + { + "epoch": 0.6228158299202322, + "grad_norm": 0.07711993902921677, + "learning_rate": 3.788147943740676e-05, + "loss": 0.047610118985176086, + "step": 145070 + }, + { + "epoch": 0.6228587620102521, + "grad_norm": 1.4419951438903809, + "learning_rate": 3.7877167717289137e-05, + "loss": 0.1328027367591858, + "step": 145080 + }, + { + "epoch": 0.6229016941002722, + "grad_norm": 0.004267912823706865, + "learning_rate": 3.7872855997171514e-05, + "loss": 0.3583818435668945, + "step": 145090 + }, + { + "epoch": 0.6229446261902922, + "grad_norm": 3.1124303340911865, + "learning_rate": 3.786854427705389e-05, + "loss": 0.2011735200881958, + "step": 145100 + }, + { + "epoch": 0.6229875582803122, + "grad_norm": 1.898911952972412, + "learning_rate": 3.786423255693626e-05, + "loss": 0.31902217864990234, + "step": 145110 + }, + { + "epoch": 0.6230304903703322, + "grad_norm": 0.10020571947097778, + "learning_rate": 3.785992083681864e-05, + "loss": 0.06902580857276916, + "step": 145120 + }, + { + "epoch": 0.6230734224603522, + "grad_norm": 2.0357489585876465, + "learning_rate": 3.7855609116701016e-05, + "loss": 0.1981680989265442, + "step": 145130 + }, + { + "epoch": 0.6231163545503722, + "grad_norm": 0.03490392118692398, + "learning_rate": 3.7851297396583394e-05, + "loss": 0.35762135982513427, + "step": 145140 + }, + { + "epoch": 0.6231592866403922, + "grad_norm": 0.012586482800543308, + "learning_rate": 3.784698567646577e-05, + "loss": 0.14604350328445434, + "step": 145150 + }, + { + "epoch": 0.6232022187304123, + "grad_norm": 0.06158836930990219, + "learning_rate": 3.784267395634815e-05, + "loss": 0.10603926181793213, + "step": 145160 + }, + { + "epoch": 0.6232451508204322, + "grad_norm": 3.0967295169830322, + "learning_rate": 3.7838362236230526e-05, + "loss": 0.06373662352561951, + "step": 145170 + }, + { + "epoch": 0.6232880829104522, + "grad_norm": 1.3756184577941895, + "learning_rate": 3.78340505161129e-05, + "loss": 0.2651843547821045, + "step": 145180 + }, + { + "epoch": 0.6233310150004723, + "grad_norm": 2.4878203868865967, + "learning_rate": 3.782973879599528e-05, + "loss": 0.383553147315979, + "step": 145190 + }, + { + "epoch": 0.6233739470904922, + "grad_norm": 0.015998877584934235, + "learning_rate": 3.782542707587765e-05, + "loss": 0.2149120092391968, + "step": 145200 + }, + { + "epoch": 0.6234168791805123, + "grad_norm": 0.640251636505127, + "learning_rate": 3.782111535576003e-05, + "loss": 0.19925031661987305, + "step": 145210 + }, + { + "epoch": 0.6234598112705323, + "grad_norm": 1.6535863876342773, + "learning_rate": 3.7816803635642406e-05, + "loss": 0.2155580997467041, + "step": 145220 + }, + { + "epoch": 0.6235027433605522, + "grad_norm": 1.1984570026397705, + "learning_rate": 3.781249191552478e-05, + "loss": 0.19622409343719482, + "step": 145230 + }, + { + "epoch": 0.6235456754505723, + "grad_norm": 0.0030771929305046797, + "learning_rate": 3.7808180195407154e-05, + "loss": 0.04981268346309662, + "step": 145240 + }, + { + "epoch": 0.6235886075405923, + "grad_norm": 4.39127779006958, + "learning_rate": 3.780386847528953e-05, + "loss": 0.33121452331542967, + "step": 145250 + }, + { + "epoch": 0.6236315396306124, + "grad_norm": 0.059484440833330154, + "learning_rate": 3.779955675517191e-05, + "loss": 0.2900093078613281, + "step": 145260 + }, + { + "epoch": 0.6236744717206323, + "grad_norm": 0.011092513799667358, + "learning_rate": 3.7795245035054286e-05, + "loss": 0.17457928657531738, + "step": 145270 + }, + { + "epoch": 0.6237174038106523, + "grad_norm": 0.025852881371974945, + "learning_rate": 3.779093331493666e-05, + "loss": 0.0900078535079956, + "step": 145280 + }, + { + "epoch": 0.6237603359006724, + "grad_norm": 2.27447247505188, + "learning_rate": 3.778662159481904e-05, + "loss": 0.30752589702606203, + "step": 145290 + }, + { + "epoch": 0.6238032679906923, + "grad_norm": 10.999873161315918, + "learning_rate": 3.778230987470142e-05, + "loss": 0.2985308408737183, + "step": 145300 + }, + { + "epoch": 0.6238462000807123, + "grad_norm": 0.021880190819501877, + "learning_rate": 3.7777998154583795e-05, + "loss": 0.04139164686203003, + "step": 145310 + }, + { + "epoch": 0.6238891321707324, + "grad_norm": 0.012969445437192917, + "learning_rate": 3.7773686434466166e-05, + "loss": 0.34741475582122805, + "step": 145320 + }, + { + "epoch": 0.6239320642607523, + "grad_norm": 0.012905898503959179, + "learning_rate": 3.776937471434854e-05, + "loss": 0.0011121029034256934, + "step": 145330 + }, + { + "epoch": 0.6239749963507724, + "grad_norm": 2.579164981842041, + "learning_rate": 3.776506299423092e-05, + "loss": 0.25848629474639895, + "step": 145340 + }, + { + "epoch": 0.6240179284407924, + "grad_norm": 0.2724818289279938, + "learning_rate": 3.77607512741133e-05, + "loss": 0.20009520053863525, + "step": 145350 + }, + { + "epoch": 0.6240608605308123, + "grad_norm": 2.1852598190307617, + "learning_rate": 3.775643955399567e-05, + "loss": 0.2844280242919922, + "step": 145360 + }, + { + "epoch": 0.6241037926208324, + "grad_norm": 0.014285706914961338, + "learning_rate": 3.7752127833878046e-05, + "loss": 0.2562817335128784, + "step": 145370 + }, + { + "epoch": 0.6241467247108524, + "grad_norm": 0.06769892573356628, + "learning_rate": 3.774781611376042e-05, + "loss": 0.015678460896015167, + "step": 145380 + }, + { + "epoch": 0.6241896568008724, + "grad_norm": 0.02668279968202114, + "learning_rate": 3.77435043936428e-05, + "loss": 0.24262983798980714, + "step": 145390 + }, + { + "epoch": 0.6242325888908924, + "grad_norm": 0.052530985325574875, + "learning_rate": 3.773919267352518e-05, + "loss": 0.18415896892547606, + "step": 145400 + }, + { + "epoch": 0.6242755209809124, + "grad_norm": 1.8255459070205688, + "learning_rate": 3.7734880953407555e-05, + "loss": 0.3908602237701416, + "step": 145410 + }, + { + "epoch": 0.6243184530709324, + "grad_norm": 0.0010746768675744534, + "learning_rate": 3.773056923328993e-05, + "loss": 0.13366581201553346, + "step": 145420 + }, + { + "epoch": 0.6243613851609524, + "grad_norm": 4.105757713317871, + "learning_rate": 3.772625751317231e-05, + "loss": 0.3110255479812622, + "step": 145430 + }, + { + "epoch": 0.6244043172509725, + "grad_norm": 0.027147723361849785, + "learning_rate": 3.772194579305468e-05, + "loss": 0.29846107959747314, + "step": 145440 + }, + { + "epoch": 0.6244472493409924, + "grad_norm": 0.4245328903198242, + "learning_rate": 3.771763407293706e-05, + "loss": 0.10092895030975342, + "step": 145450 + }, + { + "epoch": 0.6244901814310124, + "grad_norm": 1.827056884765625, + "learning_rate": 3.7713322352819435e-05, + "loss": 0.06381365656852722, + "step": 145460 + }, + { + "epoch": 0.6245331135210325, + "grad_norm": 0.18593548238277435, + "learning_rate": 3.770901063270181e-05, + "loss": 0.14583500623703002, + "step": 145470 + }, + { + "epoch": 0.6245760456110524, + "grad_norm": 0.17885200679302216, + "learning_rate": 3.770469891258418e-05, + "loss": 0.11033551692962647, + "step": 145480 + }, + { + "epoch": 0.6246189777010724, + "grad_norm": 0.005315417889505625, + "learning_rate": 3.770038719246656e-05, + "loss": 0.22789452075958253, + "step": 145490 + }, + { + "epoch": 0.6246619097910925, + "grad_norm": 0.004930767696350813, + "learning_rate": 3.769607547234894e-05, + "loss": 0.12441198825836182, + "step": 145500 + }, + { + "epoch": 0.6247048418811124, + "grad_norm": 2.7298367023468018, + "learning_rate": 3.769176375223132e-05, + "loss": 0.2080162525177002, + "step": 145510 + }, + { + "epoch": 0.6247477739711325, + "grad_norm": 0.01054426096379757, + "learning_rate": 3.76874520321137e-05, + "loss": 0.16831830739974976, + "step": 145520 + }, + { + "epoch": 0.6247907060611525, + "grad_norm": 9.6473970413208, + "learning_rate": 3.768314031199607e-05, + "loss": 0.2800298690795898, + "step": 145530 + }, + { + "epoch": 0.6248336381511724, + "grad_norm": 0.004264887422323227, + "learning_rate": 3.767882859187845e-05, + "loss": 0.07018274664878846, + "step": 145540 + }, + { + "epoch": 0.6248765702411925, + "grad_norm": 2.2102596759796143, + "learning_rate": 3.7674516871760824e-05, + "loss": 0.47200469970703124, + "step": 145550 + }, + { + "epoch": 0.6249195023312125, + "grad_norm": 0.015142920427024364, + "learning_rate": 3.76702051516432e-05, + "loss": 0.21918439865112305, + "step": 145560 + }, + { + "epoch": 0.6249624344212324, + "grad_norm": 0.14599426090717316, + "learning_rate": 3.766589343152557e-05, + "loss": 0.2167109251022339, + "step": 145570 + }, + { + "epoch": 0.6250053665112525, + "grad_norm": 0.3270648419857025, + "learning_rate": 3.766158171140795e-05, + "loss": 0.31240594387054443, + "step": 145580 + }, + { + "epoch": 0.6250482986012725, + "grad_norm": 0.04953808709979057, + "learning_rate": 3.765726999129033e-05, + "loss": 0.21056201457977294, + "step": 145590 + }, + { + "epoch": 0.6250912306912925, + "grad_norm": 0.033134374767541885, + "learning_rate": 3.7652958271172704e-05, + "loss": 0.1267376184463501, + "step": 145600 + }, + { + "epoch": 0.6251341627813125, + "grad_norm": 0.04095722362399101, + "learning_rate": 3.7648646551055075e-05, + "loss": 0.053391391038894655, + "step": 145610 + }, + { + "epoch": 0.6251770948713326, + "grad_norm": 0.11082349717617035, + "learning_rate": 3.764433483093746e-05, + "loss": 0.07753645181655884, + "step": 145620 + }, + { + "epoch": 0.6252200269613525, + "grad_norm": 0.0397820807993412, + "learning_rate": 3.7640023110819836e-05, + "loss": 0.0551756739616394, + "step": 145630 + }, + { + "epoch": 0.6252629590513725, + "grad_norm": 0.08418507874011993, + "learning_rate": 3.7635711390702214e-05, + "loss": 0.23780393600463867, + "step": 145640 + }, + { + "epoch": 0.6253058911413926, + "grad_norm": 0.1522897332906723, + "learning_rate": 3.7631399670584584e-05, + "loss": 0.15822170972824096, + "step": 145650 + }, + { + "epoch": 0.6253488232314125, + "grad_norm": 10.029183387756348, + "learning_rate": 3.762708795046696e-05, + "loss": 0.269795823097229, + "step": 145660 + }, + { + "epoch": 0.6253917553214325, + "grad_norm": 0.08777247369289398, + "learning_rate": 3.762277623034934e-05, + "loss": 0.31310975551605225, + "step": 145670 + }, + { + "epoch": 0.6254346874114526, + "grad_norm": 0.007045544218271971, + "learning_rate": 3.7618464510231716e-05, + "loss": 0.15862007141113282, + "step": 145680 + }, + { + "epoch": 0.6254776195014726, + "grad_norm": 0.002723569516092539, + "learning_rate": 3.761415279011409e-05, + "loss": 0.25944039821624754, + "step": 145690 + }, + { + "epoch": 0.6255205515914926, + "grad_norm": 0.0022276772651821375, + "learning_rate": 3.7609841069996464e-05, + "loss": 0.2553360939025879, + "step": 145700 + }, + { + "epoch": 0.6255634836815126, + "grad_norm": 0.009006328880786896, + "learning_rate": 3.760552934987884e-05, + "loss": 0.0817391335964203, + "step": 145710 + }, + { + "epoch": 0.6256064157715326, + "grad_norm": 1.0603159666061401, + "learning_rate": 3.760121762976122e-05, + "loss": 0.27546770572662355, + "step": 145720 + }, + { + "epoch": 0.6256493478615526, + "grad_norm": 0.043728139251470566, + "learning_rate": 3.7596905909643596e-05, + "loss": 0.2361830234527588, + "step": 145730 + }, + { + "epoch": 0.6256922799515726, + "grad_norm": 0.004647698253393173, + "learning_rate": 3.7592594189525973e-05, + "loss": 0.17039880752563477, + "step": 145740 + }, + { + "epoch": 0.6257352120415927, + "grad_norm": 1.4827666282653809, + "learning_rate": 3.758828246940835e-05, + "loss": 0.2931626558303833, + "step": 145750 + }, + { + "epoch": 0.6257781441316126, + "grad_norm": 0.006912170443683863, + "learning_rate": 3.758397074929073e-05, + "loss": 0.17341002225875854, + "step": 145760 + }, + { + "epoch": 0.6258210762216326, + "grad_norm": 0.0031075282022356987, + "learning_rate": 3.75796590291731e-05, + "loss": 0.2037494421005249, + "step": 145770 + }, + { + "epoch": 0.6258640083116527, + "grad_norm": 3.5641911029815674, + "learning_rate": 3.7575347309055476e-05, + "loss": 0.12922029495239257, + "step": 145780 + }, + { + "epoch": 0.6259069404016726, + "grad_norm": 1.2592276334762573, + "learning_rate": 3.757103558893785e-05, + "loss": 0.2730496883392334, + "step": 145790 + }, + { + "epoch": 0.6259498724916926, + "grad_norm": 1.6755534410476685, + "learning_rate": 3.756672386882023e-05, + "loss": 0.2942385196685791, + "step": 145800 + }, + { + "epoch": 0.6259928045817127, + "grad_norm": 0.32389035820961, + "learning_rate": 3.75624121487026e-05, + "loss": 0.18852608203887938, + "step": 145810 + }, + { + "epoch": 0.6260357366717326, + "grad_norm": 0.0004410554829519242, + "learning_rate": 3.755810042858498e-05, + "loss": 0.16696833372116088, + "step": 145820 + }, + { + "epoch": 0.6260786687617527, + "grad_norm": 0.15728677809238434, + "learning_rate": 3.7553788708467356e-05, + "loss": 0.26857168674468995, + "step": 145830 + }, + { + "epoch": 0.6261216008517727, + "grad_norm": 5.143691062927246, + "learning_rate": 3.754947698834973e-05, + "loss": 0.28821985721588134, + "step": 145840 + }, + { + "epoch": 0.6261645329417926, + "grad_norm": 1.2966766357421875, + "learning_rate": 3.754516526823211e-05, + "loss": 0.13320962190628052, + "step": 145850 + }, + { + "epoch": 0.6262074650318127, + "grad_norm": 0.861115038394928, + "learning_rate": 3.754085354811449e-05, + "loss": 0.17039065361022948, + "step": 145860 + }, + { + "epoch": 0.6262503971218327, + "grad_norm": 0.37558168172836304, + "learning_rate": 3.7536541827996865e-05, + "loss": 0.21755945682525635, + "step": 145870 + }, + { + "epoch": 0.6262933292118527, + "grad_norm": 0.0026671243831515312, + "learning_rate": 3.753223010787924e-05, + "loss": 0.18931851387023926, + "step": 145880 + }, + { + "epoch": 0.6263362613018727, + "grad_norm": 5.478182315826416, + "learning_rate": 3.752791838776162e-05, + "loss": 0.19699089527130126, + "step": 145890 + }, + { + "epoch": 0.6263791933918927, + "grad_norm": 0.21105577051639557, + "learning_rate": 3.752360666764399e-05, + "loss": 0.10836315155029297, + "step": 145900 + }, + { + "epoch": 0.6264221254819127, + "grad_norm": 0.004369442816823721, + "learning_rate": 3.751929494752637e-05, + "loss": 0.265771222114563, + "step": 145910 + }, + { + "epoch": 0.6264650575719327, + "grad_norm": 0.028172479942440987, + "learning_rate": 3.7514983227408745e-05, + "loss": 0.2676279067993164, + "step": 145920 + }, + { + "epoch": 0.6265079896619528, + "grad_norm": 2.269669771194458, + "learning_rate": 3.751067150729112e-05, + "loss": 0.2712892532348633, + "step": 145930 + }, + { + "epoch": 0.6265509217519727, + "grad_norm": 0.005145613569766283, + "learning_rate": 3.750635978717349e-05, + "loss": 0.11162854433059692, + "step": 145940 + }, + { + "epoch": 0.6265938538419927, + "grad_norm": 3.3848986625671387, + "learning_rate": 3.750204806705587e-05, + "loss": 0.04469236433506012, + "step": 145950 + }, + { + "epoch": 0.6266367859320128, + "grad_norm": 0.03765769302845001, + "learning_rate": 3.749773634693825e-05, + "loss": 0.12097903490066528, + "step": 145960 + }, + { + "epoch": 0.6266797180220327, + "grad_norm": 1.5613038539886475, + "learning_rate": 3.7493424626820625e-05, + "loss": 0.29975531101226804, + "step": 145970 + }, + { + "epoch": 0.6267226501120527, + "grad_norm": 0.8295878767967224, + "learning_rate": 3.7489112906703e-05, + "loss": 0.17278884649276732, + "step": 145980 + }, + { + "epoch": 0.6267655822020728, + "grad_norm": 0.02731485851109028, + "learning_rate": 3.748480118658538e-05, + "loss": 0.19173781871795653, + "step": 145990 + }, + { + "epoch": 0.6268085142920927, + "grad_norm": 0.2628108561038971, + "learning_rate": 3.748048946646776e-05, + "loss": 0.220833158493042, + "step": 146000 + }, + { + "epoch": 0.6268085142920927, + "eval_loss": 0.4031921625137329, + "eval_runtime": 27.2015, + "eval_samples_per_second": 3.676, + "eval_steps_per_second": 3.676, + "step": 146000 + }, + { + "epoch": 0.6268514463821128, + "grad_norm": 1.9009525775909424, + "learning_rate": 3.7476177746350135e-05, + "loss": 0.181487512588501, + "step": 146010 + }, + { + "epoch": 0.6268943784721328, + "grad_norm": 0.006618720479309559, + "learning_rate": 3.7471866026232505e-05, + "loss": 0.28319597244262695, + "step": 146020 + }, + { + "epoch": 0.6269373105621527, + "grad_norm": 3.7840189933776855, + "learning_rate": 3.746755430611488e-05, + "loss": 0.47288169860839846, + "step": 146030 + }, + { + "epoch": 0.6269802426521728, + "grad_norm": 1.4764827489852905, + "learning_rate": 3.746324258599726e-05, + "loss": 0.16911323070526124, + "step": 146040 + }, + { + "epoch": 0.6270231747421928, + "grad_norm": 0.2975632846355438, + "learning_rate": 3.745893086587964e-05, + "loss": 0.19954975843429565, + "step": 146050 + }, + { + "epoch": 0.6270661068322128, + "grad_norm": 0.3340650796890259, + "learning_rate": 3.745461914576201e-05, + "loss": 0.22841260433197022, + "step": 146060 + }, + { + "epoch": 0.6271090389222328, + "grad_norm": 0.06769564002752304, + "learning_rate": 3.7450307425644385e-05, + "loss": 0.03349553942680359, + "step": 146070 + }, + { + "epoch": 0.6271519710122528, + "grad_norm": 0.0012691410956904292, + "learning_rate": 3.744599570552676e-05, + "loss": 0.12927672863006592, + "step": 146080 + }, + { + "epoch": 0.6271949031022728, + "grad_norm": 3.187361717224121, + "learning_rate": 3.744168398540914e-05, + "loss": 0.27967898845672606, + "step": 146090 + }, + { + "epoch": 0.6272378351922928, + "grad_norm": 0.11216457933187485, + "learning_rate": 3.743737226529152e-05, + "loss": 0.27449731826782225, + "step": 146100 + }, + { + "epoch": 0.6272807672823129, + "grad_norm": 0.11325439810752869, + "learning_rate": 3.7433060545173894e-05, + "loss": 0.18230938911437988, + "step": 146110 + }, + { + "epoch": 0.6273236993723329, + "grad_norm": 0.8538678884506226, + "learning_rate": 3.742874882505627e-05, + "loss": 0.18734936714172362, + "step": 146120 + }, + { + "epoch": 0.6273666314623528, + "grad_norm": 0.010642403736710548, + "learning_rate": 3.742443710493865e-05, + "loss": 0.10937390327453614, + "step": 146130 + }, + { + "epoch": 0.6274095635523729, + "grad_norm": 9.793993949890137, + "learning_rate": 3.742012538482102e-05, + "loss": 0.3626396894454956, + "step": 146140 + }, + { + "epoch": 0.6274524956423929, + "grad_norm": 1.1978455781936646, + "learning_rate": 3.74158136647034e-05, + "loss": 0.07268427610397339, + "step": 146150 + }, + { + "epoch": 0.6274954277324128, + "grad_norm": 2.1169817447662354, + "learning_rate": 3.7411501944585774e-05, + "loss": 0.12902868986129762, + "step": 146160 + }, + { + "epoch": 0.6275383598224329, + "grad_norm": 6.555164813995361, + "learning_rate": 3.740719022446815e-05, + "loss": 0.45094785690307615, + "step": 146170 + }, + { + "epoch": 0.6275812919124529, + "grad_norm": 0.001882154494524002, + "learning_rate": 3.740287850435052e-05, + "loss": 0.23653554916381836, + "step": 146180 + }, + { + "epoch": 0.6276242240024729, + "grad_norm": 0.026824140921235085, + "learning_rate": 3.73985667842329e-05, + "loss": 0.24134407043457032, + "step": 146190 + }, + { + "epoch": 0.6276671560924929, + "grad_norm": 0.005355069879442453, + "learning_rate": 3.739425506411528e-05, + "loss": 0.28949851989746095, + "step": 146200 + }, + { + "epoch": 0.627710088182513, + "grad_norm": 0.018241219222545624, + "learning_rate": 3.7389943343997654e-05, + "loss": 0.29019651412963865, + "step": 146210 + }, + { + "epoch": 0.6277530202725329, + "grad_norm": 0.021457862108945847, + "learning_rate": 3.738563162388004e-05, + "loss": 0.2145296573638916, + "step": 146220 + }, + { + "epoch": 0.6277959523625529, + "grad_norm": 0.023087697103619576, + "learning_rate": 3.738131990376241e-05, + "loss": 0.006541821360588074, + "step": 146230 + }, + { + "epoch": 0.627838884452573, + "grad_norm": 0.07880302518606186, + "learning_rate": 3.7377008183644786e-05, + "loss": 0.16138269901275634, + "step": 146240 + }, + { + "epoch": 0.6278818165425929, + "grad_norm": 1.7311794757843018, + "learning_rate": 3.7372696463527164e-05, + "loss": 0.3214499235153198, + "step": 146250 + }, + { + "epoch": 0.6279247486326129, + "grad_norm": 0.009020458906888962, + "learning_rate": 3.736838474340954e-05, + "loss": 0.11787396669387817, + "step": 146260 + }, + { + "epoch": 0.627967680722633, + "grad_norm": 1.642788052558899, + "learning_rate": 3.736407302329191e-05, + "loss": 0.2313527822494507, + "step": 146270 + }, + { + "epoch": 0.6280106128126529, + "grad_norm": 0.03921899199485779, + "learning_rate": 3.735976130317429e-05, + "loss": 0.16721705198287964, + "step": 146280 + }, + { + "epoch": 0.628053544902673, + "grad_norm": 2.4101881980895996, + "learning_rate": 3.7355449583056666e-05, + "loss": 0.2695363998413086, + "step": 146290 + }, + { + "epoch": 0.628096476992693, + "grad_norm": 2.3433501720428467, + "learning_rate": 3.7351137862939044e-05, + "loss": 0.28004429340362547, + "step": 146300 + }, + { + "epoch": 0.6281394090827129, + "grad_norm": 0.7007365822792053, + "learning_rate": 3.7346826142821414e-05, + "loss": 0.1107077956199646, + "step": 146310 + }, + { + "epoch": 0.628182341172733, + "grad_norm": 0.019995957612991333, + "learning_rate": 3.734251442270379e-05, + "loss": 0.005020419508218766, + "step": 146320 + }, + { + "epoch": 0.628225273262753, + "grad_norm": 0.0074407197535037994, + "learning_rate": 3.7338202702586176e-05, + "loss": 0.2880034208297729, + "step": 146330 + }, + { + "epoch": 0.628268205352773, + "grad_norm": 40.73788833618164, + "learning_rate": 3.733389098246855e-05, + "loss": 0.4160567283630371, + "step": 146340 + }, + { + "epoch": 0.628311137442793, + "grad_norm": 0.009502614848315716, + "learning_rate": 3.7329579262350924e-05, + "loss": 0.14252859354019165, + "step": 146350 + }, + { + "epoch": 0.628354069532813, + "grad_norm": 2.465653896331787, + "learning_rate": 3.73252675422333e-05, + "loss": 0.37836034297943116, + "step": 146360 + }, + { + "epoch": 0.628397001622833, + "grad_norm": 0.012200412340462208, + "learning_rate": 3.732095582211568e-05, + "loss": 0.08112860321998597, + "step": 146370 + }, + { + "epoch": 0.628439933712853, + "grad_norm": 0.0012890741927549243, + "learning_rate": 3.7316644101998056e-05, + "loss": 0.13902195692062377, + "step": 146380 + }, + { + "epoch": 0.628482865802873, + "grad_norm": 2.2479848861694336, + "learning_rate": 3.7312332381880426e-05, + "loss": 0.1304740071296692, + "step": 146390 + }, + { + "epoch": 0.628525797892893, + "grad_norm": 0.45096537470817566, + "learning_rate": 3.7308020661762804e-05, + "loss": 0.14930739402770996, + "step": 146400 + }, + { + "epoch": 0.628568729982913, + "grad_norm": 0.0019203760894015431, + "learning_rate": 3.730370894164518e-05, + "loss": 0.1763664960861206, + "step": 146410 + }, + { + "epoch": 0.6286116620729331, + "grad_norm": 1.2747520208358765, + "learning_rate": 3.729939722152756e-05, + "loss": 0.1266668438911438, + "step": 146420 + }, + { + "epoch": 0.628654594162953, + "grad_norm": 4.500255107879639, + "learning_rate": 3.729508550140993e-05, + "loss": 0.21049704551696777, + "step": 146430 + }, + { + "epoch": 0.628697526252973, + "grad_norm": 0.0036547803319990635, + "learning_rate": 3.729077378129231e-05, + "loss": 0.07864784598350524, + "step": 146440 + }, + { + "epoch": 0.6287404583429931, + "grad_norm": 0.011433214880526066, + "learning_rate": 3.728646206117469e-05, + "loss": 0.3140164136886597, + "step": 146450 + }, + { + "epoch": 0.628783390433013, + "grad_norm": 0.3247806429862976, + "learning_rate": 3.728215034105707e-05, + "loss": 0.10256623029708863, + "step": 146460 + }, + { + "epoch": 0.628826322523033, + "grad_norm": 0.0035158980172127485, + "learning_rate": 3.727783862093944e-05, + "loss": 0.11457140445709228, + "step": 146470 + }, + { + "epoch": 0.6288692546130531, + "grad_norm": 0.008701084181666374, + "learning_rate": 3.7273526900821815e-05, + "loss": 0.1851799964904785, + "step": 146480 + }, + { + "epoch": 0.628912186703073, + "grad_norm": 0.009318762458860874, + "learning_rate": 3.726921518070419e-05, + "loss": 0.19677205085754396, + "step": 146490 + }, + { + "epoch": 0.6289551187930931, + "grad_norm": 3.374988317489624, + "learning_rate": 3.726490346058657e-05, + "loss": 0.21994738578796386, + "step": 146500 + }, + { + "epoch": 0.6289980508831131, + "grad_norm": 0.0007356581627391279, + "learning_rate": 3.726059174046894e-05, + "loss": 0.07651254534721375, + "step": 146510 + }, + { + "epoch": 0.629040982973133, + "grad_norm": 1.4155685901641846, + "learning_rate": 3.725628002035132e-05, + "loss": 0.15071755647659302, + "step": 146520 + }, + { + "epoch": 0.6290839150631531, + "grad_norm": 10.228381156921387, + "learning_rate": 3.7251968300233695e-05, + "loss": 0.23199965953826904, + "step": 146530 + }, + { + "epoch": 0.6291268471531731, + "grad_norm": 0.028175201267004013, + "learning_rate": 3.724765658011607e-05, + "loss": 0.19142227172851561, + "step": 146540 + }, + { + "epoch": 0.6291697792431932, + "grad_norm": 0.024978064000606537, + "learning_rate": 3.724334485999845e-05, + "loss": 0.25046958923339846, + "step": 146550 + }, + { + "epoch": 0.6292127113332131, + "grad_norm": 0.09602590650320053, + "learning_rate": 3.723903313988083e-05, + "loss": 0.2911947250366211, + "step": 146560 + }, + { + "epoch": 0.6292556434232331, + "grad_norm": 0.024032101035118103, + "learning_rate": 3.7234721419763205e-05, + "loss": 0.3351039171218872, + "step": 146570 + }, + { + "epoch": 0.6292985755132532, + "grad_norm": 0.0016572453314438462, + "learning_rate": 3.723040969964558e-05, + "loss": 0.1245275855064392, + "step": 146580 + }, + { + "epoch": 0.6293415076032731, + "grad_norm": 0.12577351927757263, + "learning_rate": 3.722609797952796e-05, + "loss": 0.3349307537078857, + "step": 146590 + }, + { + "epoch": 0.6293844396932932, + "grad_norm": 0.10792262107133865, + "learning_rate": 3.722178625941033e-05, + "loss": 0.2238081932067871, + "step": 146600 + }, + { + "epoch": 0.6294273717833132, + "grad_norm": 0.001989643555134535, + "learning_rate": 3.721747453929271e-05, + "loss": 0.11774779558181762, + "step": 146610 + }, + { + "epoch": 0.6294703038733331, + "grad_norm": 1.7697548866271973, + "learning_rate": 3.7213162819175085e-05, + "loss": 0.18405303955078126, + "step": 146620 + }, + { + "epoch": 0.6295132359633532, + "grad_norm": 0.005540918558835983, + "learning_rate": 3.720885109905746e-05, + "loss": 0.14159350395202636, + "step": 146630 + }, + { + "epoch": 0.6295561680533732, + "grad_norm": 1.2075954675674438, + "learning_rate": 3.720453937893983e-05, + "loss": 0.38316285610198975, + "step": 146640 + }, + { + "epoch": 0.6295991001433932, + "grad_norm": 1.5206912755966187, + "learning_rate": 3.720022765882221e-05, + "loss": 0.30779242515563965, + "step": 146650 + }, + { + "epoch": 0.6296420322334132, + "grad_norm": 0.0024733147583901882, + "learning_rate": 3.719591593870459e-05, + "loss": 0.2867335081100464, + "step": 146660 + }, + { + "epoch": 0.6296849643234332, + "grad_norm": 0.029516298323869705, + "learning_rate": 3.7191604218586965e-05, + "loss": 0.15923948287963868, + "step": 146670 + }, + { + "epoch": 0.6297278964134532, + "grad_norm": 3.6430470943450928, + "learning_rate": 3.718729249846934e-05, + "loss": 0.2126899242401123, + "step": 146680 + }, + { + "epoch": 0.6297708285034732, + "grad_norm": 0.032296113669872284, + "learning_rate": 3.718298077835172e-05, + "loss": 0.2337803363800049, + "step": 146690 + }, + { + "epoch": 0.6298137605934933, + "grad_norm": 0.13325929641723633, + "learning_rate": 3.71786690582341e-05, + "loss": 0.1753853440284729, + "step": 146700 + }, + { + "epoch": 0.6298566926835132, + "grad_norm": 1.2130388021469116, + "learning_rate": 3.7174357338116474e-05, + "loss": 0.14811716079711915, + "step": 146710 + }, + { + "epoch": 0.6298996247735332, + "grad_norm": 0.9190666675567627, + "learning_rate": 3.7170045617998845e-05, + "loss": 0.22362709045410156, + "step": 146720 + }, + { + "epoch": 0.6299425568635533, + "grad_norm": 0.0004198495007585734, + "learning_rate": 3.716573389788122e-05, + "loss": 0.24469988346099852, + "step": 146730 + }, + { + "epoch": 0.6299854889535732, + "grad_norm": 0.004793898668140173, + "learning_rate": 3.71614221777636e-05, + "loss": 0.3914718389511108, + "step": 146740 + }, + { + "epoch": 0.6300284210435932, + "grad_norm": 0.07496125996112823, + "learning_rate": 3.715711045764598e-05, + "loss": 0.1663152575492859, + "step": 146750 + }, + { + "epoch": 0.6300713531336133, + "grad_norm": 0.0066208732314407825, + "learning_rate": 3.715279873752835e-05, + "loss": 0.09344573020935058, + "step": 146760 + }, + { + "epoch": 0.6301142852236332, + "grad_norm": 1.0700713396072388, + "learning_rate": 3.7148487017410725e-05, + "loss": 0.23460733890533447, + "step": 146770 + }, + { + "epoch": 0.6301572173136533, + "grad_norm": 1.1751123666763306, + "learning_rate": 3.71441752972931e-05, + "loss": 0.14351837635040282, + "step": 146780 + }, + { + "epoch": 0.6302001494036733, + "grad_norm": 1.113236665725708, + "learning_rate": 3.713986357717548e-05, + "loss": 0.2353208065032959, + "step": 146790 + }, + { + "epoch": 0.6302430814936932, + "grad_norm": 0.01679939031600952, + "learning_rate": 3.7135551857057857e-05, + "loss": 0.19200507402420045, + "step": 146800 + }, + { + "epoch": 0.6302860135837133, + "grad_norm": 0.0016825655475258827, + "learning_rate": 3.7131240136940234e-05, + "loss": 0.26706829071044924, + "step": 146810 + }, + { + "epoch": 0.6303289456737333, + "grad_norm": 0.033295415341854095, + "learning_rate": 3.712692841682261e-05, + "loss": 0.3771516799926758, + "step": 146820 + }, + { + "epoch": 0.6303718777637533, + "grad_norm": 0.2702064514160156, + "learning_rate": 3.712261669670499e-05, + "loss": 0.4259639263153076, + "step": 146830 + }, + { + "epoch": 0.6304148098537733, + "grad_norm": 0.035043902695178986, + "learning_rate": 3.711830497658736e-05, + "loss": 0.047640106081962584, + "step": 146840 + }, + { + "epoch": 0.6304577419437933, + "grad_norm": 1.3876519203186035, + "learning_rate": 3.7113993256469737e-05, + "loss": 0.41136393547058103, + "step": 146850 + }, + { + "epoch": 0.6305006740338133, + "grad_norm": 1.9727435111999512, + "learning_rate": 3.7109681536352114e-05, + "loss": 0.08784054517745972, + "step": 146860 + }, + { + "epoch": 0.6305436061238333, + "grad_norm": 0.4474978744983673, + "learning_rate": 3.710536981623449e-05, + "loss": 0.09354345798492432, + "step": 146870 + }, + { + "epoch": 0.6305865382138534, + "grad_norm": 0.26996567845344543, + "learning_rate": 3.710105809611686e-05, + "loss": 0.21056702136993408, + "step": 146880 + }, + { + "epoch": 0.6306294703038733, + "grad_norm": 0.02897840365767479, + "learning_rate": 3.709674637599924e-05, + "loss": 0.274979829788208, + "step": 146890 + }, + { + "epoch": 0.6306724023938933, + "grad_norm": 1.4435681104660034, + "learning_rate": 3.7092434655881616e-05, + "loss": 0.22395720481872558, + "step": 146900 + }, + { + "epoch": 0.6307153344839134, + "grad_norm": 0.37976908683776855, + "learning_rate": 3.7088122935763994e-05, + "loss": 0.12538430690765381, + "step": 146910 + }, + { + "epoch": 0.6307582665739333, + "grad_norm": 1.2855740785598755, + "learning_rate": 3.708381121564637e-05, + "loss": 0.3442651033401489, + "step": 146920 + }, + { + "epoch": 0.6308011986639533, + "grad_norm": 9.116958618164062, + "learning_rate": 3.707949949552875e-05, + "loss": 0.28530521392822267, + "step": 146930 + }, + { + "epoch": 0.6308441307539734, + "grad_norm": 0.021495725959539413, + "learning_rate": 3.7075187775411126e-05, + "loss": 0.3916192531585693, + "step": 146940 + }, + { + "epoch": 0.6308870628439933, + "grad_norm": 0.09106861799955368, + "learning_rate": 3.70708760552935e-05, + "loss": 0.14217714071273804, + "step": 146950 + }, + { + "epoch": 0.6309299949340134, + "grad_norm": 4.241379737854004, + "learning_rate": 3.706656433517588e-05, + "loss": 0.4013192653656006, + "step": 146960 + }, + { + "epoch": 0.6309729270240334, + "grad_norm": 0.1723231077194214, + "learning_rate": 3.706225261505825e-05, + "loss": 0.08882884383201599, + "step": 146970 + }, + { + "epoch": 0.6310158591140534, + "grad_norm": 1.0407248735427856, + "learning_rate": 3.705794089494063e-05, + "loss": 0.27390859127044676, + "step": 146980 + }, + { + "epoch": 0.6310587912040734, + "grad_norm": 0.13997861742973328, + "learning_rate": 3.7053629174823006e-05, + "loss": 0.2324601173400879, + "step": 146990 + }, + { + "epoch": 0.6311017232940934, + "grad_norm": 0.013284893706440926, + "learning_rate": 3.704931745470538e-05, + "loss": 0.3834493160247803, + "step": 147000 + }, + { + "epoch": 0.6311017232940934, + "eval_loss": 0.3863438367843628, + "eval_runtime": 27.1407, + "eval_samples_per_second": 3.684, + "eval_steps_per_second": 3.684, + "step": 147000 + }, + { + "epoch": 0.6311446553841135, + "grad_norm": 2.8731067180633545, + "learning_rate": 3.7045005734587754e-05, + "loss": 0.1249500036239624, + "step": 147010 + }, + { + "epoch": 0.6311875874741334, + "grad_norm": 1.8072718381881714, + "learning_rate": 3.704069401447013e-05, + "loss": 0.4470974922180176, + "step": 147020 + }, + { + "epoch": 0.6312305195641534, + "grad_norm": 0.0028647701255977154, + "learning_rate": 3.7036382294352515e-05, + "loss": 0.295593786239624, + "step": 147030 + }, + { + "epoch": 0.6312734516541735, + "grad_norm": 0.04063660278916359, + "learning_rate": 3.703207057423489e-05, + "loss": 0.12825417518615723, + "step": 147040 + }, + { + "epoch": 0.6313163837441934, + "grad_norm": 0.043811146169900894, + "learning_rate": 3.702775885411726e-05, + "loss": 0.2365511178970337, + "step": 147050 + }, + { + "epoch": 0.6313593158342135, + "grad_norm": 0.025705622509121895, + "learning_rate": 3.702344713399964e-05, + "loss": 0.26898367404937745, + "step": 147060 + }, + { + "epoch": 0.6314022479242335, + "grad_norm": 0.04759734496474266, + "learning_rate": 3.701913541388202e-05, + "loss": 0.00582539290189743, + "step": 147070 + }, + { + "epoch": 0.6314451800142534, + "grad_norm": 0.21706196665763855, + "learning_rate": 3.7014823693764395e-05, + "loss": 0.49407367706298827, + "step": 147080 + }, + { + "epoch": 0.6314881121042735, + "grad_norm": 1.1993558406829834, + "learning_rate": 3.7010511973646766e-05, + "loss": 0.21412403583526612, + "step": 147090 + }, + { + "epoch": 0.6315310441942935, + "grad_norm": 0.010109450668096542, + "learning_rate": 3.700620025352914e-05, + "loss": 0.008372948318719865, + "step": 147100 + }, + { + "epoch": 0.6315739762843134, + "grad_norm": 0.32362470030784607, + "learning_rate": 3.700188853341152e-05, + "loss": 0.20660581588745117, + "step": 147110 + }, + { + "epoch": 0.6316169083743335, + "grad_norm": 0.0003117761225439608, + "learning_rate": 3.69975768132939e-05, + "loss": 0.3196255683898926, + "step": 147120 + }, + { + "epoch": 0.6316598404643535, + "grad_norm": 0.22397857904434204, + "learning_rate": 3.699326509317627e-05, + "loss": 0.0932235836982727, + "step": 147130 + }, + { + "epoch": 0.6317027725543735, + "grad_norm": 0.002880153711885214, + "learning_rate": 3.698895337305865e-05, + "loss": 0.24725043773651123, + "step": 147140 + }, + { + "epoch": 0.6317457046443935, + "grad_norm": 0.17092221975326538, + "learning_rate": 3.698464165294103e-05, + "loss": 0.09742676615715026, + "step": 147150 + }, + { + "epoch": 0.6317886367344135, + "grad_norm": 2.2574303150177, + "learning_rate": 3.698032993282341e-05, + "loss": 0.16607791185379028, + "step": 147160 + }, + { + "epoch": 0.6318315688244335, + "grad_norm": 2.8181631565093994, + "learning_rate": 3.697601821270578e-05, + "loss": 0.1326764464378357, + "step": 147170 + }, + { + "epoch": 0.6318745009144535, + "grad_norm": 0.0026326251681894064, + "learning_rate": 3.6971706492588155e-05, + "loss": 0.19749863147735597, + "step": 147180 + }, + { + "epoch": 0.6319174330044736, + "grad_norm": 0.0013430170947685838, + "learning_rate": 3.696739477247053e-05, + "loss": 0.2386791467666626, + "step": 147190 + }, + { + "epoch": 0.6319603650944935, + "grad_norm": 5.6976776123046875, + "learning_rate": 3.696308305235291e-05, + "loss": 0.31951711177825926, + "step": 147200 + }, + { + "epoch": 0.6320032971845135, + "grad_norm": 0.2425728440284729, + "learning_rate": 3.695877133223528e-05, + "loss": 0.4066159248352051, + "step": 147210 + }, + { + "epoch": 0.6320462292745336, + "grad_norm": 0.12010367214679718, + "learning_rate": 3.695445961211766e-05, + "loss": 0.16877690553665162, + "step": 147220 + }, + { + "epoch": 0.6320891613645535, + "grad_norm": 0.13629449903964996, + "learning_rate": 3.6950147892000035e-05, + "loss": 0.07434083819389344, + "step": 147230 + }, + { + "epoch": 0.6321320934545736, + "grad_norm": 0.00021290559379849583, + "learning_rate": 3.694583617188241e-05, + "loss": 0.2000523567199707, + "step": 147240 + }, + { + "epoch": 0.6321750255445936, + "grad_norm": 0.24666225910186768, + "learning_rate": 3.694152445176479e-05, + "loss": 0.27231531143188475, + "step": 147250 + }, + { + "epoch": 0.6322179576346135, + "grad_norm": 0.004511278122663498, + "learning_rate": 3.693721273164717e-05, + "loss": 0.08121266365051269, + "step": 147260 + }, + { + "epoch": 0.6322608897246336, + "grad_norm": 0.00025838721194304526, + "learning_rate": 3.6932901011529544e-05, + "loss": 0.03792424499988556, + "step": 147270 + }, + { + "epoch": 0.6323038218146536, + "grad_norm": 0.02920406311750412, + "learning_rate": 3.692858929141192e-05, + "loss": 0.22607312202453614, + "step": 147280 + }, + { + "epoch": 0.6323467539046735, + "grad_norm": 0.0067008258774876595, + "learning_rate": 3.692427757129429e-05, + "loss": 0.26392254829406736, + "step": 147290 + }, + { + "epoch": 0.6323896859946936, + "grad_norm": 0.1206984594464302, + "learning_rate": 3.691996585117667e-05, + "loss": 0.17658530473709105, + "step": 147300 + }, + { + "epoch": 0.6324326180847136, + "grad_norm": 0.06308251619338989, + "learning_rate": 3.691565413105905e-05, + "loss": 0.19717254638671874, + "step": 147310 + }, + { + "epoch": 0.6324755501747336, + "grad_norm": 0.0017627764027565718, + "learning_rate": 3.6911342410941424e-05, + "loss": 0.024138632416725158, + "step": 147320 + }, + { + "epoch": 0.6325184822647536, + "grad_norm": 1.507886528968811, + "learning_rate": 3.69070306908238e-05, + "loss": 0.3853263854980469, + "step": 147330 + }, + { + "epoch": 0.6325614143547736, + "grad_norm": 2.78678297996521, + "learning_rate": 3.690271897070617e-05, + "loss": 0.16321289539337158, + "step": 147340 + }, + { + "epoch": 0.6326043464447936, + "grad_norm": 1.2329705953598022, + "learning_rate": 3.689840725058855e-05, + "loss": 0.27058162689208987, + "step": 147350 + }, + { + "epoch": 0.6326472785348136, + "grad_norm": 2.1577019691467285, + "learning_rate": 3.689409553047093e-05, + "loss": 0.2109225273132324, + "step": 147360 + }, + { + "epoch": 0.6326902106248337, + "grad_norm": 7.57331436034292e-05, + "learning_rate": 3.6889783810353304e-05, + "loss": 0.2609088659286499, + "step": 147370 + }, + { + "epoch": 0.6327331427148536, + "grad_norm": 0.009449384175240993, + "learning_rate": 3.688547209023568e-05, + "loss": 0.3353603363037109, + "step": 147380 + }, + { + "epoch": 0.6327760748048736, + "grad_norm": 3.3012754917144775, + "learning_rate": 3.688116037011806e-05, + "loss": 0.11990993022918701, + "step": 147390 + }, + { + "epoch": 0.6328190068948937, + "grad_norm": 2.1117093563079834, + "learning_rate": 3.6876848650000436e-05, + "loss": 0.4455077648162842, + "step": 147400 + }, + { + "epoch": 0.6328619389849137, + "grad_norm": 0.0228413213044405, + "learning_rate": 3.6872536929882814e-05, + "loss": 0.10761359930038453, + "step": 147410 + }, + { + "epoch": 0.6329048710749337, + "grad_norm": 9.0891695022583, + "learning_rate": 3.6868225209765184e-05, + "loss": 0.11293789148330688, + "step": 147420 + }, + { + "epoch": 0.6329478031649537, + "grad_norm": 0.0013412254629656672, + "learning_rate": 3.686391348964756e-05, + "loss": 0.11689684391021729, + "step": 147430 + }, + { + "epoch": 0.6329907352549737, + "grad_norm": 0.005989507306367159, + "learning_rate": 3.685960176952994e-05, + "loss": 0.17218418121337892, + "step": 147440 + }, + { + "epoch": 0.6330336673449937, + "grad_norm": 0.004671361763030291, + "learning_rate": 3.6855290049412316e-05, + "loss": 0.03742716014385224, + "step": 147450 + }, + { + "epoch": 0.6330765994350137, + "grad_norm": 0.9053412079811096, + "learning_rate": 3.685097832929469e-05, + "loss": 0.07469549179077148, + "step": 147460 + }, + { + "epoch": 0.6331195315250338, + "grad_norm": 8.508781320415437e-05, + "learning_rate": 3.6846666609177064e-05, + "loss": 0.2632524728775024, + "step": 147470 + }, + { + "epoch": 0.6331624636150537, + "grad_norm": 4.723887920379639, + "learning_rate": 3.684235488905944e-05, + "loss": 0.13231680393218995, + "step": 147480 + }, + { + "epoch": 0.6332053957050737, + "grad_norm": 1.05924654006958, + "learning_rate": 3.683804316894182e-05, + "loss": 0.2699723720550537, + "step": 147490 + }, + { + "epoch": 0.6332483277950938, + "grad_norm": 0.27306386828422546, + "learning_rate": 3.6833731448824196e-05, + "loss": 0.0017774634063243866, + "step": 147500 + }, + { + "epoch": 0.6332912598851137, + "grad_norm": 0.0028440305031836033, + "learning_rate": 3.6829419728706573e-05, + "loss": 0.15099271535873413, + "step": 147510 + }, + { + "epoch": 0.6333341919751337, + "grad_norm": 0.2072194367647171, + "learning_rate": 3.682510800858895e-05, + "loss": 0.15268101692199706, + "step": 147520 + }, + { + "epoch": 0.6333771240651538, + "grad_norm": 0.0005316045135259628, + "learning_rate": 3.682079628847133e-05, + "loss": 0.1784825563430786, + "step": 147530 + }, + { + "epoch": 0.6334200561551737, + "grad_norm": 0.08889344334602356, + "learning_rate": 3.68164845683537e-05, + "loss": 0.07060465812683106, + "step": 147540 + }, + { + "epoch": 0.6334629882451938, + "grad_norm": 0.020317930728197098, + "learning_rate": 3.6812172848236076e-05, + "loss": 0.08077744245529175, + "step": 147550 + }, + { + "epoch": 0.6335059203352138, + "grad_norm": 0.028461042791604996, + "learning_rate": 3.680786112811845e-05, + "loss": 0.375799560546875, + "step": 147560 + }, + { + "epoch": 0.6335488524252337, + "grad_norm": 0.0021006674505770206, + "learning_rate": 3.680354940800083e-05, + "loss": 0.23112802505493163, + "step": 147570 + }, + { + "epoch": 0.6335917845152538, + "grad_norm": 0.4146704375743866, + "learning_rate": 3.67992376878832e-05, + "loss": 0.22537550926208497, + "step": 147580 + }, + { + "epoch": 0.6336347166052738, + "grad_norm": 0.0008316123858094215, + "learning_rate": 3.679492596776558e-05, + "loss": 0.16894692182540894, + "step": 147590 + }, + { + "epoch": 0.6336776486952937, + "grad_norm": 0.26273226737976074, + "learning_rate": 3.6790614247647956e-05, + "loss": 0.21262357234954835, + "step": 147600 + }, + { + "epoch": 0.6337205807853138, + "grad_norm": 2.1046674251556396, + "learning_rate": 3.678630252753033e-05, + "loss": 0.17924087047576903, + "step": 147610 + }, + { + "epoch": 0.6337635128753338, + "grad_norm": 5.799474716186523, + "learning_rate": 3.678199080741271e-05, + "loss": 0.08266505599021912, + "step": 147620 + }, + { + "epoch": 0.6338064449653538, + "grad_norm": 1.8729013204574585, + "learning_rate": 3.677767908729509e-05, + "loss": 0.11252769231796264, + "step": 147630 + }, + { + "epoch": 0.6338493770553738, + "grad_norm": 6.594720840454102, + "learning_rate": 3.6773367367177465e-05, + "loss": 0.4548068046569824, + "step": 147640 + }, + { + "epoch": 0.6338923091453939, + "grad_norm": 0.0037639886140823364, + "learning_rate": 3.676905564705984e-05, + "loss": 0.22702391147613527, + "step": 147650 + }, + { + "epoch": 0.6339352412354138, + "grad_norm": 0.00032389559783041477, + "learning_rate": 3.676474392694221e-05, + "loss": 0.11630215644836425, + "step": 147660 + }, + { + "epoch": 0.6339781733254338, + "grad_norm": 0.0001477748592151329, + "learning_rate": 3.676043220682459e-05, + "loss": 0.19351050853729249, + "step": 147670 + }, + { + "epoch": 0.6340211054154539, + "grad_norm": 1.0882714986801147, + "learning_rate": 3.675612048670697e-05, + "loss": 0.2964040279388428, + "step": 147680 + }, + { + "epoch": 0.6340640375054738, + "grad_norm": 0.010478765703737736, + "learning_rate": 3.6751808766589345e-05, + "loss": 0.1366469144821167, + "step": 147690 + }, + { + "epoch": 0.6341069695954938, + "grad_norm": 5.9954352378845215, + "learning_rate": 3.674749704647172e-05, + "loss": 0.3098312854766846, + "step": 147700 + }, + { + "epoch": 0.6341499016855139, + "grad_norm": 0.4428004026412964, + "learning_rate": 3.674318532635409e-05, + "loss": 0.22567715644836425, + "step": 147710 + }, + { + "epoch": 0.6341928337755338, + "grad_norm": 0.012649418786168098, + "learning_rate": 3.673887360623647e-05, + "loss": 0.13131637573242189, + "step": 147720 + }, + { + "epoch": 0.6342357658655539, + "grad_norm": 0.01736398972570896, + "learning_rate": 3.673456188611885e-05, + "loss": 0.05424131751060486, + "step": 147730 + }, + { + "epoch": 0.6342786979555739, + "grad_norm": 4.064063549041748, + "learning_rate": 3.673025016600123e-05, + "loss": 0.423872184753418, + "step": 147740 + }, + { + "epoch": 0.6343216300455938, + "grad_norm": 0.027415230870246887, + "learning_rate": 3.67259384458836e-05, + "loss": 0.313714599609375, + "step": 147750 + }, + { + "epoch": 0.6343645621356139, + "grad_norm": 22.213289260864258, + "learning_rate": 3.672162672576598e-05, + "loss": 0.1917173147201538, + "step": 147760 + }, + { + "epoch": 0.6344074942256339, + "grad_norm": 2.9939708709716797, + "learning_rate": 3.671731500564836e-05, + "loss": 0.3942455291748047, + "step": 147770 + }, + { + "epoch": 0.6344504263156538, + "grad_norm": 2.6282453536987305, + "learning_rate": 3.6713003285530735e-05, + "loss": 0.31463940143585206, + "step": 147780 + }, + { + "epoch": 0.6344933584056739, + "grad_norm": 0.0022995853796601295, + "learning_rate": 3.6708691565413105e-05, + "loss": 0.08896902799606324, + "step": 147790 + }, + { + "epoch": 0.6345362904956939, + "grad_norm": 0.004803662188351154, + "learning_rate": 3.670437984529548e-05, + "loss": 0.24017529487609862, + "step": 147800 + }, + { + "epoch": 0.6345792225857139, + "grad_norm": 2.3700520992279053, + "learning_rate": 3.670006812517786e-05, + "loss": 0.41024460792541506, + "step": 147810 + }, + { + "epoch": 0.6346221546757339, + "grad_norm": 0.014520244672894478, + "learning_rate": 3.669575640506024e-05, + "loss": 0.0028250502422451974, + "step": 147820 + }, + { + "epoch": 0.634665086765754, + "grad_norm": 0.009078883565962315, + "learning_rate": 3.669144468494261e-05, + "loss": 0.024532407522201538, + "step": 147830 + }, + { + "epoch": 0.634708018855774, + "grad_norm": 0.33582961559295654, + "learning_rate": 3.6687132964824985e-05, + "loss": 0.14474940299987793, + "step": 147840 + }, + { + "epoch": 0.6347509509457939, + "grad_norm": 6.57936954498291, + "learning_rate": 3.668282124470737e-05, + "loss": 0.33017749786376954, + "step": 147850 + }, + { + "epoch": 0.634793883035814, + "grad_norm": 0.01086271833628416, + "learning_rate": 3.6678509524589747e-05, + "loss": 0.26077311038970946, + "step": 147860 + }, + { + "epoch": 0.634836815125834, + "grad_norm": 1.8153256177902222, + "learning_rate": 3.667419780447212e-05, + "loss": 0.3809725046157837, + "step": 147870 + }, + { + "epoch": 0.6348797472158539, + "grad_norm": 0.39656689763069153, + "learning_rate": 3.6669886084354494e-05, + "loss": 0.10590251684188842, + "step": 147880 + }, + { + "epoch": 0.634922679305874, + "grad_norm": 0.20176301896572113, + "learning_rate": 3.666557436423687e-05, + "loss": 0.11742727756500244, + "step": 147890 + }, + { + "epoch": 0.634965611395894, + "grad_norm": 0.09392766654491425, + "learning_rate": 3.666126264411925e-05, + "loss": 0.2184234380722046, + "step": 147900 + }, + { + "epoch": 0.635008543485914, + "grad_norm": 0.08617518842220306, + "learning_rate": 3.665695092400162e-05, + "loss": 0.35497307777404785, + "step": 147910 + }, + { + "epoch": 0.635051475575934, + "grad_norm": 0.09503033012151718, + "learning_rate": 3.6652639203884e-05, + "loss": 0.1937497854232788, + "step": 147920 + }, + { + "epoch": 0.635094407665954, + "grad_norm": 1.621155858039856, + "learning_rate": 3.6648327483766374e-05, + "loss": 0.14177558422088624, + "step": 147930 + }, + { + "epoch": 0.635137339755974, + "grad_norm": 1.141985297203064, + "learning_rate": 3.664401576364875e-05, + "loss": 0.05915210247039795, + "step": 147940 + }, + { + "epoch": 0.635180271845994, + "grad_norm": 0.0722414031624794, + "learning_rate": 3.663970404353112e-05, + "loss": 0.25335264205932617, + "step": 147950 + }, + { + "epoch": 0.6352232039360141, + "grad_norm": 6.025696277618408, + "learning_rate": 3.6635392323413506e-05, + "loss": 0.1266070008277893, + "step": 147960 + }, + { + "epoch": 0.635266136026034, + "grad_norm": 4.48659086227417, + "learning_rate": 3.6631080603295884e-05, + "loss": 0.25072340965270995, + "step": 147970 + }, + { + "epoch": 0.635309068116054, + "grad_norm": 0.0004896190366707742, + "learning_rate": 3.662676888317826e-05, + "loss": 0.21098458766937256, + "step": 147980 + }, + { + "epoch": 0.6353520002060741, + "grad_norm": 2.152426242828369, + "learning_rate": 3.662245716306063e-05, + "loss": 0.37824833393096924, + "step": 147990 + }, + { + "epoch": 0.635394932296094, + "grad_norm": 2.1820719242095947, + "learning_rate": 3.661814544294301e-05, + "loss": 0.43947458267211914, + "step": 148000 + }, + { + "epoch": 0.635394932296094, + "eval_loss": 0.3964541256427765, + "eval_runtime": 27.1059, + "eval_samples_per_second": 3.689, + "eval_steps_per_second": 3.689, + "step": 148000 + }, + { + "epoch": 0.635437864386114, + "grad_norm": 2.4284281730651855, + "learning_rate": 3.6613833722825386e-05, + "loss": 0.16709811687469484, + "step": 148010 + }, + { + "epoch": 0.6354807964761341, + "grad_norm": 2.571385622024536, + "learning_rate": 3.6609522002707764e-05, + "loss": 0.2915945053100586, + "step": 148020 + }, + { + "epoch": 0.635523728566154, + "grad_norm": 0.009592088870704174, + "learning_rate": 3.6605210282590134e-05, + "loss": 0.06732832193374634, + "step": 148030 + }, + { + "epoch": 0.6355666606561741, + "grad_norm": 0.007833711802959442, + "learning_rate": 3.660089856247251e-05, + "loss": 0.3823229312896729, + "step": 148040 + }, + { + "epoch": 0.6356095927461941, + "grad_norm": 1.7308787107467651, + "learning_rate": 3.659658684235489e-05, + "loss": 0.2691195964813232, + "step": 148050 + }, + { + "epoch": 0.635652524836214, + "grad_norm": 0.0012639171909540892, + "learning_rate": 3.6592275122237266e-05, + "loss": 0.28667852878570554, + "step": 148060 + }, + { + "epoch": 0.6356954569262341, + "grad_norm": 1.2245522737503052, + "learning_rate": 3.6587963402119644e-05, + "loss": 0.4876396656036377, + "step": 148070 + }, + { + "epoch": 0.6357383890162541, + "grad_norm": 0.0028517525643110275, + "learning_rate": 3.658365168200202e-05, + "loss": 0.3154790163040161, + "step": 148080 + }, + { + "epoch": 0.635781321106274, + "grad_norm": 0.0017242819303646684, + "learning_rate": 3.65793399618844e-05, + "loss": 0.20070092678070067, + "step": 148090 + }, + { + "epoch": 0.6358242531962941, + "grad_norm": 0.022681070491671562, + "learning_rate": 3.6575028241766776e-05, + "loss": 0.35894730091094973, + "step": 148100 + }, + { + "epoch": 0.6358671852863141, + "grad_norm": 0.011042545549571514, + "learning_rate": 3.657071652164915e-05, + "loss": 0.3039877414703369, + "step": 148110 + }, + { + "epoch": 0.6359101173763341, + "grad_norm": 3.708176374435425, + "learning_rate": 3.6566404801531524e-05, + "loss": 0.32838003635406493, + "step": 148120 + }, + { + "epoch": 0.6359530494663541, + "grad_norm": 0.12351704388856888, + "learning_rate": 3.65620930814139e-05, + "loss": 0.29023327827453616, + "step": 148130 + }, + { + "epoch": 0.6359959815563742, + "grad_norm": 0.004000886343419552, + "learning_rate": 3.655778136129628e-05, + "loss": 0.07846883535385132, + "step": 148140 + }, + { + "epoch": 0.6360389136463941, + "grad_norm": 0.002440853975713253, + "learning_rate": 3.6553469641178656e-05, + "loss": 0.05644827485084534, + "step": 148150 + }, + { + "epoch": 0.6360818457364141, + "grad_norm": 2.0655572414398193, + "learning_rate": 3.6549157921061026e-05, + "loss": 0.42191014289855955, + "step": 148160 + }, + { + "epoch": 0.6361247778264342, + "grad_norm": 0.006922286003828049, + "learning_rate": 3.6544846200943403e-05, + "loss": 0.24610230922698975, + "step": 148170 + }, + { + "epoch": 0.6361677099164541, + "grad_norm": 2.2612147331237793, + "learning_rate": 3.654053448082578e-05, + "loss": 0.31474781036376953, + "step": 148180 + }, + { + "epoch": 0.6362106420064741, + "grad_norm": 0.3926016688346863, + "learning_rate": 3.653622276070816e-05, + "loss": 0.16994028091430663, + "step": 148190 + }, + { + "epoch": 0.6362535740964942, + "grad_norm": 3.7936487197875977, + "learning_rate": 3.6531911040590536e-05, + "loss": 0.19815267324447633, + "step": 148200 + }, + { + "epoch": 0.6362965061865141, + "grad_norm": 0.7687986493110657, + "learning_rate": 3.652759932047291e-05, + "loss": 0.29149169921875, + "step": 148210 + }, + { + "epoch": 0.6363394382765342, + "grad_norm": 3.6010568141937256, + "learning_rate": 3.652328760035529e-05, + "loss": 0.24495539665222169, + "step": 148220 + }, + { + "epoch": 0.6363823703665542, + "grad_norm": 0.02493220753967762, + "learning_rate": 3.651897588023767e-05, + "loss": 0.15526796579360963, + "step": 148230 + }, + { + "epoch": 0.6364253024565741, + "grad_norm": 0.38506048917770386, + "learning_rate": 3.651466416012004e-05, + "loss": 0.07274820804595947, + "step": 148240 + }, + { + "epoch": 0.6364682345465942, + "grad_norm": 1.6484607458114624, + "learning_rate": 3.6510352440002415e-05, + "loss": 0.18348608016967774, + "step": 148250 + }, + { + "epoch": 0.6365111666366142, + "grad_norm": 2.5601675510406494, + "learning_rate": 3.650604071988479e-05, + "loss": 0.4256871223449707, + "step": 148260 + }, + { + "epoch": 0.6365540987266343, + "grad_norm": 0.3567001521587372, + "learning_rate": 3.650172899976717e-05, + "loss": 0.14035037755966187, + "step": 148270 + }, + { + "epoch": 0.6365970308166542, + "grad_norm": 0.020381588488817215, + "learning_rate": 3.649741727964954e-05, + "loss": 0.2797467947006226, + "step": 148280 + }, + { + "epoch": 0.6366399629066742, + "grad_norm": 7.5753045082092285, + "learning_rate": 3.649310555953192e-05, + "loss": 0.4102283477783203, + "step": 148290 + }, + { + "epoch": 0.6366828949966943, + "grad_norm": 1.01334547996521, + "learning_rate": 3.6488793839414295e-05, + "loss": 0.2608525037765503, + "step": 148300 + }, + { + "epoch": 0.6367258270867142, + "grad_norm": 0.0031121079809963703, + "learning_rate": 3.648448211929667e-05, + "loss": 0.271061635017395, + "step": 148310 + }, + { + "epoch": 0.6367687591767343, + "grad_norm": 0.05686857923865318, + "learning_rate": 3.648017039917905e-05, + "loss": 0.255864691734314, + "step": 148320 + }, + { + "epoch": 0.6368116912667543, + "grad_norm": 0.0934736356139183, + "learning_rate": 3.647585867906143e-05, + "loss": 0.07495766282081603, + "step": 148330 + }, + { + "epoch": 0.6368546233567742, + "grad_norm": 1.5425732135772705, + "learning_rate": 3.6471546958943805e-05, + "loss": 0.34494857788085936, + "step": 148340 + }, + { + "epoch": 0.6368975554467943, + "grad_norm": 0.10615680366754532, + "learning_rate": 3.646723523882618e-05, + "loss": 0.15554267168045044, + "step": 148350 + }, + { + "epoch": 0.6369404875368143, + "grad_norm": 0.8117842078208923, + "learning_rate": 3.646292351870855e-05, + "loss": 0.12490513324737548, + "step": 148360 + }, + { + "epoch": 0.6369834196268342, + "grad_norm": 1.5500012636184692, + "learning_rate": 3.645861179859093e-05, + "loss": 0.12137371301651001, + "step": 148370 + }, + { + "epoch": 0.6370263517168543, + "grad_norm": 2.9333624839782715, + "learning_rate": 3.645430007847331e-05, + "loss": 0.2662505149841309, + "step": 148380 + }, + { + "epoch": 0.6370692838068743, + "grad_norm": 0.11318331956863403, + "learning_rate": 3.6449988358355685e-05, + "loss": 0.23979718685150148, + "step": 148390 + }, + { + "epoch": 0.6371122158968943, + "grad_norm": 1.10670804977417, + "learning_rate": 3.6445676638238055e-05, + "loss": 0.2728361845016479, + "step": 148400 + }, + { + "epoch": 0.6371551479869143, + "grad_norm": 4.697409152984619, + "learning_rate": 3.644136491812043e-05, + "loss": 0.39655683040618894, + "step": 148410 + }, + { + "epoch": 0.6371980800769343, + "grad_norm": 0.01723591797053814, + "learning_rate": 3.643705319800281e-05, + "loss": 0.2219693899154663, + "step": 148420 + }, + { + "epoch": 0.6372410121669543, + "grad_norm": 1.3442760705947876, + "learning_rate": 3.643274147788519e-05, + "loss": 0.3207077503204346, + "step": 148430 + }, + { + "epoch": 0.6372839442569743, + "grad_norm": 0.0006233075400814414, + "learning_rate": 3.642842975776757e-05, + "loss": 0.16337018013000487, + "step": 148440 + }, + { + "epoch": 0.6373268763469944, + "grad_norm": 0.010497170500457287, + "learning_rate": 3.642411803764994e-05, + "loss": 0.2993746757507324, + "step": 148450 + }, + { + "epoch": 0.6373698084370143, + "grad_norm": 0.06728941202163696, + "learning_rate": 3.641980631753232e-05, + "loss": 0.32750234603881834, + "step": 148460 + }, + { + "epoch": 0.6374127405270343, + "grad_norm": 15.103754043579102, + "learning_rate": 3.64154945974147e-05, + "loss": 0.4762892723083496, + "step": 148470 + }, + { + "epoch": 0.6374556726170544, + "grad_norm": 2.0977084636688232, + "learning_rate": 3.6411182877297074e-05, + "loss": 0.39258265495300293, + "step": 148480 + }, + { + "epoch": 0.6374986047070743, + "grad_norm": 0.9504750370979309, + "learning_rate": 3.6406871157179445e-05, + "loss": 0.24261243343353273, + "step": 148490 + }, + { + "epoch": 0.6375415367970944, + "grad_norm": 0.007759752683341503, + "learning_rate": 3.640255943706182e-05, + "loss": 0.12828243970870973, + "step": 148500 + }, + { + "epoch": 0.6375844688871144, + "grad_norm": 0.061698079109191895, + "learning_rate": 3.63982477169442e-05, + "loss": 0.30746870040893554, + "step": 148510 + }, + { + "epoch": 0.6376274009771343, + "grad_norm": 0.06339918076992035, + "learning_rate": 3.6393935996826577e-05, + "loss": 0.1409119725227356, + "step": 148520 + }, + { + "epoch": 0.6376703330671544, + "grad_norm": 0.015551037155091763, + "learning_rate": 3.638962427670895e-05, + "loss": 0.14194364547729493, + "step": 148530 + }, + { + "epoch": 0.6377132651571744, + "grad_norm": 0.5721806287765503, + "learning_rate": 3.6385312556591325e-05, + "loss": 0.2990145444869995, + "step": 148540 + }, + { + "epoch": 0.6377561972471943, + "grad_norm": 0.38188424706459045, + "learning_rate": 3.638100083647371e-05, + "loss": 0.18333234786987304, + "step": 148550 + }, + { + "epoch": 0.6377991293372144, + "grad_norm": 0.04208914935588837, + "learning_rate": 3.6376689116356086e-05, + "loss": 0.10026679039001465, + "step": 148560 + }, + { + "epoch": 0.6378420614272344, + "grad_norm": 0.04540753737092018, + "learning_rate": 3.6372377396238457e-05, + "loss": 0.0547387421131134, + "step": 148570 + }, + { + "epoch": 0.6378849935172544, + "grad_norm": 0.03880661725997925, + "learning_rate": 3.6368065676120834e-05, + "loss": 0.24963154792785644, + "step": 148580 + }, + { + "epoch": 0.6379279256072744, + "grad_norm": 0.003545165527611971, + "learning_rate": 3.636375395600321e-05, + "loss": 0.11689521074295044, + "step": 148590 + }, + { + "epoch": 0.6379708576972944, + "grad_norm": 0.830731987953186, + "learning_rate": 3.635944223588559e-05, + "loss": 0.30333313941955564, + "step": 148600 + }, + { + "epoch": 0.6380137897873144, + "grad_norm": 0.031620461493730545, + "learning_rate": 3.635513051576796e-05, + "loss": 0.23221213817596437, + "step": 148610 + }, + { + "epoch": 0.6380567218773344, + "grad_norm": 0.02347370609641075, + "learning_rate": 3.6350818795650336e-05, + "loss": 0.4794201374053955, + "step": 148620 + }, + { + "epoch": 0.6380996539673545, + "grad_norm": 7.367602348327637, + "learning_rate": 3.6346507075532714e-05, + "loss": 0.2784811735153198, + "step": 148630 + }, + { + "epoch": 0.6381425860573744, + "grad_norm": 0.06946471333503723, + "learning_rate": 3.634219535541509e-05, + "loss": 0.36514983177185056, + "step": 148640 + }, + { + "epoch": 0.6381855181473944, + "grad_norm": 1.8712137937545776, + "learning_rate": 3.633788363529746e-05, + "loss": 0.1741746187210083, + "step": 148650 + }, + { + "epoch": 0.6382284502374145, + "grad_norm": 0.005989521741867065, + "learning_rate": 3.6333571915179846e-05, + "loss": 0.2726861000061035, + "step": 148660 + }, + { + "epoch": 0.6382713823274344, + "grad_norm": 0.04366813972592354, + "learning_rate": 3.632926019506222e-05, + "loss": 0.10254474878311157, + "step": 148670 + }, + { + "epoch": 0.6383143144174545, + "grad_norm": 0.22113563120365143, + "learning_rate": 3.63249484749446e-05, + "loss": 0.05884222984313965, + "step": 148680 + }, + { + "epoch": 0.6383572465074745, + "grad_norm": 0.21171332895755768, + "learning_rate": 3.632063675482697e-05, + "loss": 0.024245685338973998, + "step": 148690 + }, + { + "epoch": 0.6384001785974945, + "grad_norm": 9.799121856689453, + "learning_rate": 3.631632503470935e-05, + "loss": 0.2564573049545288, + "step": 148700 + }, + { + "epoch": 0.6384431106875145, + "grad_norm": 0.48802077770233154, + "learning_rate": 3.6312013314591726e-05, + "loss": 0.1415635108947754, + "step": 148710 + }, + { + "epoch": 0.6384860427775345, + "grad_norm": 0.14215391874313354, + "learning_rate": 3.63077015944741e-05, + "loss": 0.1812630772590637, + "step": 148720 + }, + { + "epoch": 0.6385289748675546, + "grad_norm": 1.549190878868103, + "learning_rate": 3.6303389874356474e-05, + "loss": 0.3076681137084961, + "step": 148730 + }, + { + "epoch": 0.6385719069575745, + "grad_norm": 0.18916155397891998, + "learning_rate": 3.629907815423885e-05, + "loss": 0.014893820881843567, + "step": 148740 + }, + { + "epoch": 0.6386148390475945, + "grad_norm": 0.02583307959139347, + "learning_rate": 3.629476643412123e-05, + "loss": 0.20866930484771729, + "step": 148750 + }, + { + "epoch": 0.6386577711376146, + "grad_norm": 0.04389571025967598, + "learning_rate": 3.6290454714003606e-05, + "loss": 0.3857304096221924, + "step": 148760 + }, + { + "epoch": 0.6387007032276345, + "grad_norm": 0.02454189956188202, + "learning_rate": 3.628614299388598e-05, + "loss": 0.09770624041557312, + "step": 148770 + }, + { + "epoch": 0.6387436353176545, + "grad_norm": 4.198458671569824, + "learning_rate": 3.628183127376836e-05, + "loss": 0.1682787537574768, + "step": 148780 + }, + { + "epoch": 0.6387865674076746, + "grad_norm": 0.01082449872046709, + "learning_rate": 3.627751955365074e-05, + "loss": 0.10119086503982544, + "step": 148790 + }, + { + "epoch": 0.6388294994976945, + "grad_norm": 2.1636593341827393, + "learning_rate": 3.6273207833533115e-05, + "loss": 0.10471152067184449, + "step": 148800 + }, + { + "epoch": 0.6388724315877146, + "grad_norm": 0.005232629831880331, + "learning_rate": 3.626889611341549e-05, + "loss": 0.12240467071533204, + "step": 148810 + }, + { + "epoch": 0.6389153636777346, + "grad_norm": 0.06836547702550888, + "learning_rate": 3.626458439329786e-05, + "loss": 0.25388565063476565, + "step": 148820 + }, + { + "epoch": 0.6389582957677545, + "grad_norm": 0.004365882370620966, + "learning_rate": 3.626027267318024e-05, + "loss": 0.3014135599136353, + "step": 148830 + }, + { + "epoch": 0.6390012278577746, + "grad_norm": 0.007590130437165499, + "learning_rate": 3.625596095306262e-05, + "loss": 0.04911408424377441, + "step": 148840 + }, + { + "epoch": 0.6390441599477946, + "grad_norm": 0.01083632092922926, + "learning_rate": 3.6251649232944995e-05, + "loss": 0.10735723972320557, + "step": 148850 + }, + { + "epoch": 0.6390870920378146, + "grad_norm": 0.0025522809009999037, + "learning_rate": 3.6247337512827366e-05, + "loss": 0.09026329517364502, + "step": 148860 + }, + { + "epoch": 0.6391300241278346, + "grad_norm": 0.021843230351805687, + "learning_rate": 3.624302579270974e-05, + "loss": 0.11751435995101929, + "step": 148870 + }, + { + "epoch": 0.6391729562178546, + "grad_norm": 1.4416433572769165, + "learning_rate": 3.623871407259212e-05, + "loss": 0.1340113401412964, + "step": 148880 + }, + { + "epoch": 0.6392158883078746, + "grad_norm": 0.9927673935890198, + "learning_rate": 3.62344023524745e-05, + "loss": 0.22968411445617676, + "step": 148890 + }, + { + "epoch": 0.6392588203978946, + "grad_norm": 0.056031033396720886, + "learning_rate": 3.6230090632356875e-05, + "loss": 0.21517865657806395, + "step": 148900 + }, + { + "epoch": 0.6393017524879147, + "grad_norm": 0.17687752842903137, + "learning_rate": 3.622577891223925e-05, + "loss": 0.1644793391227722, + "step": 148910 + }, + { + "epoch": 0.6393446845779346, + "grad_norm": 0.0010743543971329927, + "learning_rate": 3.622146719212163e-05, + "loss": 0.13042839765548705, + "step": 148920 + }, + { + "epoch": 0.6393876166679546, + "grad_norm": 0.0013966663973405957, + "learning_rate": 3.621715547200401e-05, + "loss": 0.2743825912475586, + "step": 148930 + }, + { + "epoch": 0.6394305487579747, + "grad_norm": 3.341494083404541, + "learning_rate": 3.621284375188638e-05, + "loss": 0.34465975761413575, + "step": 148940 + }, + { + "epoch": 0.6394734808479946, + "grad_norm": 0.004896739963442087, + "learning_rate": 3.6208532031768755e-05, + "loss": 0.11658406257629395, + "step": 148950 + }, + { + "epoch": 0.6395164129380146, + "grad_norm": 6.2574992179870605, + "learning_rate": 3.620422031165113e-05, + "loss": 0.22075233459472657, + "step": 148960 + }, + { + "epoch": 0.6395593450280347, + "grad_norm": 3.035881519317627, + "learning_rate": 3.619990859153351e-05, + "loss": 0.28051533699035647, + "step": 148970 + }, + { + "epoch": 0.6396022771180546, + "grad_norm": 0.0007945537799969316, + "learning_rate": 3.619559687141588e-05, + "loss": 0.14235063791275024, + "step": 148980 + }, + { + "epoch": 0.6396452092080747, + "grad_norm": 0.0015852749347686768, + "learning_rate": 3.619128515129826e-05, + "loss": 0.19316439628601073, + "step": 148990 + }, + { + "epoch": 0.6396881412980947, + "grad_norm": 0.0012423275038599968, + "learning_rate": 3.6186973431180635e-05, + "loss": 0.5619192600250245, + "step": 149000 + }, + { + "epoch": 0.6396881412980947, + "eval_loss": 0.3982442021369934, + "eval_runtime": 27.134, + "eval_samples_per_second": 3.685, + "eval_steps_per_second": 3.685, + "step": 149000 + }, + { + "epoch": 0.6397310733881146, + "grad_norm": 0.22282634675502777, + "learning_rate": 3.618266171106301e-05, + "loss": 0.17875736951828003, + "step": 149010 + }, + { + "epoch": 0.6397740054781347, + "grad_norm": 0.35734257102012634, + "learning_rate": 3.617834999094539e-05, + "loss": 0.24705617427825927, + "step": 149020 + }, + { + "epoch": 0.6398169375681547, + "grad_norm": 7.129267692565918, + "learning_rate": 3.617403827082777e-05, + "loss": 0.394257926940918, + "step": 149030 + }, + { + "epoch": 0.6398598696581747, + "grad_norm": 0.5210902690887451, + "learning_rate": 3.6169726550710144e-05, + "loss": 0.0944884955883026, + "step": 149040 + }, + { + "epoch": 0.6399028017481947, + "grad_norm": 0.014846066944301128, + "learning_rate": 3.616541483059252e-05, + "loss": 0.2939175605773926, + "step": 149050 + }, + { + "epoch": 0.6399457338382147, + "grad_norm": 0.014867091551423073, + "learning_rate": 3.616110311047489e-05, + "loss": 0.06061587929725647, + "step": 149060 + }, + { + "epoch": 0.6399886659282347, + "grad_norm": 0.6271289587020874, + "learning_rate": 3.615679139035727e-05, + "loss": 0.25383543968200684, + "step": 149070 + }, + { + "epoch": 0.6400315980182547, + "grad_norm": 0.0007486839895136654, + "learning_rate": 3.615247967023965e-05, + "loss": 0.002995048649609089, + "step": 149080 + }, + { + "epoch": 0.6400745301082748, + "grad_norm": 0.9311636090278625, + "learning_rate": 3.6148167950122024e-05, + "loss": 0.3089101552963257, + "step": 149090 + }, + { + "epoch": 0.6401174621982947, + "grad_norm": 0.06805947422981262, + "learning_rate": 3.6143856230004395e-05, + "loss": 0.152249014377594, + "step": 149100 + }, + { + "epoch": 0.6401603942883147, + "grad_norm": 6.076711654663086, + "learning_rate": 3.613954450988677e-05, + "loss": 0.4289687156677246, + "step": 149110 + }, + { + "epoch": 0.6402033263783348, + "grad_norm": 0.010889571160078049, + "learning_rate": 3.613523278976915e-05, + "loss": 0.14229416847229004, + "step": 149120 + }, + { + "epoch": 0.6402462584683548, + "grad_norm": 0.024119842797517776, + "learning_rate": 3.613092106965153e-05, + "loss": 0.18358902931213378, + "step": 149130 + }, + { + "epoch": 0.6402891905583747, + "grad_norm": 2.6810688972473145, + "learning_rate": 3.612660934953391e-05, + "loss": 0.23319919109344484, + "step": 149140 + }, + { + "epoch": 0.6403321226483948, + "grad_norm": 0.3877185583114624, + "learning_rate": 3.612229762941628e-05, + "loss": 0.05466576218605042, + "step": 149150 + }, + { + "epoch": 0.6403750547384148, + "grad_norm": 3.808795928955078, + "learning_rate": 3.611798590929866e-05, + "loss": 0.2208317995071411, + "step": 149160 + }, + { + "epoch": 0.6404179868284348, + "grad_norm": 0.43981850147247314, + "learning_rate": 3.6113674189181036e-05, + "loss": 0.04135819673538208, + "step": 149170 + }, + { + "epoch": 0.6404609189184548, + "grad_norm": 0.23187753558158875, + "learning_rate": 3.6109362469063413e-05, + "loss": 0.21757895946502687, + "step": 149180 + }, + { + "epoch": 0.6405038510084748, + "grad_norm": 0.01140986941754818, + "learning_rate": 3.6105050748945784e-05, + "loss": 0.1515246272087097, + "step": 149190 + }, + { + "epoch": 0.6405467830984948, + "grad_norm": 0.01072423905134201, + "learning_rate": 3.610073902882816e-05, + "loss": 0.22516722679138185, + "step": 149200 + }, + { + "epoch": 0.6405897151885148, + "grad_norm": 0.002811270533129573, + "learning_rate": 3.609642730871054e-05, + "loss": 0.09642552137374878, + "step": 149210 + }, + { + "epoch": 0.6406326472785349, + "grad_norm": 0.03889846429228783, + "learning_rate": 3.6092115588592916e-05, + "loss": 0.061399024724960324, + "step": 149220 + }, + { + "epoch": 0.6406755793685548, + "grad_norm": 0.0017019611550495028, + "learning_rate": 3.608780386847529e-05, + "loss": 0.258405876159668, + "step": 149230 + }, + { + "epoch": 0.6407185114585748, + "grad_norm": 0.8725544810295105, + "learning_rate": 3.6083492148357664e-05, + "loss": 0.1622206449508667, + "step": 149240 + }, + { + "epoch": 0.6407614435485949, + "grad_norm": 1.6804289817810059, + "learning_rate": 3.607918042824005e-05, + "loss": 0.42448744773864744, + "step": 149250 + }, + { + "epoch": 0.6408043756386148, + "grad_norm": 5.870574474334717, + "learning_rate": 3.6074868708122425e-05, + "loss": 0.2516177654266357, + "step": 149260 + }, + { + "epoch": 0.6408473077286349, + "grad_norm": 2.136117696762085, + "learning_rate": 3.6070556988004796e-05, + "loss": 0.2335583209991455, + "step": 149270 + }, + { + "epoch": 0.6408902398186549, + "grad_norm": 0.018576808273792267, + "learning_rate": 3.606624526788717e-05, + "loss": 0.06257337927818299, + "step": 149280 + }, + { + "epoch": 0.6409331719086748, + "grad_norm": 0.4843984544277191, + "learning_rate": 3.606193354776955e-05, + "loss": 0.3152668237686157, + "step": 149290 + }, + { + "epoch": 0.6409761039986949, + "grad_norm": 0.15813709795475006, + "learning_rate": 3.605762182765193e-05, + "loss": 0.1964523434638977, + "step": 149300 + }, + { + "epoch": 0.6410190360887149, + "grad_norm": 1.1011407375335693, + "learning_rate": 3.60533101075343e-05, + "loss": 0.17502723932266234, + "step": 149310 + }, + { + "epoch": 0.6410619681787348, + "grad_norm": 6.5771684646606445, + "learning_rate": 3.6048998387416676e-05, + "loss": 0.2309612512588501, + "step": 149320 + }, + { + "epoch": 0.6411049002687549, + "grad_norm": 0.05764686316251755, + "learning_rate": 3.604468666729905e-05, + "loss": 0.23601062297821046, + "step": 149330 + }, + { + "epoch": 0.6411478323587749, + "grad_norm": 2.103090286254883, + "learning_rate": 3.604037494718143e-05, + "loss": 0.3898202657699585, + "step": 149340 + }, + { + "epoch": 0.6411907644487949, + "grad_norm": 0.07793723046779633, + "learning_rate": 3.60360632270638e-05, + "loss": 0.09205517172813416, + "step": 149350 + }, + { + "epoch": 0.6412336965388149, + "grad_norm": 1.2947548627853394, + "learning_rate": 3.6031751506946185e-05, + "loss": 0.1622360348701477, + "step": 149360 + }, + { + "epoch": 0.6412766286288349, + "grad_norm": 1.644738793373108, + "learning_rate": 3.602743978682856e-05, + "loss": 0.3392355918884277, + "step": 149370 + }, + { + "epoch": 0.6413195607188549, + "grad_norm": 0.00325818732380867, + "learning_rate": 3.602312806671094e-05, + "loss": 0.11670932769775391, + "step": 149380 + }, + { + "epoch": 0.6413624928088749, + "grad_norm": 0.013296050950884819, + "learning_rate": 3.601881634659331e-05, + "loss": 0.22430813312530518, + "step": 149390 + }, + { + "epoch": 0.641405424898895, + "grad_norm": 0.0017144365701824427, + "learning_rate": 3.601450462647569e-05, + "loss": 0.0805868923664093, + "step": 149400 + }, + { + "epoch": 0.6414483569889149, + "grad_norm": 0.010498907417058945, + "learning_rate": 3.6010192906358065e-05, + "loss": 0.1966702699661255, + "step": 149410 + }, + { + "epoch": 0.6414912890789349, + "grad_norm": 0.0028181958477944136, + "learning_rate": 3.600588118624044e-05, + "loss": 0.07636668682098388, + "step": 149420 + }, + { + "epoch": 0.641534221168955, + "grad_norm": 0.004065072163939476, + "learning_rate": 3.600156946612281e-05, + "loss": 0.2734179735183716, + "step": 149430 + }, + { + "epoch": 0.6415771532589749, + "grad_norm": 0.007763869594782591, + "learning_rate": 3.599725774600519e-05, + "loss": 0.23989946842193605, + "step": 149440 + }, + { + "epoch": 0.641620085348995, + "grad_norm": 1.1928882598876953, + "learning_rate": 3.599294602588757e-05, + "loss": 0.2867441177368164, + "step": 149450 + }, + { + "epoch": 0.641663017439015, + "grad_norm": 0.0222454946488142, + "learning_rate": 3.5988634305769945e-05, + "loss": 0.07181630730628967, + "step": 149460 + }, + { + "epoch": 0.6417059495290349, + "grad_norm": 1.8484033346176147, + "learning_rate": 3.598432258565232e-05, + "loss": 0.32464113235473635, + "step": 149470 + }, + { + "epoch": 0.641748881619055, + "grad_norm": 0.012596558779478073, + "learning_rate": 3.59800108655347e-05, + "loss": 0.0818311870098114, + "step": 149480 + }, + { + "epoch": 0.641791813709075, + "grad_norm": 0.024298356845974922, + "learning_rate": 3.597569914541708e-05, + "loss": 0.20316953659057618, + "step": 149490 + }, + { + "epoch": 0.6418347457990949, + "grad_norm": 0.16192997992038727, + "learning_rate": 3.5971387425299455e-05, + "loss": 0.008682883530855178, + "step": 149500 + }, + { + "epoch": 0.641877677889115, + "grad_norm": 0.2028181105852127, + "learning_rate": 3.596707570518183e-05, + "loss": 0.15205684900283814, + "step": 149510 + }, + { + "epoch": 0.641920609979135, + "grad_norm": 1.6921296119689941, + "learning_rate": 3.59627639850642e-05, + "loss": 0.2163633108139038, + "step": 149520 + }, + { + "epoch": 0.641963542069155, + "grad_norm": 1.2315031290054321, + "learning_rate": 3.595845226494658e-05, + "loss": 0.3820303678512573, + "step": 149530 + }, + { + "epoch": 0.642006474159175, + "grad_norm": 0.001260005752556026, + "learning_rate": 3.595414054482896e-05, + "loss": 0.252690863609314, + "step": 149540 + }, + { + "epoch": 0.642049406249195, + "grad_norm": 0.0427839457988739, + "learning_rate": 3.5949828824711335e-05, + "loss": 0.1781385898590088, + "step": 149550 + }, + { + "epoch": 0.6420923383392151, + "grad_norm": 1.7424696683883667, + "learning_rate": 3.5945517104593705e-05, + "loss": 0.091168612241745, + "step": 149560 + }, + { + "epoch": 0.642135270429235, + "grad_norm": 0.024264035746455193, + "learning_rate": 3.594120538447608e-05, + "loss": 0.1589852452278137, + "step": 149570 + }, + { + "epoch": 0.6421782025192551, + "grad_norm": 2.4907279014587402, + "learning_rate": 3.593689366435846e-05, + "loss": 0.24225354194641113, + "step": 149580 + }, + { + "epoch": 0.6422211346092751, + "grad_norm": 0.03332900255918503, + "learning_rate": 3.593258194424084e-05, + "loss": 0.3125003814697266, + "step": 149590 + }, + { + "epoch": 0.642264066699295, + "grad_norm": 1.9512094259262085, + "learning_rate": 3.5928270224123214e-05, + "loss": 0.08376158475875854, + "step": 149600 + }, + { + "epoch": 0.6423069987893151, + "grad_norm": 0.01022291649132967, + "learning_rate": 3.592395850400559e-05, + "loss": 0.24850192070007324, + "step": 149610 + }, + { + "epoch": 0.6423499308793351, + "grad_norm": 0.0009068639483302832, + "learning_rate": 3.591964678388797e-05, + "loss": 0.24298958778381347, + "step": 149620 + }, + { + "epoch": 0.642392862969355, + "grad_norm": 0.8004212975502014, + "learning_rate": 3.5915335063770346e-05, + "loss": 0.3129880428314209, + "step": 149630 + }, + { + "epoch": 0.6424357950593751, + "grad_norm": 7.949131011962891, + "learning_rate": 3.591102334365272e-05, + "loss": 0.28877594470977785, + "step": 149640 + }, + { + "epoch": 0.6424787271493951, + "grad_norm": 1.1796244382858276, + "learning_rate": 3.5906711623535094e-05, + "loss": 0.22746336460113525, + "step": 149650 + }, + { + "epoch": 0.6425216592394151, + "grad_norm": 3.379171848297119, + "learning_rate": 3.590239990341747e-05, + "loss": 0.12769432067871095, + "step": 149660 + }, + { + "epoch": 0.6425645913294351, + "grad_norm": 2.4193947315216064, + "learning_rate": 3.589808818329985e-05, + "loss": 0.19697859287261962, + "step": 149670 + }, + { + "epoch": 0.6426075234194552, + "grad_norm": 0.005682434421032667, + "learning_rate": 3.589377646318222e-05, + "loss": 0.0763977587223053, + "step": 149680 + }, + { + "epoch": 0.6426504555094751, + "grad_norm": 0.0972975641489029, + "learning_rate": 3.58894647430646e-05, + "loss": 0.009377355128526688, + "step": 149690 + }, + { + "epoch": 0.6426933875994951, + "grad_norm": 0.036606717854738235, + "learning_rate": 3.5885153022946974e-05, + "loss": 0.13104115724563598, + "step": 149700 + }, + { + "epoch": 0.6427363196895152, + "grad_norm": 0.002917773788794875, + "learning_rate": 3.588084130282935e-05, + "loss": 0.1379793882369995, + "step": 149710 + }, + { + "epoch": 0.6427792517795351, + "grad_norm": 2.2030205726623535, + "learning_rate": 3.587652958271173e-05, + "loss": 0.24357876777648926, + "step": 149720 + }, + { + "epoch": 0.6428221838695551, + "grad_norm": 3.2720603942871094, + "learning_rate": 3.5872217862594106e-05, + "loss": 0.2897838592529297, + "step": 149730 + }, + { + "epoch": 0.6428651159595752, + "grad_norm": 0.0052750613540410995, + "learning_rate": 3.5867906142476484e-05, + "loss": 0.09423772692680359, + "step": 149740 + }, + { + "epoch": 0.6429080480495951, + "grad_norm": 0.8562742471694946, + "learning_rate": 3.586359442235886e-05, + "loss": 0.3347553968429565, + "step": 149750 + }, + { + "epoch": 0.6429509801396152, + "grad_norm": 1.6895736455917358, + "learning_rate": 3.585928270224123e-05, + "loss": 0.35067009925842285, + "step": 149760 + }, + { + "epoch": 0.6429939122296352, + "grad_norm": 0.02921820618212223, + "learning_rate": 3.585497098212361e-05, + "loss": 0.061477482318878174, + "step": 149770 + }, + { + "epoch": 0.6430368443196551, + "grad_norm": 1.9546409845352173, + "learning_rate": 3.5850659262005986e-05, + "loss": 0.23800258636474608, + "step": 149780 + }, + { + "epoch": 0.6430797764096752, + "grad_norm": 0.0008198361028917134, + "learning_rate": 3.5846347541888364e-05, + "loss": 0.2178973913192749, + "step": 149790 + }, + { + "epoch": 0.6431227084996952, + "grad_norm": 0.0036720235366374254, + "learning_rate": 3.5842035821770734e-05, + "loss": 0.17330249547958373, + "step": 149800 + }, + { + "epoch": 0.6431656405897151, + "grad_norm": 6.70130729675293, + "learning_rate": 3.583772410165311e-05, + "loss": 0.3165444374084473, + "step": 149810 + }, + { + "epoch": 0.6432085726797352, + "grad_norm": 2.2758169174194336, + "learning_rate": 3.583341238153549e-05, + "loss": 0.23460037708282472, + "step": 149820 + }, + { + "epoch": 0.6432515047697552, + "grad_norm": 0.012248000130057335, + "learning_rate": 3.5829100661417866e-05, + "loss": 0.15115057229995726, + "step": 149830 + }, + { + "epoch": 0.6432944368597752, + "grad_norm": 0.01865295320749283, + "learning_rate": 3.5824788941300244e-05, + "loss": 0.17717584371566772, + "step": 149840 + }, + { + "epoch": 0.6433373689497952, + "grad_norm": 0.1970166563987732, + "learning_rate": 3.582047722118262e-05, + "loss": 0.1073176383972168, + "step": 149850 + }, + { + "epoch": 0.6433803010398152, + "grad_norm": 0.01511879451572895, + "learning_rate": 3.5816165501065e-05, + "loss": 0.20396804809570312, + "step": 149860 + }, + { + "epoch": 0.6434232331298352, + "grad_norm": 2.3057587146759033, + "learning_rate": 3.5811853780947376e-05, + "loss": 0.44550137519836425, + "step": 149870 + }, + { + "epoch": 0.6434661652198552, + "grad_norm": 0.01049025822430849, + "learning_rate": 3.580754206082975e-05, + "loss": 0.06601186990737914, + "step": 149880 + }, + { + "epoch": 0.6435090973098753, + "grad_norm": 0.004382689017802477, + "learning_rate": 3.5803230340712124e-05, + "loss": 0.08267223834991455, + "step": 149890 + }, + { + "epoch": 0.6435520293998952, + "grad_norm": 1.260653018951416, + "learning_rate": 3.57989186205945e-05, + "loss": 0.18585045337677003, + "step": 149900 + }, + { + "epoch": 0.6435949614899152, + "grad_norm": 0.01793503761291504, + "learning_rate": 3.579460690047688e-05, + "loss": 0.27620763778686525, + "step": 149910 + }, + { + "epoch": 0.6436378935799353, + "grad_norm": 0.009449219331145287, + "learning_rate": 3.5790295180359256e-05, + "loss": 0.17628442049026488, + "step": 149920 + }, + { + "epoch": 0.6436808256699552, + "grad_norm": 0.006964544299989939, + "learning_rate": 3.5785983460241626e-05, + "loss": 0.06601274013519287, + "step": 149930 + }, + { + "epoch": 0.6437237577599753, + "grad_norm": 0.03464784100651741, + "learning_rate": 3.5781671740124003e-05, + "loss": 0.1950068950653076, + "step": 149940 + }, + { + "epoch": 0.6437666898499953, + "grad_norm": 0.5150058269500732, + "learning_rate": 3.577736002000638e-05, + "loss": 0.07730207443237305, + "step": 149950 + }, + { + "epoch": 0.6438096219400152, + "grad_norm": 0.0041276682168245316, + "learning_rate": 3.5773048299888765e-05, + "loss": 0.15968955755233766, + "step": 149960 + }, + { + "epoch": 0.6438525540300353, + "grad_norm": 0.006611963734030724, + "learning_rate": 3.5768736579771135e-05, + "loss": 0.2820572853088379, + "step": 149970 + }, + { + "epoch": 0.6438954861200553, + "grad_norm": 2.0153894424438477, + "learning_rate": 3.576442485965351e-05, + "loss": 0.19689462184906006, + "step": 149980 + }, + { + "epoch": 0.6439384182100752, + "grad_norm": 1.3693201541900635, + "learning_rate": 3.576011313953589e-05, + "loss": 0.2069246530532837, + "step": 149990 + }, + { + "epoch": 0.6439813503000953, + "grad_norm": 0.00849225465208292, + "learning_rate": 3.575580141941827e-05, + "loss": 0.2323996067047119, + "step": 150000 + }, + { + "epoch": 0.6439813503000953, + "eval_loss": 0.3989526629447937, + "eval_runtime": 27.1345, + "eval_samples_per_second": 3.685, + "eval_steps_per_second": 3.685, + "step": 150000 + }, + { + "epoch": 0.6440242823901153, + "grad_norm": 0.24946488440036774, + "learning_rate": 3.575148969930064e-05, + "loss": 0.05341393947601318, + "step": 150010 + }, + { + "epoch": 0.6440672144801354, + "grad_norm": 0.16969804465770721, + "learning_rate": 3.5747177979183015e-05, + "loss": 0.2269148349761963, + "step": 150020 + }, + { + "epoch": 0.6441101465701553, + "grad_norm": 0.014137118123471737, + "learning_rate": 3.574286625906539e-05, + "loss": 0.10535632371902466, + "step": 150030 + }, + { + "epoch": 0.6441530786601753, + "grad_norm": 4.645323276519775, + "learning_rate": 3.573855453894777e-05, + "loss": 0.23873796463012695, + "step": 150040 + }, + { + "epoch": 0.6441960107501954, + "grad_norm": 0.22043707966804504, + "learning_rate": 3.573424281883014e-05, + "loss": 0.10489349365234375, + "step": 150050 + }, + { + "epoch": 0.6442389428402153, + "grad_norm": 0.005896933376789093, + "learning_rate": 3.572993109871252e-05, + "loss": 0.3532412052154541, + "step": 150060 + }, + { + "epoch": 0.6442818749302354, + "grad_norm": 0.05356534942984581, + "learning_rate": 3.57256193785949e-05, + "loss": 0.1797205090522766, + "step": 150070 + }, + { + "epoch": 0.6443248070202554, + "grad_norm": 8.781749725341797, + "learning_rate": 3.572130765847728e-05, + "loss": 0.06868206858634948, + "step": 150080 + }, + { + "epoch": 0.6443677391102753, + "grad_norm": 2.9030253887176514, + "learning_rate": 3.571699593835965e-05, + "loss": 0.2557013511657715, + "step": 150090 + }, + { + "epoch": 0.6444106712002954, + "grad_norm": 0.012084102258086205, + "learning_rate": 3.571268421824203e-05, + "loss": 0.21030721664428711, + "step": 150100 + }, + { + "epoch": 0.6444536032903154, + "grad_norm": 0.0009573132847435772, + "learning_rate": 3.5708372498124405e-05, + "loss": 0.20109183788299562, + "step": 150110 + }, + { + "epoch": 0.6444965353803354, + "grad_norm": 0.0013460484333336353, + "learning_rate": 3.570406077800678e-05, + "loss": 0.20196888446807862, + "step": 150120 + }, + { + "epoch": 0.6445394674703554, + "grad_norm": 2.3234500885009766, + "learning_rate": 3.569974905788915e-05, + "loss": 0.3194742202758789, + "step": 150130 + }, + { + "epoch": 0.6445823995603754, + "grad_norm": 0.09026148170232773, + "learning_rate": 3.569543733777153e-05, + "loss": 0.1295180320739746, + "step": 150140 + }, + { + "epoch": 0.6446253316503954, + "grad_norm": 0.31340131163597107, + "learning_rate": 3.569112561765391e-05, + "loss": 0.25863502025604246, + "step": 150150 + }, + { + "epoch": 0.6446682637404154, + "grad_norm": 0.5061963796615601, + "learning_rate": 3.5686813897536285e-05, + "loss": 0.22574491500854493, + "step": 150160 + }, + { + "epoch": 0.6447111958304355, + "grad_norm": 0.5957401394844055, + "learning_rate": 3.5682502177418655e-05, + "loss": 0.28134448528289796, + "step": 150170 + }, + { + "epoch": 0.6447541279204554, + "grad_norm": 0.8046202659606934, + "learning_rate": 3.567819045730104e-05, + "loss": 0.2787386655807495, + "step": 150180 + }, + { + "epoch": 0.6447970600104754, + "grad_norm": 7.400388240814209, + "learning_rate": 3.567387873718342e-05, + "loss": 0.4055464744567871, + "step": 150190 + }, + { + "epoch": 0.6448399921004955, + "grad_norm": 2.637162208557129, + "learning_rate": 3.5669567017065794e-05, + "loss": 0.24768633842468263, + "step": 150200 + }, + { + "epoch": 0.6448829241905154, + "grad_norm": 0.06767522543668747, + "learning_rate": 3.5665255296948165e-05, + "loss": 0.13162182569503783, + "step": 150210 + }, + { + "epoch": 0.6449258562805354, + "grad_norm": 1.7277255058288574, + "learning_rate": 3.566094357683054e-05, + "loss": 0.27955288887023927, + "step": 150220 + }, + { + "epoch": 0.6449687883705555, + "grad_norm": 0.020901095122098923, + "learning_rate": 3.565663185671292e-05, + "loss": 0.3833391427993774, + "step": 150230 + }, + { + "epoch": 0.6450117204605754, + "grad_norm": 0.04695943742990494, + "learning_rate": 3.56523201365953e-05, + "loss": 0.08938364982604981, + "step": 150240 + }, + { + "epoch": 0.6450546525505955, + "grad_norm": 4.172770977020264, + "learning_rate": 3.5648008416477674e-05, + "loss": 0.5616393566131592, + "step": 150250 + }, + { + "epoch": 0.6450975846406155, + "grad_norm": 1.0414800643920898, + "learning_rate": 3.5643696696360045e-05, + "loss": 0.16682374477386475, + "step": 150260 + }, + { + "epoch": 0.6451405167306354, + "grad_norm": 0.07870873063802719, + "learning_rate": 3.563938497624242e-05, + "loss": 0.3457974910736084, + "step": 150270 + }, + { + "epoch": 0.6451834488206555, + "grad_norm": 0.216685488820076, + "learning_rate": 3.56350732561248e-05, + "loss": 0.28935139179229735, + "step": 150280 + }, + { + "epoch": 0.6452263809106755, + "grad_norm": 0.013029251247644424, + "learning_rate": 3.5630761536007177e-05, + "loss": 0.23000710010528563, + "step": 150290 + }, + { + "epoch": 0.6452693130006955, + "grad_norm": 0.04310464859008789, + "learning_rate": 3.5626449815889554e-05, + "loss": 0.12242240905761718, + "step": 150300 + }, + { + "epoch": 0.6453122450907155, + "grad_norm": 2.501413106918335, + "learning_rate": 3.562213809577193e-05, + "loss": 0.30814170837402344, + "step": 150310 + }, + { + "epoch": 0.6453551771807355, + "grad_norm": 1.097299575805664, + "learning_rate": 3.561782637565431e-05, + "loss": 0.22030160427093506, + "step": 150320 + }, + { + "epoch": 0.6453981092707555, + "grad_norm": 0.026852579787373543, + "learning_rate": 3.5613514655536686e-05, + "loss": 0.22212910652160645, + "step": 150330 + }, + { + "epoch": 0.6454410413607755, + "grad_norm": 0.22037015855312347, + "learning_rate": 3.5609202935419057e-05, + "loss": 0.13063724040985109, + "step": 150340 + }, + { + "epoch": 0.6454839734507956, + "grad_norm": 60.27128982543945, + "learning_rate": 3.5604891215301434e-05, + "loss": 0.08863616585731507, + "step": 150350 + }, + { + "epoch": 0.6455269055408155, + "grad_norm": 3.72627592086792, + "learning_rate": 3.560057949518381e-05, + "loss": 0.21849074363708496, + "step": 150360 + }, + { + "epoch": 0.6455698376308355, + "grad_norm": 0.45230111479759216, + "learning_rate": 3.559626777506619e-05, + "loss": 0.22867064476013182, + "step": 150370 + }, + { + "epoch": 0.6456127697208556, + "grad_norm": 0.9160948395729065, + "learning_rate": 3.559195605494856e-05, + "loss": 0.0876701295375824, + "step": 150380 + }, + { + "epoch": 0.6456557018108755, + "grad_norm": 3.828824043273926, + "learning_rate": 3.5587644334830936e-05, + "loss": 0.29961979389190674, + "step": 150390 + }, + { + "epoch": 0.6456986339008955, + "grad_norm": 2.7227840423583984, + "learning_rate": 3.5583332614713314e-05, + "loss": 0.11139649152755737, + "step": 150400 + }, + { + "epoch": 0.6457415659909156, + "grad_norm": 0.00475529907271266, + "learning_rate": 3.557902089459569e-05, + "loss": 0.058909434080123904, + "step": 150410 + }, + { + "epoch": 0.6457844980809355, + "grad_norm": 0.2636564373970032, + "learning_rate": 3.557470917447807e-05, + "loss": 0.27903735637664795, + "step": 150420 + }, + { + "epoch": 0.6458274301709556, + "grad_norm": 0.007288595661520958, + "learning_rate": 3.5570397454360446e-05, + "loss": 0.2414243221282959, + "step": 150430 + }, + { + "epoch": 0.6458703622609756, + "grad_norm": 0.16422626376152039, + "learning_rate": 3.556608573424282e-05, + "loss": 0.19783369302749634, + "step": 150440 + }, + { + "epoch": 0.6459132943509956, + "grad_norm": 2.2630701065063477, + "learning_rate": 3.55617740141252e-05, + "loss": 0.2823080062866211, + "step": 150450 + }, + { + "epoch": 0.6459562264410156, + "grad_norm": 0.02012724243104458, + "learning_rate": 3.555746229400757e-05, + "loss": 0.01825539916753769, + "step": 150460 + }, + { + "epoch": 0.6459991585310356, + "grad_norm": 0.0037891704123467207, + "learning_rate": 3.555315057388995e-05, + "loss": 0.09379191398620605, + "step": 150470 + }, + { + "epoch": 0.6460420906210557, + "grad_norm": 0.01953510195016861, + "learning_rate": 3.5548838853772326e-05, + "loss": 0.28741207122802737, + "step": 150480 + }, + { + "epoch": 0.6460850227110756, + "grad_norm": 0.018316145986318588, + "learning_rate": 3.55445271336547e-05, + "loss": 0.20187015533447267, + "step": 150490 + }, + { + "epoch": 0.6461279548010956, + "grad_norm": 2.894874095916748, + "learning_rate": 3.5540215413537074e-05, + "loss": 0.2339266538619995, + "step": 150500 + }, + { + "epoch": 0.6461708868911157, + "grad_norm": 0.011740121990442276, + "learning_rate": 3.553590369341945e-05, + "loss": 0.1139156699180603, + "step": 150510 + }, + { + "epoch": 0.6462138189811356, + "grad_norm": 0.0005255074356682599, + "learning_rate": 3.553159197330183e-05, + "loss": 0.4588716983795166, + "step": 150520 + }, + { + "epoch": 0.6462567510711557, + "grad_norm": 0.49539849162101746, + "learning_rate": 3.5527280253184206e-05, + "loss": 0.35927271842956543, + "step": 150530 + }, + { + "epoch": 0.6462996831611757, + "grad_norm": 1.401881456375122, + "learning_rate": 3.552296853306658e-05, + "loss": 0.24080908298492432, + "step": 150540 + }, + { + "epoch": 0.6463426152511956, + "grad_norm": 0.002369405934587121, + "learning_rate": 3.551865681294896e-05, + "loss": 0.17735791206359863, + "step": 150550 + }, + { + "epoch": 0.6463855473412157, + "grad_norm": 3.389665126800537, + "learning_rate": 3.551434509283134e-05, + "loss": 0.09773001074790955, + "step": 150560 + }, + { + "epoch": 0.6464284794312357, + "grad_norm": 1.7062057256698608, + "learning_rate": 3.5510033372713715e-05, + "loss": 0.4862934112548828, + "step": 150570 + }, + { + "epoch": 0.6464714115212556, + "grad_norm": 0.10221821814775467, + "learning_rate": 3.5505721652596086e-05, + "loss": 0.1651764392852783, + "step": 150580 + }, + { + "epoch": 0.6465143436112757, + "grad_norm": 0.5306051969528198, + "learning_rate": 3.550140993247846e-05, + "loss": 0.1362286925315857, + "step": 150590 + }, + { + "epoch": 0.6465572757012957, + "grad_norm": 0.11507293581962585, + "learning_rate": 3.549709821236084e-05, + "loss": 0.35334086418151855, + "step": 150600 + }, + { + "epoch": 0.6466002077913157, + "grad_norm": 1.0700881481170654, + "learning_rate": 3.549278649224322e-05, + "loss": 0.2019124746322632, + "step": 150610 + }, + { + "epoch": 0.6466431398813357, + "grad_norm": 2.763080596923828, + "learning_rate": 3.5488474772125595e-05, + "loss": 0.33109560012817385, + "step": 150620 + }, + { + "epoch": 0.6466860719713557, + "grad_norm": 0.0019787929486483335, + "learning_rate": 3.5484163052007966e-05, + "loss": 0.1044989824295044, + "step": 150630 + }, + { + "epoch": 0.6467290040613757, + "grad_norm": 0.006205317564308643, + "learning_rate": 3.547985133189034e-05, + "loss": 0.2810218095779419, + "step": 150640 + }, + { + "epoch": 0.6467719361513957, + "grad_norm": 0.08015859872102737, + "learning_rate": 3.547553961177272e-05, + "loss": 0.3245912790298462, + "step": 150650 + }, + { + "epoch": 0.6468148682414158, + "grad_norm": 2.5483102798461914, + "learning_rate": 3.5471227891655104e-05, + "loss": 0.17378085851669312, + "step": 150660 + }, + { + "epoch": 0.6468578003314357, + "grad_norm": 2.2716610431671143, + "learning_rate": 3.5466916171537475e-05, + "loss": 0.25345578193664553, + "step": 150670 + }, + { + "epoch": 0.6469007324214557, + "grad_norm": 0.013727148994803429, + "learning_rate": 3.546260445141985e-05, + "loss": 0.2554850339889526, + "step": 150680 + }, + { + "epoch": 0.6469436645114758, + "grad_norm": 1.6586568355560303, + "learning_rate": 3.545829273130223e-05, + "loss": 0.2570873022079468, + "step": 150690 + }, + { + "epoch": 0.6469865966014957, + "grad_norm": 1.9871070384979248, + "learning_rate": 3.545398101118461e-05, + "loss": 0.20374269485473634, + "step": 150700 + }, + { + "epoch": 0.6470295286915158, + "grad_norm": 0.022350076586008072, + "learning_rate": 3.544966929106698e-05, + "loss": 0.4256162166595459, + "step": 150710 + }, + { + "epoch": 0.6470724607815358, + "grad_norm": 0.004630012437701225, + "learning_rate": 3.5445357570949355e-05, + "loss": 0.2682207107543945, + "step": 150720 + }, + { + "epoch": 0.6471153928715557, + "grad_norm": 0.15521609783172607, + "learning_rate": 3.544104585083173e-05, + "loss": 0.20004658699035643, + "step": 150730 + }, + { + "epoch": 0.6471583249615758, + "grad_norm": 0.0005406261188909411, + "learning_rate": 3.543673413071411e-05, + "loss": 0.15981554985046387, + "step": 150740 + }, + { + "epoch": 0.6472012570515958, + "grad_norm": 0.001623362535610795, + "learning_rate": 3.543242241059648e-05, + "loss": 0.23363192081451417, + "step": 150750 + }, + { + "epoch": 0.6472441891416157, + "grad_norm": 0.058352451771497726, + "learning_rate": 3.542811069047886e-05, + "loss": 0.06839287877082825, + "step": 150760 + }, + { + "epoch": 0.6472871212316358, + "grad_norm": 2.14315128326416, + "learning_rate": 3.542379897036124e-05, + "loss": 0.4139379024505615, + "step": 150770 + }, + { + "epoch": 0.6473300533216558, + "grad_norm": 0.8045847415924072, + "learning_rate": 3.541948725024362e-05, + "loss": 0.24877502918243408, + "step": 150780 + }, + { + "epoch": 0.6473729854116758, + "grad_norm": 6.681770324707031, + "learning_rate": 3.541517553012599e-05, + "loss": 0.2461794376373291, + "step": 150790 + }, + { + "epoch": 0.6474159175016958, + "grad_norm": 0.007231173105537891, + "learning_rate": 3.541086381000837e-05, + "loss": 0.10801637172698975, + "step": 150800 + }, + { + "epoch": 0.6474588495917158, + "grad_norm": 0.08952129632234573, + "learning_rate": 3.5406552089890744e-05, + "loss": 0.163426411151886, + "step": 150810 + }, + { + "epoch": 0.6475017816817358, + "grad_norm": 1.4802162647247314, + "learning_rate": 3.540224036977312e-05, + "loss": 0.31829946041107177, + "step": 150820 + }, + { + "epoch": 0.6475447137717558, + "grad_norm": 0.6895108222961426, + "learning_rate": 3.539792864965549e-05, + "loss": 0.17124329805374144, + "step": 150830 + }, + { + "epoch": 0.6475876458617759, + "grad_norm": 2.4215404987335205, + "learning_rate": 3.539361692953787e-05, + "loss": 0.15050746202468873, + "step": 150840 + }, + { + "epoch": 0.6476305779517958, + "grad_norm": 0.004913152661174536, + "learning_rate": 3.538930520942025e-05, + "loss": 0.1458522915840149, + "step": 150850 + }, + { + "epoch": 0.6476735100418158, + "grad_norm": 0.2607187032699585, + "learning_rate": 3.5384993489302624e-05, + "loss": 0.1336721658706665, + "step": 150860 + }, + { + "epoch": 0.6477164421318359, + "grad_norm": 0.1601972132921219, + "learning_rate": 3.5380681769184995e-05, + "loss": 0.041481971740722656, + "step": 150870 + }, + { + "epoch": 0.6477593742218559, + "grad_norm": 0.22868028283119202, + "learning_rate": 3.537637004906738e-05, + "loss": 0.1673444151878357, + "step": 150880 + }, + { + "epoch": 0.6478023063118759, + "grad_norm": 0.0013107474660500884, + "learning_rate": 3.5372058328949756e-05, + "loss": 0.05632483959197998, + "step": 150890 + }, + { + "epoch": 0.6478452384018959, + "grad_norm": 0.037882279604673386, + "learning_rate": 3.5367746608832134e-05, + "loss": 0.1135305643081665, + "step": 150900 + }, + { + "epoch": 0.6478881704919159, + "grad_norm": 0.006019525229930878, + "learning_rate": 3.5363434888714504e-05, + "loss": 0.21990854740142823, + "step": 150910 + }, + { + "epoch": 0.6479311025819359, + "grad_norm": 1.5703551769256592, + "learning_rate": 3.535912316859688e-05, + "loss": 0.197979736328125, + "step": 150920 + }, + { + "epoch": 0.6479740346719559, + "grad_norm": 0.0006432771333493292, + "learning_rate": 3.535481144847926e-05, + "loss": 0.13005446195602416, + "step": 150930 + }, + { + "epoch": 0.648016966761976, + "grad_norm": 0.005556243937462568, + "learning_rate": 3.5350499728361636e-05, + "loss": 0.19083883762359619, + "step": 150940 + }, + { + "epoch": 0.6480598988519959, + "grad_norm": 1.8534623384475708, + "learning_rate": 3.5346188008244013e-05, + "loss": 0.28145432472229004, + "step": 150950 + }, + { + "epoch": 0.6481028309420159, + "grad_norm": 0.00571678951382637, + "learning_rate": 3.5341876288126384e-05, + "loss": 0.09889286756515503, + "step": 150960 + }, + { + "epoch": 0.648145763032036, + "grad_norm": 0.022175125777721405, + "learning_rate": 3.533756456800876e-05, + "loss": 0.13391042947769166, + "step": 150970 + }, + { + "epoch": 0.6481886951220559, + "grad_norm": 2.6668763160705566, + "learning_rate": 3.533325284789114e-05, + "loss": 0.5017569541931153, + "step": 150980 + }, + { + "epoch": 0.648231627212076, + "grad_norm": 2.446192741394043, + "learning_rate": 3.5328941127773516e-05, + "loss": 0.22131681442260742, + "step": 150990 + }, + { + "epoch": 0.648274559302096, + "grad_norm": 0.0022710757330060005, + "learning_rate": 3.532462940765589e-05, + "loss": 0.27220540046691893, + "step": 151000 + }, + { + "epoch": 0.648274559302096, + "eval_loss": 0.3889124393463135, + "eval_runtime": 27.2159, + "eval_samples_per_second": 3.674, + "eval_steps_per_second": 3.674, + "step": 151000 + }, + { + "epoch": 0.6483174913921159, + "grad_norm": 0.011589907109737396, + "learning_rate": 3.532031768753827e-05, + "loss": 0.09215273857116699, + "step": 151010 + }, + { + "epoch": 0.648360423482136, + "grad_norm": 0.005135936196893454, + "learning_rate": 3.531600596742065e-05, + "loss": 0.11207122802734375, + "step": 151020 + }, + { + "epoch": 0.648403355572156, + "grad_norm": 1.7124568223953247, + "learning_rate": 3.5311694247303025e-05, + "loss": 0.26488828659057617, + "step": 151030 + }, + { + "epoch": 0.6484462876621759, + "grad_norm": 0.029093217104673386, + "learning_rate": 3.5307382527185396e-05, + "loss": 0.18144166469573975, + "step": 151040 + }, + { + "epoch": 0.648489219752196, + "grad_norm": 0.2676723599433899, + "learning_rate": 3.530307080706777e-05, + "loss": 0.15304325819015502, + "step": 151050 + }, + { + "epoch": 0.648532151842216, + "grad_norm": 0.026561537757515907, + "learning_rate": 3.529875908695015e-05, + "loss": 0.02566699981689453, + "step": 151060 + }, + { + "epoch": 0.648575083932236, + "grad_norm": 0.005033195950090885, + "learning_rate": 3.529444736683253e-05, + "loss": 0.12942137718200683, + "step": 151070 + }, + { + "epoch": 0.648618016022256, + "grad_norm": 0.00713531207293272, + "learning_rate": 3.52901356467149e-05, + "loss": 0.10324745178222657, + "step": 151080 + }, + { + "epoch": 0.648660948112276, + "grad_norm": 6.128237724304199, + "learning_rate": 3.5285823926597276e-05, + "loss": 0.45879373550415037, + "step": 151090 + }, + { + "epoch": 0.648703880202296, + "grad_norm": 0.4716830551624298, + "learning_rate": 3.528151220647965e-05, + "loss": 0.1531036376953125, + "step": 151100 + }, + { + "epoch": 0.648746812292316, + "grad_norm": 0.7725340723991394, + "learning_rate": 3.527720048636203e-05, + "loss": 0.13332175016403197, + "step": 151110 + }, + { + "epoch": 0.648789744382336, + "grad_norm": 0.4862283766269684, + "learning_rate": 3.527288876624441e-05, + "loss": 0.19347046613693236, + "step": 151120 + }, + { + "epoch": 0.648832676472356, + "grad_norm": 0.010178297758102417, + "learning_rate": 3.5268577046126785e-05, + "loss": 0.07210128903388976, + "step": 151130 + }, + { + "epoch": 0.648875608562376, + "grad_norm": 0.005305929109454155, + "learning_rate": 3.526426532600916e-05, + "loss": 0.025121399760246278, + "step": 151140 + }, + { + "epoch": 0.6489185406523961, + "grad_norm": 0.036797620356082916, + "learning_rate": 3.525995360589154e-05, + "loss": 0.11298577785491944, + "step": 151150 + }, + { + "epoch": 0.648961472742416, + "grad_norm": 0.1233087033033371, + "learning_rate": 3.525564188577391e-05, + "loss": 0.17588850259780883, + "step": 151160 + }, + { + "epoch": 0.649004404832436, + "grad_norm": 5.559473514556885, + "learning_rate": 3.525133016565629e-05, + "loss": 0.35527606010437013, + "step": 151170 + }, + { + "epoch": 0.6490473369224561, + "grad_norm": 0.0020315709989517927, + "learning_rate": 3.5247018445538665e-05, + "loss": 0.14180855751037597, + "step": 151180 + }, + { + "epoch": 0.649090269012476, + "grad_norm": 1.5811628103256226, + "learning_rate": 3.524270672542104e-05, + "loss": 0.31139168739318845, + "step": 151190 + }, + { + "epoch": 0.6491332011024961, + "grad_norm": 1.6150096654891968, + "learning_rate": 3.523839500530341e-05, + "loss": 0.33792574405670167, + "step": 151200 + }, + { + "epoch": 0.6491761331925161, + "grad_norm": 0.29071733355522156, + "learning_rate": 3.523408328518579e-05, + "loss": 0.26106438636779783, + "step": 151210 + }, + { + "epoch": 0.649219065282536, + "grad_norm": 4.788187503814697, + "learning_rate": 3.522977156506817e-05, + "loss": 0.17764378786087037, + "step": 151220 + }, + { + "epoch": 0.6492619973725561, + "grad_norm": 0.0022474355064332485, + "learning_rate": 3.5225459844950545e-05, + "loss": 0.15507956743240356, + "step": 151230 + }, + { + "epoch": 0.6493049294625761, + "grad_norm": 0.0006411916110664606, + "learning_rate": 3.522114812483292e-05, + "loss": 0.3545748233795166, + "step": 151240 + }, + { + "epoch": 0.649347861552596, + "grad_norm": 0.044647056609392166, + "learning_rate": 3.52168364047153e-05, + "loss": 0.2371835947036743, + "step": 151250 + }, + { + "epoch": 0.6493907936426161, + "grad_norm": 0.022296231240034103, + "learning_rate": 3.521252468459768e-05, + "loss": 0.19835184812545775, + "step": 151260 + }, + { + "epoch": 0.6494337257326361, + "grad_norm": 1.7125898599624634, + "learning_rate": 3.5208212964480055e-05, + "loss": 0.27645077705383303, + "step": 151270 + }, + { + "epoch": 0.6494766578226561, + "grad_norm": 0.5980311632156372, + "learning_rate": 3.5203901244362425e-05, + "loss": 0.2318577527999878, + "step": 151280 + }, + { + "epoch": 0.6495195899126761, + "grad_norm": 0.04134934023022652, + "learning_rate": 3.51995895242448e-05, + "loss": 0.20498950481414796, + "step": 151290 + }, + { + "epoch": 0.6495625220026962, + "grad_norm": 2.364464044570923, + "learning_rate": 3.519527780412718e-05, + "loss": 0.19556167125701904, + "step": 151300 + }, + { + "epoch": 0.6496054540927162, + "grad_norm": 0.002782400930300355, + "learning_rate": 3.519096608400956e-05, + "loss": 0.299759578704834, + "step": 151310 + }, + { + "epoch": 0.6496483861827361, + "grad_norm": 0.002601859625428915, + "learning_rate": 3.5186654363891934e-05, + "loss": 0.23682382106781005, + "step": 151320 + }, + { + "epoch": 0.6496913182727562, + "grad_norm": 0.03596939146518707, + "learning_rate": 3.5182342643774305e-05, + "loss": 0.23851912021636962, + "step": 151330 + }, + { + "epoch": 0.6497342503627762, + "grad_norm": 4.506582260131836, + "learning_rate": 3.517803092365668e-05, + "loss": 0.32757136821746824, + "step": 151340 + }, + { + "epoch": 0.6497771824527961, + "grad_norm": 0.0009721462265588343, + "learning_rate": 3.517371920353906e-05, + "loss": 0.3446447134017944, + "step": 151350 + }, + { + "epoch": 0.6498201145428162, + "grad_norm": 0.0029338260646909475, + "learning_rate": 3.516940748342144e-05, + "loss": 0.23369097709655762, + "step": 151360 + }, + { + "epoch": 0.6498630466328362, + "grad_norm": 0.0009567153174430132, + "learning_rate": 3.5165095763303814e-05, + "loss": 0.22707803249359132, + "step": 151370 + }, + { + "epoch": 0.6499059787228562, + "grad_norm": 3.856454610824585, + "learning_rate": 3.516078404318619e-05, + "loss": 0.09049471616744995, + "step": 151380 + }, + { + "epoch": 0.6499489108128762, + "grad_norm": 0.5688821077346802, + "learning_rate": 3.515647232306857e-05, + "loss": 0.3531051158905029, + "step": 151390 + }, + { + "epoch": 0.6499918429028962, + "grad_norm": 0.014180580154061317, + "learning_rate": 3.5152160602950946e-05, + "loss": 0.0902495801448822, + "step": 151400 + }, + { + "epoch": 0.6500347749929162, + "grad_norm": 1.0567702054977417, + "learning_rate": 3.514784888283332e-05, + "loss": 0.12536590099334716, + "step": 151410 + }, + { + "epoch": 0.6500777070829362, + "grad_norm": 0.05386923998594284, + "learning_rate": 3.5143537162715694e-05, + "loss": 0.20828590393066407, + "step": 151420 + }, + { + "epoch": 0.6501206391729563, + "grad_norm": 0.0040575917810201645, + "learning_rate": 3.513922544259807e-05, + "loss": 0.005848048627376557, + "step": 151430 + }, + { + "epoch": 0.6501635712629762, + "grad_norm": 0.005221802741289139, + "learning_rate": 3.513491372248045e-05, + "loss": 0.2222289562225342, + "step": 151440 + }, + { + "epoch": 0.6502065033529962, + "grad_norm": 1.2336004972457886, + "learning_rate": 3.513060200236282e-05, + "loss": 0.1798251748085022, + "step": 151450 + }, + { + "epoch": 0.6502494354430163, + "grad_norm": 0.010236959904432297, + "learning_rate": 3.51262902822452e-05, + "loss": 0.17683861255645753, + "step": 151460 + }, + { + "epoch": 0.6502923675330362, + "grad_norm": 147.47923278808594, + "learning_rate": 3.5121978562127574e-05, + "loss": 0.11385173797607422, + "step": 151470 + }, + { + "epoch": 0.6503352996230563, + "grad_norm": 0.2765538692474365, + "learning_rate": 3.511766684200996e-05, + "loss": 0.153677499294281, + "step": 151480 + }, + { + "epoch": 0.6503782317130763, + "grad_norm": 0.47096744179725647, + "learning_rate": 3.511335512189233e-05, + "loss": 0.23527054786682128, + "step": 151490 + }, + { + "epoch": 0.6504211638030962, + "grad_norm": 1.5383275747299194, + "learning_rate": 3.5109043401774706e-05, + "loss": 0.21786458492279054, + "step": 151500 + }, + { + "epoch": 0.6504640958931163, + "grad_norm": 0.019338462501764297, + "learning_rate": 3.5104731681657084e-05, + "loss": 0.3397980213165283, + "step": 151510 + }, + { + "epoch": 0.6505070279831363, + "grad_norm": 1.6768598556518555, + "learning_rate": 3.510041996153946e-05, + "loss": 0.4416059494018555, + "step": 151520 + }, + { + "epoch": 0.6505499600731562, + "grad_norm": 2.0906455516815186, + "learning_rate": 3.509610824142183e-05, + "loss": 0.42383246421813964, + "step": 151530 + }, + { + "epoch": 0.6505928921631763, + "grad_norm": 0.0007825464126653969, + "learning_rate": 3.509179652130421e-05, + "loss": 0.1377132773399353, + "step": 151540 + }, + { + "epoch": 0.6506358242531963, + "grad_norm": 0.001401827554218471, + "learning_rate": 3.5087484801186586e-05, + "loss": 0.047088241577148436, + "step": 151550 + }, + { + "epoch": 0.6506787563432163, + "grad_norm": 2.3653674125671387, + "learning_rate": 3.5083173081068964e-05, + "loss": 0.20371806621551514, + "step": 151560 + }, + { + "epoch": 0.6507216884332363, + "grad_norm": 0.0049470472149550915, + "learning_rate": 3.5078861360951334e-05, + "loss": 0.28787286281585694, + "step": 151570 + }, + { + "epoch": 0.6507646205232563, + "grad_norm": 0.0012642034562304616, + "learning_rate": 3.507454964083371e-05, + "loss": 0.2894446611404419, + "step": 151580 + }, + { + "epoch": 0.6508075526132763, + "grad_norm": 0.3203394412994385, + "learning_rate": 3.5070237920716096e-05, + "loss": 0.08833484053611755, + "step": 151590 + }, + { + "epoch": 0.6508504847032963, + "grad_norm": 3.1001203060150146, + "learning_rate": 3.506592620059847e-05, + "loss": 0.24941644668579102, + "step": 151600 + }, + { + "epoch": 0.6508934167933164, + "grad_norm": 1.7117334604263306, + "learning_rate": 3.5061614480480844e-05, + "loss": 0.057556116580963136, + "step": 151610 + }, + { + "epoch": 0.6509363488833363, + "grad_norm": 1.9608606100082397, + "learning_rate": 3.505730276036322e-05, + "loss": 0.1367754340171814, + "step": 151620 + }, + { + "epoch": 0.6509792809733563, + "grad_norm": 0.02430753782391548, + "learning_rate": 3.50529910402456e-05, + "loss": 0.32413339614868164, + "step": 151630 + }, + { + "epoch": 0.6510222130633764, + "grad_norm": 6.840951442718506, + "learning_rate": 3.5048679320127976e-05, + "loss": 0.4086446285247803, + "step": 151640 + }, + { + "epoch": 0.6510651451533963, + "grad_norm": 2.100064277648926, + "learning_rate": 3.5044367600010346e-05, + "loss": 0.21504318714141846, + "step": 151650 + }, + { + "epoch": 0.6511080772434163, + "grad_norm": 0.5872086882591248, + "learning_rate": 3.5040055879892723e-05, + "loss": 0.21643712520599365, + "step": 151660 + }, + { + "epoch": 0.6511510093334364, + "grad_norm": 0.005748818628489971, + "learning_rate": 3.50357441597751e-05, + "loss": 0.19003957509994507, + "step": 151670 + }, + { + "epoch": 0.6511939414234563, + "grad_norm": 0.0002342463267268613, + "learning_rate": 3.503143243965748e-05, + "loss": 0.1779122233390808, + "step": 151680 + }, + { + "epoch": 0.6512368735134764, + "grad_norm": 0.021635752171278, + "learning_rate": 3.5027120719539856e-05, + "loss": 0.07176170349121094, + "step": 151690 + }, + { + "epoch": 0.6512798056034964, + "grad_norm": 1.4818261861801147, + "learning_rate": 3.502280899942223e-05, + "loss": 0.18642498254776002, + "step": 151700 + }, + { + "epoch": 0.6513227376935163, + "grad_norm": 1.6581411361694336, + "learning_rate": 3.501849727930461e-05, + "loss": 0.17148194313049317, + "step": 151710 + }, + { + "epoch": 0.6513656697835364, + "grad_norm": 0.046362608671188354, + "learning_rate": 3.501418555918699e-05, + "loss": 0.15675292015075684, + "step": 151720 + }, + { + "epoch": 0.6514086018735564, + "grad_norm": 0.011807423084974289, + "learning_rate": 3.5009873839069365e-05, + "loss": 0.09193039536476136, + "step": 151730 + }, + { + "epoch": 0.6514515339635765, + "grad_norm": 0.017920024693012238, + "learning_rate": 3.5005562118951735e-05, + "loss": 0.20496132373809814, + "step": 151740 + }, + { + "epoch": 0.6514944660535964, + "grad_norm": 0.004140602890402079, + "learning_rate": 3.500125039883411e-05, + "loss": 0.23158493041992187, + "step": 151750 + }, + { + "epoch": 0.6515373981436164, + "grad_norm": 0.0014110020129010081, + "learning_rate": 3.499693867871649e-05, + "loss": 0.07966670393943787, + "step": 151760 + }, + { + "epoch": 0.6515803302336365, + "grad_norm": 0.00932406634092331, + "learning_rate": 3.499262695859887e-05, + "loss": 0.2577128648757935, + "step": 151770 + }, + { + "epoch": 0.6516232623236564, + "grad_norm": 0.0015593827702105045, + "learning_rate": 3.498831523848124e-05, + "loss": 0.13507426977157594, + "step": 151780 + }, + { + "epoch": 0.6516661944136765, + "grad_norm": 1.8296703100204468, + "learning_rate": 3.4984003518363615e-05, + "loss": 0.10646724700927734, + "step": 151790 + }, + { + "epoch": 0.6517091265036965, + "grad_norm": 5.870005130767822, + "learning_rate": 3.497969179824599e-05, + "loss": 0.3149374008178711, + "step": 151800 + }, + { + "epoch": 0.6517520585937164, + "grad_norm": 1.2665034532546997, + "learning_rate": 3.497538007812837e-05, + "loss": 0.21344881057739257, + "step": 151810 + }, + { + "epoch": 0.6517949906837365, + "grad_norm": 0.33878016471862793, + "learning_rate": 3.497106835801075e-05, + "loss": 0.4020512104034424, + "step": 151820 + }, + { + "epoch": 0.6518379227737565, + "grad_norm": 0.005240934900939465, + "learning_rate": 3.4966756637893125e-05, + "loss": 0.26684696674346925, + "step": 151830 + }, + { + "epoch": 0.6518808548637764, + "grad_norm": 0.0019433987326920033, + "learning_rate": 3.49624449177755e-05, + "loss": 0.18590428829193115, + "step": 151840 + }, + { + "epoch": 0.6519237869537965, + "grad_norm": 0.40653958916664124, + "learning_rate": 3.495813319765788e-05, + "loss": 0.19820864200592042, + "step": 151850 + }, + { + "epoch": 0.6519667190438165, + "grad_norm": 5.8154401779174805, + "learning_rate": 3.495382147754025e-05, + "loss": 0.2836847066879272, + "step": 151860 + }, + { + "epoch": 0.6520096511338365, + "grad_norm": 0.005736039485782385, + "learning_rate": 3.494950975742263e-05, + "loss": 0.09045286178588867, + "step": 151870 + }, + { + "epoch": 0.6520525832238565, + "grad_norm": 0.001496506156399846, + "learning_rate": 3.4945198037305005e-05, + "loss": 0.25731420516967773, + "step": 151880 + }, + { + "epoch": 0.6520955153138766, + "grad_norm": 13.771329879760742, + "learning_rate": 3.494088631718738e-05, + "loss": 0.2797736644744873, + "step": 151890 + }, + { + "epoch": 0.6521384474038965, + "grad_norm": 0.5289430618286133, + "learning_rate": 3.493657459706975e-05, + "loss": 0.22803876399993897, + "step": 151900 + }, + { + "epoch": 0.6521813794939165, + "grad_norm": 0.5056369304656982, + "learning_rate": 3.493226287695213e-05, + "loss": 0.1541724681854248, + "step": 151910 + }, + { + "epoch": 0.6522243115839366, + "grad_norm": 0.028238875791430473, + "learning_rate": 3.492795115683451e-05, + "loss": 0.19024114608764647, + "step": 151920 + }, + { + "epoch": 0.6522672436739565, + "grad_norm": 0.0652453675866127, + "learning_rate": 3.4923639436716885e-05, + "loss": 0.16091703176498412, + "step": 151930 + }, + { + "epoch": 0.6523101757639765, + "grad_norm": 2.1925699710845947, + "learning_rate": 3.491932771659926e-05, + "loss": 0.2420794725418091, + "step": 151940 + }, + { + "epoch": 0.6523531078539966, + "grad_norm": 0.004015914164483547, + "learning_rate": 3.491501599648164e-05, + "loss": 0.3008209943771362, + "step": 151950 + }, + { + "epoch": 0.6523960399440165, + "grad_norm": 2.2953882217407227, + "learning_rate": 3.491070427636402e-05, + "loss": 0.11296336650848389, + "step": 151960 + }, + { + "epoch": 0.6524389720340366, + "grad_norm": 0.028664682060480118, + "learning_rate": 3.4906392556246394e-05, + "loss": 0.03355903923511505, + "step": 151970 + }, + { + "epoch": 0.6524819041240566, + "grad_norm": 0.5694634914398193, + "learning_rate": 3.4902080836128765e-05, + "loss": 0.15875810384750366, + "step": 151980 + }, + { + "epoch": 0.6525248362140765, + "grad_norm": 0.07715904712677002, + "learning_rate": 3.489776911601114e-05, + "loss": 0.10381278991699219, + "step": 151990 + }, + { + "epoch": 0.6525677683040966, + "grad_norm": 7.928126335144043, + "learning_rate": 3.489345739589352e-05, + "loss": 0.3048884153366089, + "step": 152000 + }, + { + "epoch": 0.6525677683040966, + "eval_loss": 0.3930748701095581, + "eval_runtime": 27.1751, + "eval_samples_per_second": 3.68, + "eval_steps_per_second": 3.68, + "step": 152000 + }, + { + "epoch": 0.6526107003941166, + "grad_norm": 0.0018158082384616137, + "learning_rate": 3.4889145675775897e-05, + "loss": 0.3498977184295654, + "step": 152010 + }, + { + "epoch": 0.6526536324841365, + "grad_norm": 9.38770866394043, + "learning_rate": 3.488483395565827e-05, + "loss": 0.10030590295791626, + "step": 152020 + }, + { + "epoch": 0.6526965645741566, + "grad_norm": 0.0059972163289785385, + "learning_rate": 3.4880522235540645e-05, + "loss": 0.255455470085144, + "step": 152030 + }, + { + "epoch": 0.6527394966641766, + "grad_norm": 1.80948805809021, + "learning_rate": 3.487621051542302e-05, + "loss": 0.24304404258728027, + "step": 152040 + }, + { + "epoch": 0.6527824287541966, + "grad_norm": 0.0005664766067638993, + "learning_rate": 3.48718987953054e-05, + "loss": 0.12256590127944947, + "step": 152050 + }, + { + "epoch": 0.6528253608442166, + "grad_norm": 0.03551163151860237, + "learning_rate": 3.4867587075187777e-05, + "loss": 0.2767303466796875, + "step": 152060 + }, + { + "epoch": 0.6528682929342366, + "grad_norm": 0.0017603436717763543, + "learning_rate": 3.4863275355070154e-05, + "loss": 0.16143405437469482, + "step": 152070 + }, + { + "epoch": 0.6529112250242566, + "grad_norm": 0.00445699505507946, + "learning_rate": 3.485896363495253e-05, + "loss": 0.2035740852355957, + "step": 152080 + }, + { + "epoch": 0.6529541571142766, + "grad_norm": 0.10075142234563828, + "learning_rate": 3.485465191483491e-05, + "loss": 0.20856783390045167, + "step": 152090 + }, + { + "epoch": 0.6529970892042967, + "grad_norm": 0.02400701306760311, + "learning_rate": 3.4850340194717286e-05, + "loss": 0.26665315628051756, + "step": 152100 + }, + { + "epoch": 0.6530400212943166, + "grad_norm": 2.000727653503418, + "learning_rate": 3.4846028474599656e-05, + "loss": 0.40493073463439944, + "step": 152110 + }, + { + "epoch": 0.6530829533843366, + "grad_norm": 0.005690529942512512, + "learning_rate": 3.4841716754482034e-05, + "loss": 0.20626089572906495, + "step": 152120 + }, + { + "epoch": 0.6531258854743567, + "grad_norm": 1.505405068397522, + "learning_rate": 3.483740503436441e-05, + "loss": 0.15051109790802003, + "step": 152130 + }, + { + "epoch": 0.6531688175643766, + "grad_norm": 18.674238204956055, + "learning_rate": 3.483309331424679e-05, + "loss": 0.08497092127799988, + "step": 152140 + }, + { + "epoch": 0.6532117496543967, + "grad_norm": 0.08240605145692825, + "learning_rate": 3.482878159412916e-05, + "loss": 0.15139393806457518, + "step": 152150 + }, + { + "epoch": 0.6532546817444167, + "grad_norm": 0.019352389499545097, + "learning_rate": 3.4824469874011536e-05, + "loss": 0.3356482028961182, + "step": 152160 + }, + { + "epoch": 0.6532976138344367, + "grad_norm": 0.3776816427707672, + "learning_rate": 3.4820158153893914e-05, + "loss": 0.19168307781219482, + "step": 152170 + }, + { + "epoch": 0.6533405459244567, + "grad_norm": 0.010281571187078953, + "learning_rate": 3.48158464337763e-05, + "loss": 0.13237816095352173, + "step": 152180 + }, + { + "epoch": 0.6533834780144767, + "grad_norm": 0.006922994274646044, + "learning_rate": 3.481153471365867e-05, + "loss": 0.3548863410949707, + "step": 152190 + }, + { + "epoch": 0.6534264101044968, + "grad_norm": 0.2510335445404053, + "learning_rate": 3.4807222993541046e-05, + "loss": 0.11280529499053955, + "step": 152200 + }, + { + "epoch": 0.6534693421945167, + "grad_norm": 0.006575438193976879, + "learning_rate": 3.480291127342342e-05, + "loss": 0.13859881162643434, + "step": 152210 + }, + { + "epoch": 0.6535122742845367, + "grad_norm": 0.07279030978679657, + "learning_rate": 3.47985995533058e-05, + "loss": 0.33872673511505125, + "step": 152220 + }, + { + "epoch": 0.6535552063745568, + "grad_norm": 0.4047152101993561, + "learning_rate": 3.479428783318817e-05, + "loss": 0.21664028167724608, + "step": 152230 + }, + { + "epoch": 0.6535981384645767, + "grad_norm": 0.018721673637628555, + "learning_rate": 3.478997611307055e-05, + "loss": 0.1668068528175354, + "step": 152240 + }, + { + "epoch": 0.6536410705545967, + "grad_norm": 0.03141282871365547, + "learning_rate": 3.4785664392952926e-05, + "loss": 0.1575627565383911, + "step": 152250 + }, + { + "epoch": 0.6536840026446168, + "grad_norm": 0.19100338220596313, + "learning_rate": 3.47813526728353e-05, + "loss": 0.1080901026725769, + "step": 152260 + }, + { + "epoch": 0.6537269347346367, + "grad_norm": 1.8139461278915405, + "learning_rate": 3.4777040952717674e-05, + "loss": 0.16453101634979247, + "step": 152270 + }, + { + "epoch": 0.6537698668246568, + "grad_norm": 0.9621730446815491, + "learning_rate": 3.477272923260005e-05, + "loss": 0.2002204418182373, + "step": 152280 + }, + { + "epoch": 0.6538127989146768, + "grad_norm": 1.7624107599258423, + "learning_rate": 3.4768417512482435e-05, + "loss": 0.3082698345184326, + "step": 152290 + }, + { + "epoch": 0.6538557310046967, + "grad_norm": 1.3553941249847412, + "learning_rate": 3.476410579236481e-05, + "loss": 0.3578528881072998, + "step": 152300 + }, + { + "epoch": 0.6538986630947168, + "grad_norm": 2.207336187362671, + "learning_rate": 3.475979407224718e-05, + "loss": 0.3818779468536377, + "step": 152310 + }, + { + "epoch": 0.6539415951847368, + "grad_norm": 2.9257280826568604, + "learning_rate": 3.475548235212956e-05, + "loss": 0.2496558904647827, + "step": 152320 + }, + { + "epoch": 0.6539845272747568, + "grad_norm": 0.05879620090126991, + "learning_rate": 3.475117063201194e-05, + "loss": 0.2835780382156372, + "step": 152330 + }, + { + "epoch": 0.6540274593647768, + "grad_norm": 0.0013960381038486958, + "learning_rate": 3.4746858911894315e-05, + "loss": 0.12066258192062378, + "step": 152340 + }, + { + "epoch": 0.6540703914547968, + "grad_norm": 0.10827244073152542, + "learning_rate": 3.4742547191776686e-05, + "loss": 0.08777568936347961, + "step": 152350 + }, + { + "epoch": 0.6541133235448168, + "grad_norm": 0.0033609438687562943, + "learning_rate": 3.473823547165906e-05, + "loss": 0.14433823823928832, + "step": 152360 + }, + { + "epoch": 0.6541562556348368, + "grad_norm": 0.12234596163034439, + "learning_rate": 3.473392375154144e-05, + "loss": 0.004690999910235405, + "step": 152370 + }, + { + "epoch": 0.6541991877248569, + "grad_norm": 0.2859869599342346, + "learning_rate": 3.472961203142382e-05, + "loss": 0.1143067479133606, + "step": 152380 + }, + { + "epoch": 0.6542421198148768, + "grad_norm": 2.3144009113311768, + "learning_rate": 3.472530031130619e-05, + "loss": 0.1523873209953308, + "step": 152390 + }, + { + "epoch": 0.6542850519048968, + "grad_norm": 1.4639687538146973, + "learning_rate": 3.472098859118857e-05, + "loss": 0.13809309005737305, + "step": 152400 + }, + { + "epoch": 0.6543279839949169, + "grad_norm": 1.4635416269302368, + "learning_rate": 3.471667687107095e-05, + "loss": 0.24965903759002686, + "step": 152410 + }, + { + "epoch": 0.6543709160849368, + "grad_norm": 0.025756657123565674, + "learning_rate": 3.471236515095333e-05, + "loss": 0.26134366989135743, + "step": 152420 + }, + { + "epoch": 0.6544138481749568, + "grad_norm": 0.13001833856105804, + "learning_rate": 3.4708053430835704e-05, + "loss": 0.2040097713470459, + "step": 152430 + }, + { + "epoch": 0.6544567802649769, + "grad_norm": 0.000583972199819982, + "learning_rate": 3.4703741710718075e-05, + "loss": 0.28530220985412597, + "step": 152440 + }, + { + "epoch": 0.6544997123549968, + "grad_norm": 0.45030274987220764, + "learning_rate": 3.469942999060045e-05, + "loss": 0.20632073879241944, + "step": 152450 + }, + { + "epoch": 0.6545426444450169, + "grad_norm": 0.0034222842659801245, + "learning_rate": 3.469511827048283e-05, + "loss": 0.19489221572875975, + "step": 152460 + }, + { + "epoch": 0.6545855765350369, + "grad_norm": 0.002282580127939582, + "learning_rate": 3.469080655036521e-05, + "loss": 0.198526132106781, + "step": 152470 + }, + { + "epoch": 0.6546285086250568, + "grad_norm": 0.02770853415131569, + "learning_rate": 3.468649483024758e-05, + "loss": 0.11303834915161133, + "step": 152480 + }, + { + "epoch": 0.6546714407150769, + "grad_norm": 1.2699313163757324, + "learning_rate": 3.4682183110129955e-05, + "loss": 0.12168716192245484, + "step": 152490 + }, + { + "epoch": 0.6547143728050969, + "grad_norm": 0.0013995268382132053, + "learning_rate": 3.467787139001233e-05, + "loss": 0.3896394491195679, + "step": 152500 + }, + { + "epoch": 0.6547573048951169, + "grad_norm": 1.4256479740142822, + "learning_rate": 3.467355966989471e-05, + "loss": 0.36592926979064944, + "step": 152510 + }, + { + "epoch": 0.6548002369851369, + "grad_norm": 0.029613185673952103, + "learning_rate": 3.466924794977709e-05, + "loss": 0.20814926624298097, + "step": 152520 + }, + { + "epoch": 0.6548431690751569, + "grad_norm": 0.013652176596224308, + "learning_rate": 3.4664936229659464e-05, + "loss": 0.35039663314819336, + "step": 152530 + }, + { + "epoch": 0.6548861011651769, + "grad_norm": 0.19368019700050354, + "learning_rate": 3.466062450954184e-05, + "loss": 0.23914365768432616, + "step": 152540 + }, + { + "epoch": 0.6549290332551969, + "grad_norm": 0.04172434285283089, + "learning_rate": 3.465631278942422e-05, + "loss": 0.07054402232170105, + "step": 152550 + }, + { + "epoch": 0.654971965345217, + "grad_norm": 5.317892074584961, + "learning_rate": 3.465200106930659e-05, + "loss": 0.31213338375091554, + "step": 152560 + }, + { + "epoch": 0.6550148974352369, + "grad_norm": 1.7020102739334106, + "learning_rate": 3.464768934918897e-05, + "loss": 0.30658817291259766, + "step": 152570 + }, + { + "epoch": 0.6550578295252569, + "grad_norm": 0.022133029997348785, + "learning_rate": 3.4643377629071344e-05, + "loss": 0.01100047081708908, + "step": 152580 + }, + { + "epoch": 0.655100761615277, + "grad_norm": 0.11623332649469376, + "learning_rate": 3.463906590895372e-05, + "loss": 0.2962920665740967, + "step": 152590 + }, + { + "epoch": 0.655143693705297, + "grad_norm": 4.137857437133789, + "learning_rate": 3.463475418883609e-05, + "loss": 0.2276834726333618, + "step": 152600 + }, + { + "epoch": 0.655186625795317, + "grad_norm": 0.003172489581629634, + "learning_rate": 3.463044246871847e-05, + "loss": 0.13099607229232788, + "step": 152610 + }, + { + "epoch": 0.655229557885337, + "grad_norm": 0.11240309476852417, + "learning_rate": 3.462613074860085e-05, + "loss": 0.08598883152008056, + "step": 152620 + }, + { + "epoch": 0.655272489975357, + "grad_norm": 0.021000202745199203, + "learning_rate": 3.4621819028483224e-05, + "loss": 0.1656929850578308, + "step": 152630 + }, + { + "epoch": 0.655315422065377, + "grad_norm": 0.011905116029083729, + "learning_rate": 3.46175073083656e-05, + "loss": 0.2573047637939453, + "step": 152640 + }, + { + "epoch": 0.655358354155397, + "grad_norm": 1.581723690032959, + "learning_rate": 3.461319558824798e-05, + "loss": 0.13378567695617677, + "step": 152650 + }, + { + "epoch": 0.655401286245417, + "grad_norm": 0.045455001294612885, + "learning_rate": 3.4608883868130356e-05, + "loss": 0.20185463428497313, + "step": 152660 + }, + { + "epoch": 0.655444218335437, + "grad_norm": 2.161616802215576, + "learning_rate": 3.4604572148012733e-05, + "loss": 0.19833118915557862, + "step": 152670 + }, + { + "epoch": 0.655487150425457, + "grad_norm": 1.8970059156417847, + "learning_rate": 3.4600260427895104e-05, + "loss": 0.21300973892211914, + "step": 152680 + }, + { + "epoch": 0.6555300825154771, + "grad_norm": 0.0016901158960536122, + "learning_rate": 3.459594870777748e-05, + "loss": 0.1876598834991455, + "step": 152690 + }, + { + "epoch": 0.655573014605497, + "grad_norm": 0.0022294456139206886, + "learning_rate": 3.459163698765986e-05, + "loss": 0.10009009838104248, + "step": 152700 + }, + { + "epoch": 0.655615946695517, + "grad_norm": 1.3207972049713135, + "learning_rate": 3.4587325267542236e-05, + "loss": 0.2541964530944824, + "step": 152710 + }, + { + "epoch": 0.6556588787855371, + "grad_norm": 5.267035961151123, + "learning_rate": 3.458301354742461e-05, + "loss": 0.21954050064086914, + "step": 152720 + }, + { + "epoch": 0.655701810875557, + "grad_norm": 0.095136858522892, + "learning_rate": 3.4578701827306984e-05, + "loss": 0.32280683517456055, + "step": 152730 + }, + { + "epoch": 0.655744742965577, + "grad_norm": 0.012000875547528267, + "learning_rate": 3.457439010718936e-05, + "loss": 0.1790920615196228, + "step": 152740 + }, + { + "epoch": 0.6557876750555971, + "grad_norm": 0.22052302956581116, + "learning_rate": 3.457007838707174e-05, + "loss": 0.13050248622894287, + "step": 152750 + }, + { + "epoch": 0.655830607145617, + "grad_norm": 0.01903325505554676, + "learning_rate": 3.4565766666954116e-05, + "loss": 0.24872941970825196, + "step": 152760 + }, + { + "epoch": 0.6558735392356371, + "grad_norm": 0.03306184709072113, + "learning_rate": 3.456145494683649e-05, + "loss": 0.18790819644927978, + "step": 152770 + }, + { + "epoch": 0.6559164713256571, + "grad_norm": 0.03617790713906288, + "learning_rate": 3.455714322671887e-05, + "loss": 0.13181324005126954, + "step": 152780 + }, + { + "epoch": 0.655959403415677, + "grad_norm": 0.008156493306159973, + "learning_rate": 3.455283150660125e-05, + "loss": 0.24294917583465575, + "step": 152790 + }, + { + "epoch": 0.6560023355056971, + "grad_norm": 1.6839267015457153, + "learning_rate": 3.4548519786483625e-05, + "loss": 0.0995561957359314, + "step": 152800 + }, + { + "epoch": 0.6560452675957171, + "grad_norm": 0.001088878489099443, + "learning_rate": 3.4544208066365996e-05, + "loss": 0.17891314029693603, + "step": 152810 + }, + { + "epoch": 0.6560881996857371, + "grad_norm": 0.08594802021980286, + "learning_rate": 3.453989634624837e-05, + "loss": 0.18109216690063476, + "step": 152820 + }, + { + "epoch": 0.6561311317757571, + "grad_norm": 0.05384450778365135, + "learning_rate": 3.453558462613075e-05, + "loss": 0.12445077896118165, + "step": 152830 + }, + { + "epoch": 0.6561740638657771, + "grad_norm": 1.4890823364257812, + "learning_rate": 3.453127290601313e-05, + "loss": 0.27255203723907473, + "step": 152840 + }, + { + "epoch": 0.6562169959557971, + "grad_norm": 0.18443816900253296, + "learning_rate": 3.45269611858955e-05, + "loss": 0.1994839906692505, + "step": 152850 + }, + { + "epoch": 0.6562599280458171, + "grad_norm": 0.020347854122519493, + "learning_rate": 3.4522649465777876e-05, + "loss": 0.04077900350093842, + "step": 152860 + }, + { + "epoch": 0.6563028601358372, + "grad_norm": 0.04533800855278969, + "learning_rate": 3.451833774566025e-05, + "loss": 0.09720346331596375, + "step": 152870 + }, + { + "epoch": 0.6563457922258571, + "grad_norm": 0.034875430166721344, + "learning_rate": 3.451402602554264e-05, + "loss": 0.13401124477386475, + "step": 152880 + }, + { + "epoch": 0.6563887243158771, + "grad_norm": 8.106966972351074, + "learning_rate": 3.450971430542501e-05, + "loss": 0.41603717803955076, + "step": 152890 + }, + { + "epoch": 0.6564316564058972, + "grad_norm": 0.8586081862449646, + "learning_rate": 3.4505402585307385e-05, + "loss": 0.07474786639213563, + "step": 152900 + }, + { + "epoch": 0.6564745884959171, + "grad_norm": 0.012892269529402256, + "learning_rate": 3.450109086518976e-05, + "loss": 0.15219208002090454, + "step": 152910 + }, + { + "epoch": 0.6565175205859372, + "grad_norm": 0.4991026818752289, + "learning_rate": 3.449677914507214e-05, + "loss": 0.366585898399353, + "step": 152920 + }, + { + "epoch": 0.6565604526759572, + "grad_norm": 0.873711109161377, + "learning_rate": 3.449246742495451e-05, + "loss": 0.12484588623046874, + "step": 152930 + }, + { + "epoch": 0.6566033847659771, + "grad_norm": 0.053988225758075714, + "learning_rate": 3.448815570483689e-05, + "loss": 0.24912574291229247, + "step": 152940 + }, + { + "epoch": 0.6566463168559972, + "grad_norm": 0.012094683013856411, + "learning_rate": 3.4483843984719265e-05, + "loss": 0.07237531542778015, + "step": 152950 + }, + { + "epoch": 0.6566892489460172, + "grad_norm": 0.013880450278520584, + "learning_rate": 3.447953226460164e-05, + "loss": 0.2798666000366211, + "step": 152960 + }, + { + "epoch": 0.6567321810360371, + "grad_norm": 0.887079656124115, + "learning_rate": 3.447522054448401e-05, + "loss": 0.12565791606903076, + "step": 152970 + }, + { + "epoch": 0.6567751131260572, + "grad_norm": 0.07446973770856857, + "learning_rate": 3.447090882436639e-05, + "loss": 0.1282172441482544, + "step": 152980 + }, + { + "epoch": 0.6568180452160772, + "grad_norm": 0.004310452379286289, + "learning_rate": 3.4466597104248775e-05, + "loss": 0.24610447883605957, + "step": 152990 + }, + { + "epoch": 0.6568609773060972, + "grad_norm": 0.0427681989967823, + "learning_rate": 3.446228538413115e-05, + "loss": 0.06416921019554138, + "step": 153000 + }, + { + "epoch": 0.6568609773060972, + "eval_loss": 0.392218679189682, + "eval_runtime": 27.216, + "eval_samples_per_second": 3.674, + "eval_steps_per_second": 3.674, + "step": 153000 + }, + { + "epoch": 0.6569039093961172, + "grad_norm": 2.4584898948669434, + "learning_rate": 3.445797366401352e-05, + "loss": 0.24507856369018555, + "step": 153010 + }, + { + "epoch": 0.6569468414861372, + "grad_norm": 0.005726732779294252, + "learning_rate": 3.44536619438959e-05, + "loss": 0.10224964618682861, + "step": 153020 + }, + { + "epoch": 0.6569897735761573, + "grad_norm": 3.3387672901153564, + "learning_rate": 3.444935022377828e-05, + "loss": 0.314972448348999, + "step": 153030 + }, + { + "epoch": 0.6570327056661772, + "grad_norm": 0.008076757192611694, + "learning_rate": 3.4445038503660655e-05, + "loss": 0.22315526008605957, + "step": 153040 + }, + { + "epoch": 0.6570756377561973, + "grad_norm": 3.77646803855896, + "learning_rate": 3.4440726783543025e-05, + "loss": 0.2469933032989502, + "step": 153050 + }, + { + "epoch": 0.6571185698462173, + "grad_norm": 0.08149869740009308, + "learning_rate": 3.44364150634254e-05, + "loss": 0.15403306484222412, + "step": 153060 + }, + { + "epoch": 0.6571615019362372, + "grad_norm": 1.642274022102356, + "learning_rate": 3.443210334330778e-05, + "loss": 0.1979893922805786, + "step": 153070 + }, + { + "epoch": 0.6572044340262573, + "grad_norm": 0.8089116215705872, + "learning_rate": 3.442779162319016e-05, + "loss": 0.24776697158813477, + "step": 153080 + }, + { + "epoch": 0.6572473661162773, + "grad_norm": 0.008406427688896656, + "learning_rate": 3.442347990307253e-05, + "loss": 0.2347959041595459, + "step": 153090 + }, + { + "epoch": 0.6572902982062973, + "grad_norm": 0.2034047693014145, + "learning_rate": 3.441916818295491e-05, + "loss": 0.029045340418815613, + "step": 153100 + }, + { + "epoch": 0.6573332302963173, + "grad_norm": 1.0141544342041016, + "learning_rate": 3.441485646283729e-05, + "loss": 0.3198372840881348, + "step": 153110 + }, + { + "epoch": 0.6573761623863373, + "grad_norm": 0.04060515761375427, + "learning_rate": 3.4410544742719666e-05, + "loss": 0.24239850044250488, + "step": 153120 + }, + { + "epoch": 0.6574190944763573, + "grad_norm": 0.009104978293180466, + "learning_rate": 3.440623302260204e-05, + "loss": 0.14999579191207885, + "step": 153130 + }, + { + "epoch": 0.6574620265663773, + "grad_norm": 0.28482234477996826, + "learning_rate": 3.4401921302484414e-05, + "loss": 0.21846232414245606, + "step": 153140 + }, + { + "epoch": 0.6575049586563974, + "grad_norm": 0.02105596847832203, + "learning_rate": 3.439760958236679e-05, + "loss": 0.21020326614379883, + "step": 153150 + }, + { + "epoch": 0.6575478907464173, + "grad_norm": 1.977399230003357, + "learning_rate": 3.439329786224917e-05, + "loss": 0.10796371698379517, + "step": 153160 + }, + { + "epoch": 0.6575908228364373, + "grad_norm": 0.144552081823349, + "learning_rate": 3.4388986142131546e-05, + "loss": 0.234502911567688, + "step": 153170 + }, + { + "epoch": 0.6576337549264574, + "grad_norm": 3.9625911712646484, + "learning_rate": 3.438467442201392e-05, + "loss": 0.45007548332214353, + "step": 153180 + }, + { + "epoch": 0.6576766870164773, + "grad_norm": 0.006828643381595612, + "learning_rate": 3.4380362701896294e-05, + "loss": 0.22843713760375978, + "step": 153190 + }, + { + "epoch": 0.6577196191064973, + "grad_norm": 5.773648262023926, + "learning_rate": 3.437605098177867e-05, + "loss": 0.27328267097473147, + "step": 153200 + }, + { + "epoch": 0.6577625511965174, + "grad_norm": 0.2885953187942505, + "learning_rate": 3.437173926166105e-05, + "loss": 0.3828735828399658, + "step": 153210 + }, + { + "epoch": 0.6578054832865373, + "grad_norm": 0.6968337893486023, + "learning_rate": 3.4367427541543426e-05, + "loss": 0.2467254877090454, + "step": 153220 + }, + { + "epoch": 0.6578484153765574, + "grad_norm": 0.043182846158742905, + "learning_rate": 3.4363115821425804e-05, + "loss": 0.10045549869537354, + "step": 153230 + }, + { + "epoch": 0.6578913474665774, + "grad_norm": 6.219958782196045, + "learning_rate": 3.435880410130818e-05, + "loss": 0.2550558805465698, + "step": 153240 + }, + { + "epoch": 0.6579342795565973, + "grad_norm": 7.405364513397217, + "learning_rate": 3.435449238119056e-05, + "loss": 0.193221116065979, + "step": 153250 + }, + { + "epoch": 0.6579772116466174, + "grad_norm": 0.7110088467597961, + "learning_rate": 3.435018066107293e-05, + "loss": 0.1556612491607666, + "step": 153260 + }, + { + "epoch": 0.6580201437366374, + "grad_norm": 1.2682826519012451, + "learning_rate": 3.4345868940955306e-05, + "loss": 0.3977441072463989, + "step": 153270 + }, + { + "epoch": 0.6580630758266574, + "grad_norm": 0.35356810688972473, + "learning_rate": 3.4341557220837684e-05, + "loss": 0.2715872526168823, + "step": 153280 + }, + { + "epoch": 0.6581060079166774, + "grad_norm": 0.04433917999267578, + "learning_rate": 3.433724550072006e-05, + "loss": 0.14817949533462524, + "step": 153290 + }, + { + "epoch": 0.6581489400066974, + "grad_norm": 0.39730677008628845, + "learning_rate": 3.433293378060243e-05, + "loss": 0.2873368263244629, + "step": 153300 + }, + { + "epoch": 0.6581918720967174, + "grad_norm": 0.3087719976902008, + "learning_rate": 3.432862206048481e-05, + "loss": 0.1608600616455078, + "step": 153310 + }, + { + "epoch": 0.6582348041867374, + "grad_norm": 1.1279784440994263, + "learning_rate": 3.4324310340367186e-05, + "loss": 0.10123729705810547, + "step": 153320 + }, + { + "epoch": 0.6582777362767575, + "grad_norm": 0.03151925280690193, + "learning_rate": 3.4319998620249564e-05, + "loss": 0.1971789002418518, + "step": 153330 + }, + { + "epoch": 0.6583206683667774, + "grad_norm": 3.2033753395080566, + "learning_rate": 3.431568690013194e-05, + "loss": 0.16672835350036622, + "step": 153340 + }, + { + "epoch": 0.6583636004567974, + "grad_norm": 0.046538546681404114, + "learning_rate": 3.431137518001432e-05, + "loss": 0.03843323886394501, + "step": 153350 + }, + { + "epoch": 0.6584065325468175, + "grad_norm": 0.5804786682128906, + "learning_rate": 3.4307063459896696e-05, + "loss": 0.11106359958648682, + "step": 153360 + }, + { + "epoch": 0.6584494646368374, + "grad_norm": 3.5863006114959717, + "learning_rate": 3.430275173977907e-05, + "loss": 0.22137126922607422, + "step": 153370 + }, + { + "epoch": 0.6584923967268574, + "grad_norm": 4.670158386230469, + "learning_rate": 3.4298440019661444e-05, + "loss": 0.41120376586914065, + "step": 153380 + }, + { + "epoch": 0.6585353288168775, + "grad_norm": 0.27056336402893066, + "learning_rate": 3.429412829954382e-05, + "loss": 0.2656059980392456, + "step": 153390 + }, + { + "epoch": 0.6585782609068974, + "grad_norm": 0.010328982025384903, + "learning_rate": 3.42898165794262e-05, + "loss": 0.1735626220703125, + "step": 153400 + }, + { + "epoch": 0.6586211929969175, + "grad_norm": 1.4152470827102661, + "learning_rate": 3.4285504859308576e-05, + "loss": 0.14489773511886597, + "step": 153410 + }, + { + "epoch": 0.6586641250869375, + "grad_norm": 0.0291027519851923, + "learning_rate": 3.4281193139190946e-05, + "loss": 0.07788435220718384, + "step": 153420 + }, + { + "epoch": 0.6587070571769574, + "grad_norm": 0.8220972418785095, + "learning_rate": 3.4276881419073323e-05, + "loss": 0.33536629676818847, + "step": 153430 + }, + { + "epoch": 0.6587499892669775, + "grad_norm": 0.007435683626681566, + "learning_rate": 3.42725696989557e-05, + "loss": 0.19981803894042968, + "step": 153440 + }, + { + "epoch": 0.6587929213569975, + "grad_norm": 0.043424610048532486, + "learning_rate": 3.426825797883808e-05, + "loss": 0.21512467861175538, + "step": 153450 + }, + { + "epoch": 0.6588358534470176, + "grad_norm": 0.22515398263931274, + "learning_rate": 3.4263946258720455e-05, + "loss": 0.2515620946884155, + "step": 153460 + }, + { + "epoch": 0.6588787855370375, + "grad_norm": 1.116879940032959, + "learning_rate": 3.425963453860283e-05, + "loss": 0.38285002708435056, + "step": 153470 + }, + { + "epoch": 0.6589217176270575, + "grad_norm": 2.415517568588257, + "learning_rate": 3.425532281848521e-05, + "loss": 0.136501681804657, + "step": 153480 + }, + { + "epoch": 0.6589646497170776, + "grad_norm": 1.7895293235778809, + "learning_rate": 3.425101109836759e-05, + "loss": 0.3027945995330811, + "step": 153490 + }, + { + "epoch": 0.6590075818070975, + "grad_norm": 0.0065894764848053455, + "learning_rate": 3.4246699378249965e-05, + "loss": 0.016795614361763002, + "step": 153500 + }, + { + "epoch": 0.6590505138971176, + "grad_norm": 0.12253738194704056, + "learning_rate": 3.4242387658132335e-05, + "loss": 0.1453849196434021, + "step": 153510 + }, + { + "epoch": 0.6590934459871376, + "grad_norm": 0.04784093052148819, + "learning_rate": 3.423807593801471e-05, + "loss": 0.21482656002044678, + "step": 153520 + }, + { + "epoch": 0.6591363780771575, + "grad_norm": 0.014918447472155094, + "learning_rate": 3.423376421789709e-05, + "loss": 0.18660422563552856, + "step": 153530 + }, + { + "epoch": 0.6591793101671776, + "grad_norm": 0.23122480511665344, + "learning_rate": 3.422945249777947e-05, + "loss": 0.0963529109954834, + "step": 153540 + }, + { + "epoch": 0.6592222422571976, + "grad_norm": 1.2685788869857788, + "learning_rate": 3.422514077766184e-05, + "loss": 0.2472707748413086, + "step": 153550 + }, + { + "epoch": 0.6592651743472175, + "grad_norm": 0.004079438280314207, + "learning_rate": 3.4220829057544215e-05, + "loss": 0.49393181800842284, + "step": 153560 + }, + { + "epoch": 0.6593081064372376, + "grad_norm": 0.006530522368848324, + "learning_rate": 3.421651733742659e-05, + "loss": 0.09846093654632568, + "step": 153570 + }, + { + "epoch": 0.6593510385272576, + "grad_norm": 0.015483057126402855, + "learning_rate": 3.421220561730897e-05, + "loss": 0.24493045806884767, + "step": 153580 + }, + { + "epoch": 0.6593939706172776, + "grad_norm": 0.0026599576231092215, + "learning_rate": 3.420789389719135e-05, + "loss": 0.14597982168197632, + "step": 153590 + }, + { + "epoch": 0.6594369027072976, + "grad_norm": 0.002732135122641921, + "learning_rate": 3.4203582177073725e-05, + "loss": 0.1602466106414795, + "step": 153600 + }, + { + "epoch": 0.6594798347973176, + "grad_norm": 0.027440810576081276, + "learning_rate": 3.41992704569561e-05, + "loss": 0.14492573738098144, + "step": 153610 + }, + { + "epoch": 0.6595227668873376, + "grad_norm": 1.5758382081985474, + "learning_rate": 3.419495873683848e-05, + "loss": 0.17967160940170288, + "step": 153620 + }, + { + "epoch": 0.6595656989773576, + "grad_norm": 15.669425010681152, + "learning_rate": 3.419064701672085e-05, + "loss": 0.38480756282806394, + "step": 153630 + }, + { + "epoch": 0.6596086310673777, + "grad_norm": 0.011733450926840305, + "learning_rate": 3.418633529660323e-05, + "loss": 0.16886916160583496, + "step": 153640 + }, + { + "epoch": 0.6596515631573976, + "grad_norm": 0.05485690385103226, + "learning_rate": 3.4182023576485605e-05, + "loss": 0.1938277006149292, + "step": 153650 + }, + { + "epoch": 0.6596944952474176, + "grad_norm": 1.3380587100982666, + "learning_rate": 3.417771185636798e-05, + "loss": 0.23921914100646974, + "step": 153660 + }, + { + "epoch": 0.6597374273374377, + "grad_norm": 7.73290491104126, + "learning_rate": 3.417340013625035e-05, + "loss": 0.2546802043914795, + "step": 153670 + }, + { + "epoch": 0.6597803594274576, + "grad_norm": 0.003008150029927492, + "learning_rate": 3.416908841613273e-05, + "loss": 0.17785210609436036, + "step": 153680 + }, + { + "epoch": 0.6598232915174777, + "grad_norm": 0.9172478914260864, + "learning_rate": 3.416477669601511e-05, + "loss": 0.2208587646484375, + "step": 153690 + }, + { + "epoch": 0.6598662236074977, + "grad_norm": 13.522445678710938, + "learning_rate": 3.416046497589749e-05, + "loss": 0.15731308460235596, + "step": 153700 + }, + { + "epoch": 0.6599091556975176, + "grad_norm": 0.012087918817996979, + "learning_rate": 3.415615325577986e-05, + "loss": 0.24291951656341554, + "step": 153710 + }, + { + "epoch": 0.6599520877875377, + "grad_norm": 0.011719837784767151, + "learning_rate": 3.415184153566224e-05, + "loss": 0.4827817440032959, + "step": 153720 + }, + { + "epoch": 0.6599950198775577, + "grad_norm": 7.859041213989258, + "learning_rate": 3.414752981554462e-05, + "loss": 0.3543118953704834, + "step": 153730 + }, + { + "epoch": 0.6600379519675776, + "grad_norm": 2.688401460647583, + "learning_rate": 3.4143218095426994e-05, + "loss": 0.1372079849243164, + "step": 153740 + }, + { + "epoch": 0.6600808840575977, + "grad_norm": 8.203360557556152, + "learning_rate": 3.4138906375309365e-05, + "loss": 0.21513383388519286, + "step": 153750 + }, + { + "epoch": 0.6601238161476177, + "grad_norm": 1.9034103155136108, + "learning_rate": 3.413459465519174e-05, + "loss": 0.1283172607421875, + "step": 153760 + }, + { + "epoch": 0.6601667482376377, + "grad_norm": 1.7265187501907349, + "learning_rate": 3.413028293507412e-05, + "loss": 0.22327690124511718, + "step": 153770 + }, + { + "epoch": 0.6602096803276577, + "grad_norm": 2.916471242904663, + "learning_rate": 3.4125971214956497e-05, + "loss": 0.35086472034454347, + "step": 153780 + }, + { + "epoch": 0.6602526124176777, + "grad_norm": 0.002500958973541856, + "learning_rate": 3.412165949483887e-05, + "loss": 0.007671752572059631, + "step": 153790 + }, + { + "epoch": 0.6602955445076977, + "grad_norm": 0.0166263859719038, + "learning_rate": 3.4117347774721244e-05, + "loss": 0.1450344443321228, + "step": 153800 + }, + { + "epoch": 0.6603384765977177, + "grad_norm": 0.02013245038688183, + "learning_rate": 3.411303605460363e-05, + "loss": 0.14859265089035034, + "step": 153810 + }, + { + "epoch": 0.6603814086877378, + "grad_norm": 0.07011377066373825, + "learning_rate": 3.4108724334486006e-05, + "loss": 0.00771598219871521, + "step": 153820 + }, + { + "epoch": 0.6604243407777577, + "grad_norm": 0.023680120706558228, + "learning_rate": 3.4104412614368377e-05, + "loss": 0.25690062046051027, + "step": 153830 + }, + { + "epoch": 0.6604672728677777, + "grad_norm": 0.47995421290397644, + "learning_rate": 3.4100100894250754e-05, + "loss": 0.261479115486145, + "step": 153840 + }, + { + "epoch": 0.6605102049577978, + "grad_norm": 1.021770715713501, + "learning_rate": 3.409578917413313e-05, + "loss": 0.17420082092285155, + "step": 153850 + }, + { + "epoch": 0.6605531370478177, + "grad_norm": 0.33425313234329224, + "learning_rate": 3.409147745401551e-05, + "loss": 0.21330676078796387, + "step": 153860 + }, + { + "epoch": 0.6605960691378377, + "grad_norm": 0.009500543586909771, + "learning_rate": 3.4087165733897886e-05, + "loss": 0.31008138656616213, + "step": 153870 + }, + { + "epoch": 0.6606390012278578, + "grad_norm": 0.0007844572537578642, + "learning_rate": 3.4082854013780256e-05, + "loss": 0.23995554447174072, + "step": 153880 + }, + { + "epoch": 0.6606819333178778, + "grad_norm": 0.091548390686512, + "learning_rate": 3.4078542293662634e-05, + "loss": 0.18022990226745605, + "step": 153890 + }, + { + "epoch": 0.6607248654078978, + "grad_norm": 3.5057220458984375, + "learning_rate": 3.407423057354501e-05, + "loss": 0.16820919513702393, + "step": 153900 + }, + { + "epoch": 0.6607677974979178, + "grad_norm": 0.03062303550541401, + "learning_rate": 3.406991885342739e-05, + "loss": 0.054902291297912596, + "step": 153910 + }, + { + "epoch": 0.6608107295879379, + "grad_norm": 2.594014883041382, + "learning_rate": 3.4065607133309766e-05, + "loss": 0.036429685354232785, + "step": 153920 + }, + { + "epoch": 0.6608536616779578, + "grad_norm": 0.20943856239318848, + "learning_rate": 3.406129541319214e-05, + "loss": 0.2615687608718872, + "step": 153930 + }, + { + "epoch": 0.6608965937679778, + "grad_norm": 0.012907395139336586, + "learning_rate": 3.405698369307452e-05, + "loss": 0.19143770933151244, + "step": 153940 + }, + { + "epoch": 0.6609395258579979, + "grad_norm": 1.944933533668518, + "learning_rate": 3.40526719729569e-05, + "loss": 0.1128533959388733, + "step": 153950 + }, + { + "epoch": 0.6609824579480178, + "grad_norm": 0.04854433983564377, + "learning_rate": 3.404836025283927e-05, + "loss": 0.14492179155349733, + "step": 153960 + }, + { + "epoch": 0.6610253900380378, + "grad_norm": 0.0480261892080307, + "learning_rate": 3.4044048532721646e-05, + "loss": 0.07904550433158875, + "step": 153970 + }, + { + "epoch": 0.6610683221280579, + "grad_norm": 0.0033766271080821753, + "learning_rate": 3.403973681260402e-05, + "loss": 0.061150580644607544, + "step": 153980 + }, + { + "epoch": 0.6611112542180778, + "grad_norm": 0.29395806789398193, + "learning_rate": 3.40354250924864e-05, + "loss": 0.3163429021835327, + "step": 153990 + }, + { + "epoch": 0.6611541863080979, + "grad_norm": 0.004031799267977476, + "learning_rate": 3.403111337236877e-05, + "loss": 0.19167883396148683, + "step": 154000 + }, + { + "epoch": 0.6611541863080979, + "eval_loss": 0.40580880641937256, + "eval_runtime": 27.197, + "eval_samples_per_second": 3.677, + "eval_steps_per_second": 3.677, + "step": 154000 + }, + { + "epoch": 0.6611971183981179, + "grad_norm": 0.5312119722366333, + "learning_rate": 3.402680165225115e-05, + "loss": 0.1988541841506958, + "step": 154010 + }, + { + "epoch": 0.6612400504881378, + "grad_norm": 1.5345256328582764, + "learning_rate": 3.4022489932133526e-05, + "loss": 0.2834207534790039, + "step": 154020 + }, + { + "epoch": 0.6612829825781579, + "grad_norm": 0.19601844251155853, + "learning_rate": 3.40181782120159e-05, + "loss": 0.14888355731964112, + "step": 154030 + }, + { + "epoch": 0.6613259146681779, + "grad_norm": 0.00023016732302494347, + "learning_rate": 3.401386649189828e-05, + "loss": 0.20627593994140625, + "step": 154040 + }, + { + "epoch": 0.6613688467581978, + "grad_norm": 0.24135711789131165, + "learning_rate": 3.400955477178066e-05, + "loss": 0.2646329402923584, + "step": 154050 + }, + { + "epoch": 0.6614117788482179, + "grad_norm": 1.4794398546218872, + "learning_rate": 3.4005243051663035e-05, + "loss": 0.3707741737365723, + "step": 154060 + }, + { + "epoch": 0.6614547109382379, + "grad_norm": 0.36191725730895996, + "learning_rate": 3.400093133154541e-05, + "loss": 0.23887648582458496, + "step": 154070 + }, + { + "epoch": 0.6614976430282579, + "grad_norm": 0.01881462335586548, + "learning_rate": 3.399661961142778e-05, + "loss": 0.293936562538147, + "step": 154080 + }, + { + "epoch": 0.6615405751182779, + "grad_norm": 0.14759767055511475, + "learning_rate": 3.399230789131016e-05, + "loss": 0.07919049859046937, + "step": 154090 + }, + { + "epoch": 0.661583507208298, + "grad_norm": 0.9646201133728027, + "learning_rate": 3.398799617119254e-05, + "loss": 0.40249104499816896, + "step": 154100 + }, + { + "epoch": 0.6616264392983179, + "grad_norm": 0.00739828497171402, + "learning_rate": 3.3983684451074915e-05, + "loss": 0.3890352725982666, + "step": 154110 + }, + { + "epoch": 0.6616693713883379, + "grad_norm": 0.018785202875733376, + "learning_rate": 3.3979372730957286e-05, + "loss": 0.10795661211013793, + "step": 154120 + }, + { + "epoch": 0.661712303478358, + "grad_norm": 0.07079663127660751, + "learning_rate": 3.397506101083966e-05, + "loss": 0.2482135057449341, + "step": 154130 + }, + { + "epoch": 0.6617552355683779, + "grad_norm": 0.3511267304420471, + "learning_rate": 3.397074929072204e-05, + "loss": 0.2015997886657715, + "step": 154140 + }, + { + "epoch": 0.6617981676583979, + "grad_norm": 0.005133177153766155, + "learning_rate": 3.396643757060442e-05, + "loss": 0.29262471199035645, + "step": 154150 + }, + { + "epoch": 0.661841099748418, + "grad_norm": 0.030422838404774666, + "learning_rate": 3.3962125850486795e-05, + "loss": 0.25519423484802245, + "step": 154160 + }, + { + "epoch": 0.6618840318384379, + "grad_norm": 0.9170808792114258, + "learning_rate": 3.395781413036917e-05, + "loss": 0.20135126113891602, + "step": 154170 + }, + { + "epoch": 0.661926963928458, + "grad_norm": 1.5381137132644653, + "learning_rate": 3.395350241025155e-05, + "loss": 0.25586743354797364, + "step": 154180 + }, + { + "epoch": 0.661969896018478, + "grad_norm": 0.004393408540636301, + "learning_rate": 3.394919069013393e-05, + "loss": 0.09318647980690002, + "step": 154190 + }, + { + "epoch": 0.6620128281084979, + "grad_norm": 0.046787384897470474, + "learning_rate": 3.39448789700163e-05, + "loss": 0.15083513259887696, + "step": 154200 + }, + { + "epoch": 0.662055760198518, + "grad_norm": 0.5503677725791931, + "learning_rate": 3.3940567249898675e-05, + "loss": 0.04096742570400238, + "step": 154210 + }, + { + "epoch": 0.662098692288538, + "grad_norm": 2.4673948287963867, + "learning_rate": 3.393625552978105e-05, + "loss": 0.20666275024414063, + "step": 154220 + }, + { + "epoch": 0.662141624378558, + "grad_norm": 0.740168571472168, + "learning_rate": 3.393194380966343e-05, + "loss": 0.06488361954689026, + "step": 154230 + }, + { + "epoch": 0.662184556468578, + "grad_norm": 3.7524595260620117, + "learning_rate": 3.392763208954581e-05, + "loss": 0.31080782413482666, + "step": 154240 + }, + { + "epoch": 0.662227488558598, + "grad_norm": 3.262477159500122, + "learning_rate": 3.392332036942818e-05, + "loss": 0.19835619926452636, + "step": 154250 + }, + { + "epoch": 0.662270420648618, + "grad_norm": 0.24744971096515656, + "learning_rate": 3.3919008649310555e-05, + "loss": 0.26123824119567873, + "step": 154260 + }, + { + "epoch": 0.662313352738638, + "grad_norm": 0.019804177805781364, + "learning_rate": 3.391469692919293e-05, + "loss": 0.1702876567840576, + "step": 154270 + }, + { + "epoch": 0.662356284828658, + "grad_norm": 0.40778955817222595, + "learning_rate": 3.391038520907531e-05, + "loss": 0.13165427446365358, + "step": 154280 + }, + { + "epoch": 0.662399216918678, + "grad_norm": 0.00482986168935895, + "learning_rate": 3.390607348895769e-05, + "loss": 0.16334230899810792, + "step": 154290 + }, + { + "epoch": 0.662442149008698, + "grad_norm": 0.011098915711045265, + "learning_rate": 3.3901761768840064e-05, + "loss": 0.33003227710723876, + "step": 154300 + }, + { + "epoch": 0.6624850810987181, + "grad_norm": 0.035571079701185226, + "learning_rate": 3.389745004872244e-05, + "loss": 0.11664698123931885, + "step": 154310 + }, + { + "epoch": 0.6625280131887381, + "grad_norm": 0.027940262109041214, + "learning_rate": 3.389313832860482e-05, + "loss": 0.1556524395942688, + "step": 154320 + }, + { + "epoch": 0.662570945278758, + "grad_norm": 1.5852309465408325, + "learning_rate": 3.388882660848719e-05, + "loss": 0.35993731021881104, + "step": 154330 + }, + { + "epoch": 0.6626138773687781, + "grad_norm": 0.4660571217536926, + "learning_rate": 3.388451488836957e-05, + "loss": 0.16389789581298828, + "step": 154340 + }, + { + "epoch": 0.6626568094587981, + "grad_norm": 1.251645803451538, + "learning_rate": 3.3880203168251944e-05, + "loss": 0.12421987056732178, + "step": 154350 + }, + { + "epoch": 0.662699741548818, + "grad_norm": 1.0762736797332764, + "learning_rate": 3.387589144813432e-05, + "loss": 0.5422662258148193, + "step": 154360 + }, + { + "epoch": 0.6627426736388381, + "grad_norm": 1.0706373453140259, + "learning_rate": 3.387157972801669e-05, + "loss": 0.23794779777526856, + "step": 154370 + }, + { + "epoch": 0.6627856057288581, + "grad_norm": 0.021342789754271507, + "learning_rate": 3.386726800789907e-05, + "loss": 0.21014752388000488, + "step": 154380 + }, + { + "epoch": 0.6628285378188781, + "grad_norm": 0.0008740926277823746, + "learning_rate": 3.386295628778145e-05, + "loss": 0.08506430983543396, + "step": 154390 + }, + { + "epoch": 0.6628714699088981, + "grad_norm": 0.11168550699949265, + "learning_rate": 3.385864456766383e-05, + "loss": 0.26096253395080565, + "step": 154400 + }, + { + "epoch": 0.6629144019989182, + "grad_norm": 0.041811276227235794, + "learning_rate": 3.38543328475462e-05, + "loss": 0.11968740224838256, + "step": 154410 + }, + { + "epoch": 0.6629573340889381, + "grad_norm": 8.693405151367188, + "learning_rate": 3.385002112742858e-05, + "loss": 0.3119154691696167, + "step": 154420 + }, + { + "epoch": 0.6630002661789581, + "grad_norm": 0.019545461982488632, + "learning_rate": 3.3845709407310956e-05, + "loss": 0.12815322875976562, + "step": 154430 + }, + { + "epoch": 0.6630431982689782, + "grad_norm": 0.01368553377687931, + "learning_rate": 3.3841397687193333e-05, + "loss": 0.204140043258667, + "step": 154440 + }, + { + "epoch": 0.6630861303589981, + "grad_norm": 0.00397186353802681, + "learning_rate": 3.3837085967075704e-05, + "loss": 0.22590651512145996, + "step": 154450 + }, + { + "epoch": 0.6631290624490181, + "grad_norm": 0.10365869104862213, + "learning_rate": 3.383277424695808e-05, + "loss": 0.04067247211933136, + "step": 154460 + }, + { + "epoch": 0.6631719945390382, + "grad_norm": 0.9955406188964844, + "learning_rate": 3.382846252684046e-05, + "loss": 0.273392391204834, + "step": 154470 + }, + { + "epoch": 0.6632149266290581, + "grad_norm": 0.017841503024101257, + "learning_rate": 3.3824150806722836e-05, + "loss": 0.14227255582809448, + "step": 154480 + }, + { + "epoch": 0.6632578587190782, + "grad_norm": 0.018622983247041702, + "learning_rate": 3.3819839086605207e-05, + "loss": 0.15459182262420654, + "step": 154490 + }, + { + "epoch": 0.6633007908090982, + "grad_norm": 1.244162917137146, + "learning_rate": 3.3815527366487584e-05, + "loss": 0.3782146215438843, + "step": 154500 + }, + { + "epoch": 0.6633437228991181, + "grad_norm": 0.0622965507209301, + "learning_rate": 3.381121564636997e-05, + "loss": 0.3015340089797974, + "step": 154510 + }, + { + "epoch": 0.6633866549891382, + "grad_norm": 19.102006912231445, + "learning_rate": 3.3806903926252345e-05, + "loss": 0.23232758045196533, + "step": 154520 + }, + { + "epoch": 0.6634295870791582, + "grad_norm": 0.04483085870742798, + "learning_rate": 3.3802592206134716e-05, + "loss": 0.23261446952819825, + "step": 154530 + }, + { + "epoch": 0.6634725191691782, + "grad_norm": 1.1958906650543213, + "learning_rate": 3.379828048601709e-05, + "loss": 0.23122506141662597, + "step": 154540 + }, + { + "epoch": 0.6635154512591982, + "grad_norm": 0.018912751227617264, + "learning_rate": 3.379396876589947e-05, + "loss": 0.2717156410217285, + "step": 154550 + }, + { + "epoch": 0.6635583833492182, + "grad_norm": 0.008527401834726334, + "learning_rate": 3.378965704578185e-05, + "loss": 0.12283614873886109, + "step": 154560 + }, + { + "epoch": 0.6636013154392382, + "grad_norm": 2.4211113452911377, + "learning_rate": 3.378534532566422e-05, + "loss": 0.43242511749267576, + "step": 154570 + }, + { + "epoch": 0.6636442475292582, + "grad_norm": 0.01818317361176014, + "learning_rate": 3.3781033605546596e-05, + "loss": 0.25135438442230223, + "step": 154580 + }, + { + "epoch": 0.6636871796192783, + "grad_norm": 0.006349280942231417, + "learning_rate": 3.377672188542897e-05, + "loss": 0.242250657081604, + "step": 154590 + }, + { + "epoch": 0.6637301117092982, + "grad_norm": 0.003851179266348481, + "learning_rate": 3.377241016531135e-05, + "loss": 0.17027580738067627, + "step": 154600 + }, + { + "epoch": 0.6637730437993182, + "grad_norm": 3.806344985961914, + "learning_rate": 3.376809844519373e-05, + "loss": 0.26419591903686523, + "step": 154610 + }, + { + "epoch": 0.6638159758893383, + "grad_norm": 5.863420009613037, + "learning_rate": 3.3763786725076105e-05, + "loss": 0.42422032356262207, + "step": 154620 + }, + { + "epoch": 0.6638589079793582, + "grad_norm": 0.22056105732917786, + "learning_rate": 3.375947500495848e-05, + "loss": 0.06590731143951416, + "step": 154630 + }, + { + "epoch": 0.6639018400693782, + "grad_norm": 1.7405567169189453, + "learning_rate": 3.375516328484086e-05, + "loss": 0.13169472217559813, + "step": 154640 + }, + { + "epoch": 0.6639447721593983, + "grad_norm": 0.08242862671613693, + "learning_rate": 3.375085156472324e-05, + "loss": 0.3194495439529419, + "step": 154650 + }, + { + "epoch": 0.6639877042494182, + "grad_norm": 0.17024773359298706, + "learning_rate": 3.374653984460561e-05, + "loss": 0.22560033798217774, + "step": 154660 + }, + { + "epoch": 0.6640306363394383, + "grad_norm": 0.01813027262687683, + "learning_rate": 3.3742228124487985e-05, + "loss": 0.33473637104034426, + "step": 154670 + }, + { + "epoch": 0.6640735684294583, + "grad_norm": 0.1378939300775528, + "learning_rate": 3.373791640437036e-05, + "loss": 0.18422240018844604, + "step": 154680 + }, + { + "epoch": 0.6641165005194782, + "grad_norm": 0.44337424635887146, + "learning_rate": 3.373360468425274e-05, + "loss": 0.12879838943481445, + "step": 154690 + }, + { + "epoch": 0.6641594326094983, + "grad_norm": 0.04703165963292122, + "learning_rate": 3.372929296413511e-05, + "loss": 0.1979563355445862, + "step": 154700 + }, + { + "epoch": 0.6642023646995183, + "grad_norm": 0.0008391539449803531, + "learning_rate": 3.372498124401749e-05, + "loss": 0.21918249130249023, + "step": 154710 + }, + { + "epoch": 0.6642452967895383, + "grad_norm": 1.3081825971603394, + "learning_rate": 3.3720669523899865e-05, + "loss": 0.22604124546051024, + "step": 154720 + }, + { + "epoch": 0.6642882288795583, + "grad_norm": 0.26419490575790405, + "learning_rate": 3.371635780378224e-05, + "loss": 0.19358288049697875, + "step": 154730 + }, + { + "epoch": 0.6643311609695783, + "grad_norm": 0.19384314119815826, + "learning_rate": 3.371204608366462e-05, + "loss": 0.18703948259353637, + "step": 154740 + }, + { + "epoch": 0.6643740930595984, + "grad_norm": 0.0002898857928812504, + "learning_rate": 3.3707734363547e-05, + "loss": 0.23569769859313966, + "step": 154750 + }, + { + "epoch": 0.6644170251496183, + "grad_norm": 1.9487954378128052, + "learning_rate": 3.3703422643429375e-05, + "loss": 0.1675286054611206, + "step": 154760 + }, + { + "epoch": 0.6644599572396384, + "grad_norm": 4.060025691986084, + "learning_rate": 3.369911092331175e-05, + "loss": 0.2972411155700684, + "step": 154770 + }, + { + "epoch": 0.6645028893296584, + "grad_norm": 0.18675732612609863, + "learning_rate": 3.369479920319412e-05, + "loss": 0.2717970609664917, + "step": 154780 + }, + { + "epoch": 0.6645458214196783, + "grad_norm": 0.0183492973446846, + "learning_rate": 3.36904874830765e-05, + "loss": 0.16480090618133544, + "step": 154790 + }, + { + "epoch": 0.6645887535096984, + "grad_norm": 4.149248123168945, + "learning_rate": 3.368617576295888e-05, + "loss": 0.2041304349899292, + "step": 154800 + }, + { + "epoch": 0.6646316855997184, + "grad_norm": 0.18096356093883514, + "learning_rate": 3.3681864042841254e-05, + "loss": 0.14483392238616943, + "step": 154810 + }, + { + "epoch": 0.6646746176897383, + "grad_norm": 0.04419800266623497, + "learning_rate": 3.3677552322723625e-05, + "loss": 0.14780722856521605, + "step": 154820 + }, + { + "epoch": 0.6647175497797584, + "grad_norm": 2.218914747238159, + "learning_rate": 3.3673240602606e-05, + "loss": 0.39519851207733153, + "step": 154830 + }, + { + "epoch": 0.6647604818697784, + "grad_norm": 0.037824928760528564, + "learning_rate": 3.366892888248838e-05, + "loss": 0.1591792106628418, + "step": 154840 + }, + { + "epoch": 0.6648034139597984, + "grad_norm": 0.018527628853917122, + "learning_rate": 3.366461716237076e-05, + "loss": 0.14791398048400878, + "step": 154850 + }, + { + "epoch": 0.6648463460498184, + "grad_norm": 1.1151235103607178, + "learning_rate": 3.3660305442253134e-05, + "loss": 0.14960010051727296, + "step": 154860 + }, + { + "epoch": 0.6648892781398384, + "grad_norm": 1.7647981643676758, + "learning_rate": 3.365599372213551e-05, + "loss": 0.13042131662368775, + "step": 154870 + }, + { + "epoch": 0.6649322102298584, + "grad_norm": 0.005045225378125906, + "learning_rate": 3.365168200201789e-05, + "loss": 0.25216450691223147, + "step": 154880 + }, + { + "epoch": 0.6649751423198784, + "grad_norm": 0.37474727630615234, + "learning_rate": 3.3647370281900266e-05, + "loss": 0.12212558984756469, + "step": 154890 + }, + { + "epoch": 0.6650180744098985, + "grad_norm": 0.025088215246796608, + "learning_rate": 3.364305856178264e-05, + "loss": 0.13682304620742797, + "step": 154900 + }, + { + "epoch": 0.6650610064999184, + "grad_norm": 1.0522595643997192, + "learning_rate": 3.3638746841665014e-05, + "loss": 0.1837522029876709, + "step": 154910 + }, + { + "epoch": 0.6651039385899384, + "grad_norm": 0.06288671493530273, + "learning_rate": 3.363443512154739e-05, + "loss": 0.19506425857543946, + "step": 154920 + }, + { + "epoch": 0.6651468706799585, + "grad_norm": 0.042734645307064056, + "learning_rate": 3.363012340142977e-05, + "loss": 0.15663498640060425, + "step": 154930 + }, + { + "epoch": 0.6651898027699784, + "grad_norm": 0.15944519639015198, + "learning_rate": 3.362581168131214e-05, + "loss": 0.2185056686401367, + "step": 154940 + }, + { + "epoch": 0.6652327348599985, + "grad_norm": 0.002336745150387287, + "learning_rate": 3.362149996119452e-05, + "loss": 0.25134222507476806, + "step": 154950 + }, + { + "epoch": 0.6652756669500185, + "grad_norm": 0.17214441299438477, + "learning_rate": 3.3617188241076894e-05, + "loss": 0.14887669086456298, + "step": 154960 + }, + { + "epoch": 0.6653185990400384, + "grad_norm": 0.19424983859062195, + "learning_rate": 3.361287652095927e-05, + "loss": 0.3506969690322876, + "step": 154970 + }, + { + "epoch": 0.6653615311300585, + "grad_norm": 0.8946022391319275, + "learning_rate": 3.360856480084165e-05, + "loss": 0.2966755867004395, + "step": 154980 + }, + { + "epoch": 0.6654044632200785, + "grad_norm": 0.046246446669101715, + "learning_rate": 3.3604253080724026e-05, + "loss": 0.14898393154144288, + "step": 154990 + }, + { + "epoch": 0.6654473953100984, + "grad_norm": 3.2097201347351074, + "learning_rate": 3.3599941360606404e-05, + "loss": 0.25090641975402833, + "step": 155000 + }, + { + "epoch": 0.6654473953100984, + "eval_loss": 0.40841829776763916, + "eval_runtime": 27.1474, + "eval_samples_per_second": 3.684, + "eval_steps_per_second": 3.684, + "step": 155000 + }, + { + "epoch": 0.6654903274001185, + "grad_norm": 0.2784558832645416, + "learning_rate": 3.359562964048878e-05, + "loss": 0.05249403715133667, + "step": 155010 + }, + { + "epoch": 0.6655332594901385, + "grad_norm": 0.1900515854358673, + "learning_rate": 3.359131792037116e-05, + "loss": 0.135952889919281, + "step": 155020 + }, + { + "epoch": 0.6655761915801585, + "grad_norm": 0.12390508502721786, + "learning_rate": 3.358700620025353e-05, + "loss": 0.13241848945617676, + "step": 155030 + }, + { + "epoch": 0.6656191236701785, + "grad_norm": 1.9098988771438599, + "learning_rate": 3.3582694480135906e-05, + "loss": 0.18672947883605956, + "step": 155040 + }, + { + "epoch": 0.6656620557601985, + "grad_norm": 1.399539828300476, + "learning_rate": 3.3578382760018284e-05, + "loss": 0.24725022315979003, + "step": 155050 + }, + { + "epoch": 0.6657049878502185, + "grad_norm": 1.9422625303268433, + "learning_rate": 3.357407103990066e-05, + "loss": 0.18814780712127685, + "step": 155060 + }, + { + "epoch": 0.6657479199402385, + "grad_norm": 1.2785696983337402, + "learning_rate": 3.356975931978303e-05, + "loss": 0.21161410808563233, + "step": 155070 + }, + { + "epoch": 0.6657908520302586, + "grad_norm": 0.047315359115600586, + "learning_rate": 3.356544759966541e-05, + "loss": 0.190315580368042, + "step": 155080 + }, + { + "epoch": 0.6658337841202785, + "grad_norm": 0.07102714478969574, + "learning_rate": 3.3561135879547786e-05, + "loss": 0.2807221651077271, + "step": 155090 + }, + { + "epoch": 0.6658767162102985, + "grad_norm": 0.020219087600708008, + "learning_rate": 3.3556824159430164e-05, + "loss": 0.0993302583694458, + "step": 155100 + }, + { + "epoch": 0.6659196483003186, + "grad_norm": 0.0007031201384961605, + "learning_rate": 3.355251243931254e-05, + "loss": 0.2325591564178467, + "step": 155110 + }, + { + "epoch": 0.6659625803903385, + "grad_norm": 0.025428976863622665, + "learning_rate": 3.354820071919492e-05, + "loss": 0.3676584243774414, + "step": 155120 + }, + { + "epoch": 0.6660055124803586, + "grad_norm": 0.05800691992044449, + "learning_rate": 3.3543888999077296e-05, + "loss": 0.16004191637039183, + "step": 155130 + }, + { + "epoch": 0.6660484445703786, + "grad_norm": 0.02092166617512703, + "learning_rate": 3.353957727895967e-05, + "loss": 0.3744916200637817, + "step": 155140 + }, + { + "epoch": 0.6660913766603985, + "grad_norm": 0.005752600729465485, + "learning_rate": 3.3535265558842043e-05, + "loss": 0.10523773431777954, + "step": 155150 + }, + { + "epoch": 0.6661343087504186, + "grad_norm": 1.9601504802703857, + "learning_rate": 3.353095383872442e-05, + "loss": 0.31460092067718504, + "step": 155160 + }, + { + "epoch": 0.6661772408404386, + "grad_norm": 0.4158020615577698, + "learning_rate": 3.35266421186068e-05, + "loss": 0.23941915035247802, + "step": 155170 + }, + { + "epoch": 0.6662201729304587, + "grad_norm": 0.811876654624939, + "learning_rate": 3.3522330398489175e-05, + "loss": 0.3740152597427368, + "step": 155180 + }, + { + "epoch": 0.6662631050204786, + "grad_norm": 16.450340270996094, + "learning_rate": 3.3518018678371546e-05, + "loss": 0.25375795364379883, + "step": 155190 + }, + { + "epoch": 0.6663060371104986, + "grad_norm": 0.004651315044611692, + "learning_rate": 3.3513706958253923e-05, + "loss": 0.236892032623291, + "step": 155200 + }, + { + "epoch": 0.6663489692005187, + "grad_norm": 0.07660853117704391, + "learning_rate": 3.35093952381363e-05, + "loss": 0.3181604862213135, + "step": 155210 + }, + { + "epoch": 0.6663919012905386, + "grad_norm": 0.1353936493396759, + "learning_rate": 3.3505083518018685e-05, + "loss": 0.3340135097503662, + "step": 155220 + }, + { + "epoch": 0.6664348333805586, + "grad_norm": 1.6373554468154907, + "learning_rate": 3.3500771797901055e-05, + "loss": 0.27181158065795896, + "step": 155230 + }, + { + "epoch": 0.6664777654705787, + "grad_norm": 0.0005052989581599832, + "learning_rate": 3.349646007778343e-05, + "loss": 0.24267089366912842, + "step": 155240 + }, + { + "epoch": 0.6665206975605986, + "grad_norm": 0.0029463102109730244, + "learning_rate": 3.349214835766581e-05, + "loss": 0.2408508539199829, + "step": 155250 + }, + { + "epoch": 0.6665636296506187, + "grad_norm": 0.013165873475372791, + "learning_rate": 3.348783663754819e-05, + "loss": 0.24375631809234619, + "step": 155260 + }, + { + "epoch": 0.6666065617406387, + "grad_norm": 1.2235469818115234, + "learning_rate": 3.348352491743056e-05, + "loss": 0.25014405250549315, + "step": 155270 + }, + { + "epoch": 0.6666494938306586, + "grad_norm": 5.4037275314331055, + "learning_rate": 3.3479213197312935e-05, + "loss": 0.21150333881378175, + "step": 155280 + }, + { + "epoch": 0.6666924259206787, + "grad_norm": 1.3540924787521362, + "learning_rate": 3.347490147719531e-05, + "loss": 0.2414700984954834, + "step": 155290 + }, + { + "epoch": 0.6667353580106987, + "grad_norm": 0.0003013101522810757, + "learning_rate": 3.347058975707769e-05, + "loss": 0.23213748931884765, + "step": 155300 + }, + { + "epoch": 0.6667782901007187, + "grad_norm": 2.7998039722442627, + "learning_rate": 3.346627803696006e-05, + "loss": 0.22673821449279785, + "step": 155310 + }, + { + "epoch": 0.6668212221907387, + "grad_norm": 0.4135327935218811, + "learning_rate": 3.346196631684244e-05, + "loss": 0.015942250192165375, + "step": 155320 + }, + { + "epoch": 0.6668641542807587, + "grad_norm": 0.380653440952301, + "learning_rate": 3.345765459672482e-05, + "loss": 0.21530394554138182, + "step": 155330 + }, + { + "epoch": 0.6669070863707787, + "grad_norm": 0.003645789111033082, + "learning_rate": 3.34533428766072e-05, + "loss": 0.1611285090446472, + "step": 155340 + }, + { + "epoch": 0.6669500184607987, + "grad_norm": 0.00044938252540305257, + "learning_rate": 3.344903115648958e-05, + "loss": 0.3208571434020996, + "step": 155350 + }, + { + "epoch": 0.6669929505508188, + "grad_norm": 0.06646423786878586, + "learning_rate": 3.344471943637195e-05, + "loss": 0.03411422073841095, + "step": 155360 + }, + { + "epoch": 0.6670358826408387, + "grad_norm": 0.0701594427227974, + "learning_rate": 3.3440407716254325e-05, + "loss": 0.18013898134231568, + "step": 155370 + }, + { + "epoch": 0.6670788147308587, + "grad_norm": 0.015400223433971405, + "learning_rate": 3.34360959961367e-05, + "loss": 0.16810702085494994, + "step": 155380 + }, + { + "epoch": 0.6671217468208788, + "grad_norm": 1.3191877603530884, + "learning_rate": 3.343178427601908e-05, + "loss": 0.028722655773162842, + "step": 155390 + }, + { + "epoch": 0.6671646789108987, + "grad_norm": 2.3753511905670166, + "learning_rate": 3.342747255590145e-05, + "loss": 0.2831770420074463, + "step": 155400 + }, + { + "epoch": 0.6672076110009187, + "grad_norm": 0.5197781920433044, + "learning_rate": 3.342316083578383e-05, + "loss": 0.4160111427307129, + "step": 155410 + }, + { + "epoch": 0.6672505430909388, + "grad_norm": 0.0006538841407746077, + "learning_rate": 3.3418849115666205e-05, + "loss": 0.06287867426872254, + "step": 155420 + }, + { + "epoch": 0.6672934751809587, + "grad_norm": 0.037650611251592636, + "learning_rate": 3.341453739554858e-05, + "loss": 0.044908028841018674, + "step": 155430 + }, + { + "epoch": 0.6673364072709788, + "grad_norm": 0.0031805236358195543, + "learning_rate": 3.341022567543096e-05, + "loss": 0.059893810749053956, + "step": 155440 + }, + { + "epoch": 0.6673793393609988, + "grad_norm": 2.588815689086914, + "learning_rate": 3.340591395531334e-05, + "loss": 0.33464751243591306, + "step": 155450 + }, + { + "epoch": 0.6674222714510187, + "grad_norm": 0.06854215264320374, + "learning_rate": 3.3401602235195714e-05, + "loss": 0.12744616270065307, + "step": 155460 + }, + { + "epoch": 0.6674652035410388, + "grad_norm": 0.002928926143795252, + "learning_rate": 3.339729051507809e-05, + "loss": 0.2883869409561157, + "step": 155470 + }, + { + "epoch": 0.6675081356310588, + "grad_norm": 1.0097222328186035, + "learning_rate": 3.339297879496046e-05, + "loss": 0.1543585777282715, + "step": 155480 + }, + { + "epoch": 0.6675510677210788, + "grad_norm": 1.2233046293258667, + "learning_rate": 3.338866707484284e-05, + "loss": 0.16075161695480347, + "step": 155490 + }, + { + "epoch": 0.6675939998110988, + "grad_norm": 2.7273221015930176, + "learning_rate": 3.3384355354725217e-05, + "loss": 0.2358945608139038, + "step": 155500 + }, + { + "epoch": 0.6676369319011188, + "grad_norm": 0.0005102035938762128, + "learning_rate": 3.3380043634607594e-05, + "loss": 0.07890783548355103, + "step": 155510 + }, + { + "epoch": 0.6676798639911388, + "grad_norm": 0.056222833693027496, + "learning_rate": 3.3375731914489965e-05, + "loss": 0.19825621843338012, + "step": 155520 + }, + { + "epoch": 0.6677227960811588, + "grad_norm": 0.0347059927880764, + "learning_rate": 3.337142019437234e-05, + "loss": 0.09248555302619935, + "step": 155530 + }, + { + "epoch": 0.6677657281711789, + "grad_norm": 0.19800904393196106, + "learning_rate": 3.336710847425472e-05, + "loss": 0.11048685312271118, + "step": 155540 + }, + { + "epoch": 0.6678086602611988, + "grad_norm": 3.2383038997650146, + "learning_rate": 3.3362796754137097e-05, + "loss": 0.24872050285339356, + "step": 155550 + }, + { + "epoch": 0.6678515923512188, + "grad_norm": 0.050600819289684296, + "learning_rate": 3.3358485034019474e-05, + "loss": 0.39221649169921874, + "step": 155560 + }, + { + "epoch": 0.6678945244412389, + "grad_norm": 0.08034029603004456, + "learning_rate": 3.335417331390185e-05, + "loss": 0.2349752426147461, + "step": 155570 + }, + { + "epoch": 0.6679374565312588, + "grad_norm": 0.002499277936294675, + "learning_rate": 3.334986159378423e-05, + "loss": 0.007202136516571045, + "step": 155580 + }, + { + "epoch": 0.6679803886212788, + "grad_norm": 2.0072579383850098, + "learning_rate": 3.3345549873666606e-05, + "loss": 0.4786830902099609, + "step": 155590 + }, + { + "epoch": 0.6680233207112989, + "grad_norm": 0.13278234004974365, + "learning_rate": 3.3341238153548976e-05, + "loss": 0.04938144087791443, + "step": 155600 + }, + { + "epoch": 0.6680662528013189, + "grad_norm": 2.5657100677490234, + "learning_rate": 3.3336926433431354e-05, + "loss": 0.10031435489654542, + "step": 155610 + }, + { + "epoch": 0.6681091848913389, + "grad_norm": 0.006575672887265682, + "learning_rate": 3.333261471331373e-05, + "loss": 0.199531090259552, + "step": 155620 + }, + { + "epoch": 0.6681521169813589, + "grad_norm": 0.0015442880103364587, + "learning_rate": 3.332830299319611e-05, + "loss": 0.25533859729766845, + "step": 155630 + }, + { + "epoch": 0.6681950490713789, + "grad_norm": 0.11753270030021667, + "learning_rate": 3.332399127307848e-05, + "loss": 0.23512325286865235, + "step": 155640 + }, + { + "epoch": 0.6682379811613989, + "grad_norm": 0.043603476136922836, + "learning_rate": 3.3319679552960856e-05, + "loss": 0.2509476661682129, + "step": 155650 + }, + { + "epoch": 0.6682809132514189, + "grad_norm": 1.2565324306488037, + "learning_rate": 3.3315367832843234e-05, + "loss": 0.2644261598587036, + "step": 155660 + }, + { + "epoch": 0.668323845341439, + "grad_norm": 0.9435896277427673, + "learning_rate": 3.331105611272561e-05, + "loss": 0.3087204933166504, + "step": 155670 + }, + { + "epoch": 0.6683667774314589, + "grad_norm": 0.005109146703034639, + "learning_rate": 3.330674439260799e-05, + "loss": 0.30349709987640383, + "step": 155680 + }, + { + "epoch": 0.6684097095214789, + "grad_norm": 0.23565086722373962, + "learning_rate": 3.3302432672490366e-05, + "loss": 0.033765727281570436, + "step": 155690 + }, + { + "epoch": 0.668452641611499, + "grad_norm": 0.012045781128108501, + "learning_rate": 3.329812095237274e-05, + "loss": 0.32736949920654296, + "step": 155700 + }, + { + "epoch": 0.6684955737015189, + "grad_norm": 0.003710044315084815, + "learning_rate": 3.329380923225512e-05, + "loss": 0.12195107936859131, + "step": 155710 + }, + { + "epoch": 0.668538505791539, + "grad_norm": 0.02557937614619732, + "learning_rate": 3.32894975121375e-05, + "loss": 0.4231997013092041, + "step": 155720 + }, + { + "epoch": 0.668581437881559, + "grad_norm": 0.06147534027695656, + "learning_rate": 3.328518579201987e-05, + "loss": 0.27741050720214844, + "step": 155730 + }, + { + "epoch": 0.6686243699715789, + "grad_norm": 0.031692177057266235, + "learning_rate": 3.3280874071902246e-05, + "loss": 0.12817578315734862, + "step": 155740 + }, + { + "epoch": 0.668667302061599, + "grad_norm": 12.631524085998535, + "learning_rate": 3.327656235178462e-05, + "loss": 0.19571025371551515, + "step": 155750 + }, + { + "epoch": 0.668710234151619, + "grad_norm": 2.5644569396972656, + "learning_rate": 3.3272250631667e-05, + "loss": 0.21142849922180176, + "step": 155760 + }, + { + "epoch": 0.6687531662416389, + "grad_norm": 2.2528250217437744, + "learning_rate": 3.326793891154937e-05, + "loss": 0.11066330671310425, + "step": 155770 + }, + { + "epoch": 0.668796098331659, + "grad_norm": 0.1499422937631607, + "learning_rate": 3.326362719143175e-05, + "loss": 0.14271847009658814, + "step": 155780 + }, + { + "epoch": 0.668839030421679, + "grad_norm": 0.07106486707925797, + "learning_rate": 3.3259315471314126e-05, + "loss": 0.03233225643634796, + "step": 155790 + }, + { + "epoch": 0.668881962511699, + "grad_norm": 0.09190256148576736, + "learning_rate": 3.32550037511965e-05, + "loss": 0.20338826179504393, + "step": 155800 + }, + { + "epoch": 0.668924894601719, + "grad_norm": 0.26690569519996643, + "learning_rate": 3.325069203107888e-05, + "loss": 0.19002293348312377, + "step": 155810 + }, + { + "epoch": 0.668967826691739, + "grad_norm": 2.454503297805786, + "learning_rate": 3.324638031096126e-05, + "loss": 0.3581833839416504, + "step": 155820 + }, + { + "epoch": 0.669010758781759, + "grad_norm": 6.0999755859375, + "learning_rate": 3.3242068590843635e-05, + "loss": 0.3228771686553955, + "step": 155830 + }, + { + "epoch": 0.669053690871779, + "grad_norm": 0.006542257498949766, + "learning_rate": 3.323775687072601e-05, + "loss": 0.21552627086639403, + "step": 155840 + }, + { + "epoch": 0.6690966229617991, + "grad_norm": 1.2118092775344849, + "learning_rate": 3.323344515060838e-05, + "loss": 0.26827244758605956, + "step": 155850 + }, + { + "epoch": 0.669139555051819, + "grad_norm": 0.0797976404428482, + "learning_rate": 3.322913343049076e-05, + "loss": 0.22456207275390624, + "step": 155860 + }, + { + "epoch": 0.669182487141839, + "grad_norm": 0.2964301109313965, + "learning_rate": 3.322482171037314e-05, + "loss": 0.2766679048538208, + "step": 155870 + }, + { + "epoch": 0.6692254192318591, + "grad_norm": 3.1502134799957275, + "learning_rate": 3.3220509990255515e-05, + "loss": 0.36197125911712646, + "step": 155880 + }, + { + "epoch": 0.669268351321879, + "grad_norm": 0.03810900077223778, + "learning_rate": 3.3216198270137886e-05, + "loss": 0.14142590761184692, + "step": 155890 + }, + { + "epoch": 0.669311283411899, + "grad_norm": 2.3641881942749023, + "learning_rate": 3.321188655002026e-05, + "loss": 0.19493041038513184, + "step": 155900 + }, + { + "epoch": 0.6693542155019191, + "grad_norm": 42.50188064575195, + "learning_rate": 3.320757482990264e-05, + "loss": 0.16444485187530516, + "step": 155910 + }, + { + "epoch": 0.669397147591939, + "grad_norm": 0.01806536316871643, + "learning_rate": 3.3203263109785024e-05, + "loss": 0.36703317165374755, + "step": 155920 + }, + { + "epoch": 0.6694400796819591, + "grad_norm": 0.08311089873313904, + "learning_rate": 3.3198951389667395e-05, + "loss": 0.24496970176696778, + "step": 155930 + }, + { + "epoch": 0.6694830117719791, + "grad_norm": 0.0521310456097126, + "learning_rate": 3.319463966954977e-05, + "loss": 0.2650176763534546, + "step": 155940 + }, + { + "epoch": 0.669525943861999, + "grad_norm": 0.10356919467449188, + "learning_rate": 3.319032794943215e-05, + "loss": 0.2823972702026367, + "step": 155950 + }, + { + "epoch": 0.6695688759520191, + "grad_norm": 0.0009236885816790164, + "learning_rate": 3.318601622931453e-05, + "loss": 0.09366688132286072, + "step": 155960 + }, + { + "epoch": 0.6696118080420391, + "grad_norm": 0.03426114842295647, + "learning_rate": 3.31817045091969e-05, + "loss": 0.2280339241027832, + "step": 155970 + }, + { + "epoch": 0.6696547401320591, + "grad_norm": 0.09179901331663132, + "learning_rate": 3.3177392789079275e-05, + "loss": 0.17582715749740602, + "step": 155980 + }, + { + "epoch": 0.6696976722220791, + "grad_norm": 0.04506692662835121, + "learning_rate": 3.317308106896165e-05, + "loss": 0.08360955119132996, + "step": 155990 + }, + { + "epoch": 0.6697406043120991, + "grad_norm": 1.34595787525177, + "learning_rate": 3.316876934884403e-05, + "loss": 0.20075461864471436, + "step": 156000 + }, + { + "epoch": 0.6697406043120991, + "eval_loss": 0.39436066150665283, + "eval_runtime": 27.161, + "eval_samples_per_second": 3.682, + "eval_steps_per_second": 3.682, + "step": 156000 + }, + { + "epoch": 0.6697835364021191, + "grad_norm": 1.7346657514572144, + "learning_rate": 3.31644576287264e-05, + "loss": 0.31329498291015623, + "step": 156010 + }, + { + "epoch": 0.6698264684921391, + "grad_norm": 0.012260986492037773, + "learning_rate": 3.316014590860878e-05, + "loss": 0.14117549657821654, + "step": 156020 + }, + { + "epoch": 0.6698694005821592, + "grad_norm": 0.07373584061861038, + "learning_rate": 3.315583418849116e-05, + "loss": 0.07356157898902893, + "step": 156030 + }, + { + "epoch": 0.6699123326721792, + "grad_norm": 1.865148663520813, + "learning_rate": 3.315152246837354e-05, + "loss": 0.31308043003082275, + "step": 156040 + }, + { + "epoch": 0.6699552647621991, + "grad_norm": 0.000529519107658416, + "learning_rate": 3.3147210748255916e-05, + "loss": 0.264267110824585, + "step": 156050 + }, + { + "epoch": 0.6699981968522192, + "grad_norm": 0.051425471901893616, + "learning_rate": 3.314289902813829e-05, + "loss": 0.27402348518371583, + "step": 156060 + }, + { + "epoch": 0.6700411289422392, + "grad_norm": 0.07035046815872192, + "learning_rate": 3.3138587308020664e-05, + "loss": 0.20741422176361085, + "step": 156070 + }, + { + "epoch": 0.6700840610322591, + "grad_norm": 0.03581790253520012, + "learning_rate": 3.313427558790304e-05, + "loss": 0.16281461715698242, + "step": 156080 + }, + { + "epoch": 0.6701269931222792, + "grad_norm": 1.2740488052368164, + "learning_rate": 3.312996386778542e-05, + "loss": 0.18876116275787352, + "step": 156090 + }, + { + "epoch": 0.6701699252122992, + "grad_norm": 1.7908412218093872, + "learning_rate": 3.312565214766779e-05, + "loss": 0.24298477172851562, + "step": 156100 + }, + { + "epoch": 0.6702128573023192, + "grad_norm": 0.593454897403717, + "learning_rate": 3.312134042755017e-05, + "loss": 0.2801332473754883, + "step": 156110 + }, + { + "epoch": 0.6702557893923392, + "grad_norm": 1.5664280652999878, + "learning_rate": 3.3117028707432544e-05, + "loss": 0.22960739135742186, + "step": 156120 + }, + { + "epoch": 0.6702987214823592, + "grad_norm": 0.19571813941001892, + "learning_rate": 3.311271698731492e-05, + "loss": 0.00823623463511467, + "step": 156130 + }, + { + "epoch": 0.6703416535723792, + "grad_norm": 0.0967881977558136, + "learning_rate": 3.31084052671973e-05, + "loss": 0.3439939498901367, + "step": 156140 + }, + { + "epoch": 0.6703845856623992, + "grad_norm": 1.896458387374878, + "learning_rate": 3.3104093547079676e-05, + "loss": 0.33332931995391846, + "step": 156150 + }, + { + "epoch": 0.6704275177524193, + "grad_norm": 0.15472471714019775, + "learning_rate": 3.3099781826962053e-05, + "loss": 0.006204599142074585, + "step": 156160 + }, + { + "epoch": 0.6704704498424392, + "grad_norm": 0.8047659397125244, + "learning_rate": 3.309547010684443e-05, + "loss": 0.07587890028953552, + "step": 156170 + }, + { + "epoch": 0.6705133819324592, + "grad_norm": 0.0013157215435057878, + "learning_rate": 3.30911583867268e-05, + "loss": 0.32708396911621096, + "step": 156180 + }, + { + "epoch": 0.6705563140224793, + "grad_norm": 2.68961501121521, + "learning_rate": 3.308684666660918e-05, + "loss": 0.15513334274291993, + "step": 156190 + }, + { + "epoch": 0.6705992461124992, + "grad_norm": 0.21397237479686737, + "learning_rate": 3.3082534946491556e-05, + "loss": 0.1734054684638977, + "step": 156200 + }, + { + "epoch": 0.6706421782025193, + "grad_norm": 0.763003408908844, + "learning_rate": 3.3078223226373933e-05, + "loss": 0.3408956527709961, + "step": 156210 + }, + { + "epoch": 0.6706851102925393, + "grad_norm": 2.3349876403808594, + "learning_rate": 3.3073911506256304e-05, + "loss": 0.4397918701171875, + "step": 156220 + }, + { + "epoch": 0.6707280423825592, + "grad_norm": 0.10397858917713165, + "learning_rate": 3.306959978613868e-05, + "loss": 0.3239002227783203, + "step": 156230 + }, + { + "epoch": 0.6707709744725793, + "grad_norm": 0.04382269084453583, + "learning_rate": 3.306528806602106e-05, + "loss": 0.14653106927871704, + "step": 156240 + }, + { + "epoch": 0.6708139065625993, + "grad_norm": 0.061937592923641205, + "learning_rate": 3.3060976345903436e-05, + "loss": 0.10898768901824951, + "step": 156250 + }, + { + "epoch": 0.6708568386526192, + "grad_norm": 0.19617868959903717, + "learning_rate": 3.305666462578581e-05, + "loss": 0.041546297073364255, + "step": 156260 + }, + { + "epoch": 0.6708997707426393, + "grad_norm": 1.3481838703155518, + "learning_rate": 3.305235290566819e-05, + "loss": 0.04840826690196991, + "step": 156270 + }, + { + "epoch": 0.6709427028326593, + "grad_norm": 0.003372886683791876, + "learning_rate": 3.304804118555057e-05, + "loss": 0.034439802169799805, + "step": 156280 + }, + { + "epoch": 0.6709856349226793, + "grad_norm": 2.043133020401001, + "learning_rate": 3.3043729465432945e-05, + "loss": 0.48386592864990235, + "step": 156290 + }, + { + "epoch": 0.6710285670126993, + "grad_norm": 0.006714210379868746, + "learning_rate": 3.3039417745315316e-05, + "loss": 0.08989254236221314, + "step": 156300 + }, + { + "epoch": 0.6710714991027193, + "grad_norm": 0.026555676013231277, + "learning_rate": 3.303510602519769e-05, + "loss": 0.23003294467926025, + "step": 156310 + }, + { + "epoch": 0.6711144311927393, + "grad_norm": 0.6874704957008362, + "learning_rate": 3.303079430508007e-05, + "loss": 0.1800214648246765, + "step": 156320 + }, + { + "epoch": 0.6711573632827593, + "grad_norm": 0.00029299809830263257, + "learning_rate": 3.302648258496245e-05, + "loss": 0.14955276250839233, + "step": 156330 + }, + { + "epoch": 0.6712002953727794, + "grad_norm": 0.04590775445103645, + "learning_rate": 3.302217086484482e-05, + "loss": 0.2385624885559082, + "step": 156340 + }, + { + "epoch": 0.6712432274627993, + "grad_norm": 0.23930826783180237, + "learning_rate": 3.3017859144727196e-05, + "loss": 0.15384535789489745, + "step": 156350 + }, + { + "epoch": 0.6712861595528193, + "grad_norm": 0.03432545065879822, + "learning_rate": 3.301354742460957e-05, + "loss": 0.2647146463394165, + "step": 156360 + }, + { + "epoch": 0.6713290916428394, + "grad_norm": 0.3528565466403961, + "learning_rate": 3.300923570449195e-05, + "loss": 0.15096168518066405, + "step": 156370 + }, + { + "epoch": 0.6713720237328593, + "grad_norm": 0.0011686889920383692, + "learning_rate": 3.300492398437433e-05, + "loss": 0.15718039274215698, + "step": 156380 + }, + { + "epoch": 0.6714149558228794, + "grad_norm": 0.07616129517555237, + "learning_rate": 3.3000612264256705e-05, + "loss": 0.144598650932312, + "step": 156390 + }, + { + "epoch": 0.6714578879128994, + "grad_norm": 3.6899969577789307, + "learning_rate": 3.299630054413908e-05, + "loss": 0.21471176147460938, + "step": 156400 + }, + { + "epoch": 0.6715008200029193, + "grad_norm": 1.9858283996582031, + "learning_rate": 3.299198882402146e-05, + "loss": 0.3098663091659546, + "step": 156410 + }, + { + "epoch": 0.6715437520929394, + "grad_norm": 1.5402371883392334, + "learning_rate": 3.298767710390384e-05, + "loss": 0.06938784122467041, + "step": 156420 + }, + { + "epoch": 0.6715866841829594, + "grad_norm": 0.005206105764955282, + "learning_rate": 3.298336538378621e-05, + "loss": 0.1276816248893738, + "step": 156430 + }, + { + "epoch": 0.6716296162729793, + "grad_norm": 0.01194666512310505, + "learning_rate": 3.2979053663668585e-05, + "loss": 0.08454192280769349, + "step": 156440 + }, + { + "epoch": 0.6716725483629994, + "grad_norm": 0.026871588081121445, + "learning_rate": 3.297474194355096e-05, + "loss": 0.15202608108520507, + "step": 156450 + }, + { + "epoch": 0.6717154804530194, + "grad_norm": 0.31858089566230774, + "learning_rate": 3.297043022343334e-05, + "loss": 0.20315639972686766, + "step": 156460 + }, + { + "epoch": 0.6717584125430395, + "grad_norm": 0.0001176847072201781, + "learning_rate": 3.296611850331571e-05, + "loss": 0.0934857428073883, + "step": 156470 + }, + { + "epoch": 0.6718013446330594, + "grad_norm": 0.00044366047950461507, + "learning_rate": 3.296180678319809e-05, + "loss": 0.3745831727981567, + "step": 156480 + }, + { + "epoch": 0.6718442767230794, + "grad_norm": 3.130305051803589, + "learning_rate": 3.2957495063080465e-05, + "loss": 0.2931442022323608, + "step": 156490 + }, + { + "epoch": 0.6718872088130995, + "grad_norm": 0.0010793895926326513, + "learning_rate": 3.295318334296284e-05, + "loss": 0.10714174509048462, + "step": 156500 + }, + { + "epoch": 0.6719301409031194, + "grad_norm": 1.57980215549469, + "learning_rate": 3.294887162284522e-05, + "loss": 0.11323796510696411, + "step": 156510 + }, + { + "epoch": 0.6719730729931395, + "grad_norm": 2.0298821926116943, + "learning_rate": 3.29445599027276e-05, + "loss": 0.15041579008102418, + "step": 156520 + }, + { + "epoch": 0.6720160050831595, + "grad_norm": 0.04598362371325493, + "learning_rate": 3.2940248182609974e-05, + "loss": 0.07781398296356201, + "step": 156530 + }, + { + "epoch": 0.6720589371731794, + "grad_norm": 0.01166477520018816, + "learning_rate": 3.293593646249235e-05, + "loss": 0.10332920551300048, + "step": 156540 + }, + { + "epoch": 0.6721018692631995, + "grad_norm": 0.9563781023025513, + "learning_rate": 3.293162474237472e-05, + "loss": 0.23160524368286134, + "step": 156550 + }, + { + "epoch": 0.6721448013532195, + "grad_norm": 2.392322301864624, + "learning_rate": 3.29273130222571e-05, + "loss": 0.17795255184173583, + "step": 156560 + }, + { + "epoch": 0.6721877334432395, + "grad_norm": 0.007823570631444454, + "learning_rate": 3.292300130213948e-05, + "loss": 0.06307123899459839, + "step": 156570 + }, + { + "epoch": 0.6722306655332595, + "grad_norm": 2.6560440063476562, + "learning_rate": 3.2918689582021854e-05, + "loss": 0.35802664756774905, + "step": 156580 + }, + { + "epoch": 0.6722735976232795, + "grad_norm": 2.988377332687378, + "learning_rate": 3.2914377861904225e-05, + "loss": 0.1856152057647705, + "step": 156590 + }, + { + "epoch": 0.6723165297132995, + "grad_norm": 0.003383405040949583, + "learning_rate": 3.29100661417866e-05, + "loss": 0.12469416856765747, + "step": 156600 + }, + { + "epoch": 0.6723594618033195, + "grad_norm": 3.6829562187194824, + "learning_rate": 3.290575442166898e-05, + "loss": 0.012735322117805481, + "step": 156610 + }, + { + "epoch": 0.6724023938933396, + "grad_norm": 2.7931692600250244, + "learning_rate": 3.2901442701551364e-05, + "loss": 0.2340226173400879, + "step": 156620 + }, + { + "epoch": 0.6724453259833595, + "grad_norm": 1.3279739618301392, + "learning_rate": 3.2897130981433734e-05, + "loss": 0.3694106101989746, + "step": 156630 + }, + { + "epoch": 0.6724882580733795, + "grad_norm": 1.672303557395935, + "learning_rate": 3.289281926131611e-05, + "loss": 0.08186525702476502, + "step": 156640 + }, + { + "epoch": 0.6725311901633996, + "grad_norm": 0.005892850458621979, + "learning_rate": 3.288850754119849e-05, + "loss": 0.33636016845703126, + "step": 156650 + }, + { + "epoch": 0.6725741222534195, + "grad_norm": 0.17561239004135132, + "learning_rate": 3.2884195821080866e-05, + "loss": 0.1381277322769165, + "step": 156660 + }, + { + "epoch": 0.6726170543434395, + "grad_norm": 0.014752616174519062, + "learning_rate": 3.287988410096324e-05, + "loss": 0.1700663685798645, + "step": 156670 + }, + { + "epoch": 0.6726599864334596, + "grad_norm": 7.623488903045654, + "learning_rate": 3.2875572380845614e-05, + "loss": 0.3724008083343506, + "step": 156680 + }, + { + "epoch": 0.6727029185234795, + "grad_norm": 4.932303428649902, + "learning_rate": 3.287126066072799e-05, + "loss": 0.3138710975646973, + "step": 156690 + }, + { + "epoch": 0.6727458506134996, + "grad_norm": 0.005752002354711294, + "learning_rate": 3.286694894061037e-05, + "loss": 0.11339826583862304, + "step": 156700 + }, + { + "epoch": 0.6727887827035196, + "grad_norm": 0.06204256787896156, + "learning_rate": 3.286263722049274e-05, + "loss": 0.21384499073028565, + "step": 156710 + }, + { + "epoch": 0.6728317147935395, + "grad_norm": 0.004700512159615755, + "learning_rate": 3.285832550037512e-05, + "loss": 0.13949614763259888, + "step": 156720 + }, + { + "epoch": 0.6728746468835596, + "grad_norm": 7.897902965545654, + "learning_rate": 3.28540137802575e-05, + "loss": 0.22153210639953613, + "step": 156730 + }, + { + "epoch": 0.6729175789735796, + "grad_norm": 0.01706327125430107, + "learning_rate": 3.284970206013988e-05, + "loss": 0.15281535387039186, + "step": 156740 + }, + { + "epoch": 0.6729605110635996, + "grad_norm": 11.205368041992188, + "learning_rate": 3.284539034002225e-05, + "loss": 0.12583911418914795, + "step": 156750 + }, + { + "epoch": 0.6730034431536196, + "grad_norm": 1.768643856048584, + "learning_rate": 3.2841078619904626e-05, + "loss": 0.3392418622970581, + "step": 156760 + }, + { + "epoch": 0.6730463752436396, + "grad_norm": 0.09945378452539444, + "learning_rate": 3.2836766899787004e-05, + "loss": 0.13553329706192016, + "step": 156770 + }, + { + "epoch": 0.6730893073336596, + "grad_norm": 0.013488375581800938, + "learning_rate": 3.283245517966938e-05, + "loss": 0.2017906427383423, + "step": 156780 + }, + { + "epoch": 0.6731322394236796, + "grad_norm": 0.17563210427761078, + "learning_rate": 3.282814345955176e-05, + "loss": 0.18367542028427125, + "step": 156790 + }, + { + "epoch": 0.6731751715136997, + "grad_norm": 2.4380056858062744, + "learning_rate": 3.282383173943413e-05, + "loss": 0.3962520360946655, + "step": 156800 + }, + { + "epoch": 0.6732181036037196, + "grad_norm": 1.7807360887527466, + "learning_rate": 3.2819520019316506e-05, + "loss": 0.17535316944122314, + "step": 156810 + }, + { + "epoch": 0.6732610356937396, + "grad_norm": 0.03467885032296181, + "learning_rate": 3.2815208299198884e-05, + "loss": 0.09742856621742249, + "step": 156820 + }, + { + "epoch": 0.6733039677837597, + "grad_norm": 0.9437536597251892, + "learning_rate": 3.281089657908126e-05, + "loss": 0.21734330654144288, + "step": 156830 + }, + { + "epoch": 0.6733468998737796, + "grad_norm": 1.783363699913025, + "learning_rate": 3.280658485896364e-05, + "loss": 0.28152852058410643, + "step": 156840 + }, + { + "epoch": 0.6733898319637996, + "grad_norm": 0.009937250055372715, + "learning_rate": 3.2802273138846016e-05, + "loss": 0.18057806491851808, + "step": 156850 + }, + { + "epoch": 0.6734327640538197, + "grad_norm": 0.0626106932759285, + "learning_rate": 3.279796141872839e-05, + "loss": 0.12527823448181152, + "step": 156860 + }, + { + "epoch": 0.6734756961438396, + "grad_norm": 0.024701496586203575, + "learning_rate": 3.279364969861077e-05, + "loss": 0.073065185546875, + "step": 156870 + }, + { + "epoch": 0.6735186282338597, + "grad_norm": 0.019258178770542145, + "learning_rate": 3.278933797849314e-05, + "loss": 0.03922346830368042, + "step": 156880 + }, + { + "epoch": 0.6735615603238797, + "grad_norm": 6.451083183288574, + "learning_rate": 3.278502625837552e-05, + "loss": 0.3139482498168945, + "step": 156890 + }, + { + "epoch": 0.6736044924138997, + "grad_norm": 2.2330548763275146, + "learning_rate": 3.2780714538257896e-05, + "loss": 0.20146529674530028, + "step": 156900 + }, + { + "epoch": 0.6736474245039197, + "grad_norm": 5.211982727050781, + "learning_rate": 3.277640281814027e-05, + "loss": 0.11265192031860352, + "step": 156910 + }, + { + "epoch": 0.6736903565939397, + "grad_norm": 0.2216411679983139, + "learning_rate": 3.2772091098022643e-05, + "loss": 0.10636416673660279, + "step": 156920 + }, + { + "epoch": 0.6737332886839598, + "grad_norm": 0.01977216824889183, + "learning_rate": 3.276777937790502e-05, + "loss": 0.11636031866073608, + "step": 156930 + }, + { + "epoch": 0.6737762207739797, + "grad_norm": 0.037923719733953476, + "learning_rate": 3.27634676577874e-05, + "loss": 0.16940144300460816, + "step": 156940 + }, + { + "epoch": 0.6738191528639997, + "grad_norm": 2.47348690032959, + "learning_rate": 3.2759155937669775e-05, + "loss": 0.43205738067626953, + "step": 156950 + }, + { + "epoch": 0.6738620849540198, + "grad_norm": 0.2728065252304077, + "learning_rate": 3.275484421755215e-05, + "loss": 0.23530976772308348, + "step": 156960 + }, + { + "epoch": 0.6739050170440397, + "grad_norm": 0.0499393604695797, + "learning_rate": 3.275053249743453e-05, + "loss": 0.14187155961990355, + "step": 156970 + }, + { + "epoch": 0.6739479491340598, + "grad_norm": 0.022202298045158386, + "learning_rate": 3.274622077731691e-05, + "loss": 0.20078377723693847, + "step": 156980 + }, + { + "epoch": 0.6739908812240798, + "grad_norm": 0.006260285619646311, + "learning_rate": 3.2741909057199285e-05, + "loss": 0.07577277421951294, + "step": 156990 + }, + { + "epoch": 0.6740338133140997, + "grad_norm": 4.734714508056641, + "learning_rate": 3.2737597337081655e-05, + "loss": 0.2892103433609009, + "step": 157000 + }, + { + "epoch": 0.6740338133140997, + "eval_loss": 0.38809940218925476, + "eval_runtime": 27.2561, + "eval_samples_per_second": 3.669, + "eval_steps_per_second": 3.669, + "step": 157000 + }, + { + "epoch": 0.6740767454041198, + "grad_norm": 0.3949276804924011, + "learning_rate": 3.273328561696403e-05, + "loss": 0.11092907190322876, + "step": 157010 + }, + { + "epoch": 0.6741196774941398, + "grad_norm": 0.5621246695518494, + "learning_rate": 3.272897389684641e-05, + "loss": 0.0834064543247223, + "step": 157020 + }, + { + "epoch": 0.6741626095841597, + "grad_norm": 2.7552273273468018, + "learning_rate": 3.272466217672879e-05, + "loss": 0.3163444995880127, + "step": 157030 + }, + { + "epoch": 0.6742055416741798, + "grad_norm": 5.054752349853516, + "learning_rate": 3.272035045661116e-05, + "loss": 0.27749512195587156, + "step": 157040 + }, + { + "epoch": 0.6742484737641998, + "grad_norm": 0.000382843310944736, + "learning_rate": 3.2716038736493535e-05, + "loss": 0.24705393314361573, + "step": 157050 + }, + { + "epoch": 0.6742914058542198, + "grad_norm": 0.047599148005247116, + "learning_rate": 3.271172701637591e-05, + "loss": 0.05157904624938965, + "step": 157060 + }, + { + "epoch": 0.6743343379442398, + "grad_norm": 0.13552670180797577, + "learning_rate": 3.270741529625829e-05, + "loss": 0.215932035446167, + "step": 157070 + }, + { + "epoch": 0.6743772700342598, + "grad_norm": 0.6506958603858948, + "learning_rate": 3.270310357614067e-05, + "loss": 0.22939252853393555, + "step": 157080 + }, + { + "epoch": 0.6744202021242798, + "grad_norm": 0.005644662771373987, + "learning_rate": 3.2698791856023045e-05, + "loss": 0.3055266380310059, + "step": 157090 + }, + { + "epoch": 0.6744631342142998, + "grad_norm": 0.00816242303699255, + "learning_rate": 3.269448013590542e-05, + "loss": 0.2139211416244507, + "step": 157100 + }, + { + "epoch": 0.6745060663043199, + "grad_norm": 0.00204946706071496, + "learning_rate": 3.26901684157878e-05, + "loss": 0.18893979787826537, + "step": 157110 + }, + { + "epoch": 0.6745489983943398, + "grad_norm": 0.060986656695604324, + "learning_rate": 3.268585669567017e-05, + "loss": 0.19850542545318603, + "step": 157120 + }, + { + "epoch": 0.6745919304843598, + "grad_norm": 0.02972446009516716, + "learning_rate": 3.268154497555255e-05, + "loss": 0.06751665472984314, + "step": 157130 + }, + { + "epoch": 0.6746348625743799, + "grad_norm": 2.524022102355957, + "learning_rate": 3.2677233255434925e-05, + "loss": 0.3849080324172974, + "step": 157140 + }, + { + "epoch": 0.6746777946643998, + "grad_norm": 0.002453405410051346, + "learning_rate": 3.26729215353173e-05, + "loss": 0.09670175909996033, + "step": 157150 + }, + { + "epoch": 0.6747207267544199, + "grad_norm": 2.7271265983581543, + "learning_rate": 3.266860981519968e-05, + "loss": 0.07853362560272217, + "step": 157160 + }, + { + "epoch": 0.6747636588444399, + "grad_norm": 2.5940101146698, + "learning_rate": 3.266429809508205e-05, + "loss": 0.5087200164794922, + "step": 157170 + }, + { + "epoch": 0.6748065909344598, + "grad_norm": 4.071169853210449, + "learning_rate": 3.265998637496443e-05, + "loss": 0.1377337694168091, + "step": 157180 + }, + { + "epoch": 0.6748495230244799, + "grad_norm": 0.012241007760167122, + "learning_rate": 3.2655674654846805e-05, + "loss": 0.06032915115356445, + "step": 157190 + }, + { + "epoch": 0.6748924551144999, + "grad_norm": 1.3309521675109863, + "learning_rate": 3.265136293472918e-05, + "loss": 0.23908510208129882, + "step": 157200 + }, + { + "epoch": 0.6749353872045198, + "grad_norm": 0.0017846348928287625, + "learning_rate": 3.264705121461156e-05, + "loss": 0.1398613214492798, + "step": 157210 + }, + { + "epoch": 0.6749783192945399, + "grad_norm": 2.251039981842041, + "learning_rate": 3.264273949449394e-05, + "loss": 0.3664930105209351, + "step": 157220 + }, + { + "epoch": 0.6750212513845599, + "grad_norm": 0.06420256197452545, + "learning_rate": 3.2638427774376314e-05, + "loss": 0.1947470188140869, + "step": 157230 + }, + { + "epoch": 0.6750641834745799, + "grad_norm": 0.30117225646972656, + "learning_rate": 3.263411605425869e-05, + "loss": 0.24840478897094725, + "step": 157240 + }, + { + "epoch": 0.6751071155645999, + "grad_norm": 0.10018105059862137, + "learning_rate": 3.262980433414106e-05, + "loss": 0.08070742487907409, + "step": 157250 + }, + { + "epoch": 0.67515004765462, + "grad_norm": 0.011599382385611534, + "learning_rate": 3.262549261402344e-05, + "loss": 0.09008162021636963, + "step": 157260 + }, + { + "epoch": 0.6751929797446399, + "grad_norm": 0.004888639319688082, + "learning_rate": 3.2621180893905817e-05, + "loss": 0.18968068361282348, + "step": 157270 + }, + { + "epoch": 0.6752359118346599, + "grad_norm": 0.004041858017444611, + "learning_rate": 3.2616869173788194e-05, + "loss": 0.27539896965026855, + "step": 157280 + }, + { + "epoch": 0.67527884392468, + "grad_norm": 1.4639886617660522, + "learning_rate": 3.2612557453670564e-05, + "loss": 0.25031161308288574, + "step": 157290 + }, + { + "epoch": 0.6753217760146999, + "grad_norm": 2.1338157653808594, + "learning_rate": 3.260824573355294e-05, + "loss": 0.15990082025527955, + "step": 157300 + }, + { + "epoch": 0.6753647081047199, + "grad_norm": 10.418062210083008, + "learning_rate": 3.260393401343532e-05, + "loss": 0.08970891237258911, + "step": 157310 + }, + { + "epoch": 0.67540764019474, + "grad_norm": 0.017363833263516426, + "learning_rate": 3.2599622293317696e-05, + "loss": 0.2596541404724121, + "step": 157320 + }, + { + "epoch": 0.67545057228476, + "grad_norm": 0.2788954973220825, + "learning_rate": 3.2595310573200074e-05, + "loss": 0.1514696002006531, + "step": 157330 + }, + { + "epoch": 0.67549350437478, + "grad_norm": 1.667123556137085, + "learning_rate": 3.259099885308245e-05, + "loss": 0.15894722938537598, + "step": 157340 + }, + { + "epoch": 0.6755364364648, + "grad_norm": 0.01323763933032751, + "learning_rate": 3.258668713296483e-05, + "loss": 0.21982104778289796, + "step": 157350 + }, + { + "epoch": 0.67557936855482, + "grad_norm": 0.06124216318130493, + "learning_rate": 3.2582375412847206e-05, + "loss": 0.19194493293762208, + "step": 157360 + }, + { + "epoch": 0.67562230064484, + "grad_norm": 0.007614783942699432, + "learning_rate": 3.2578063692729576e-05, + "loss": 0.39401233196258545, + "step": 157370 + }, + { + "epoch": 0.67566523273486, + "grad_norm": 1.8661552667617798, + "learning_rate": 3.2573751972611954e-05, + "loss": 0.18967965841293336, + "step": 157380 + }, + { + "epoch": 0.67570816482488, + "grad_norm": 0.014832602813839912, + "learning_rate": 3.256944025249433e-05, + "loss": 0.022698092460632324, + "step": 157390 + }, + { + "epoch": 0.6757510969149, + "grad_norm": 3.17002534866333, + "learning_rate": 3.256512853237671e-05, + "loss": 0.19420602321624755, + "step": 157400 + }, + { + "epoch": 0.67579402900492, + "grad_norm": 0.06440910696983337, + "learning_rate": 3.256081681225908e-05, + "loss": 0.2151397943496704, + "step": 157410 + }, + { + "epoch": 0.6758369610949401, + "grad_norm": 0.4137822389602661, + "learning_rate": 3.2556505092141456e-05, + "loss": 0.2239288330078125, + "step": 157420 + }, + { + "epoch": 0.67587989318496, + "grad_norm": 0.1950918734073639, + "learning_rate": 3.2552193372023834e-05, + "loss": 0.15644675493240356, + "step": 157430 + }, + { + "epoch": 0.67592282527498, + "grad_norm": 1.746835470199585, + "learning_rate": 3.254788165190622e-05, + "loss": 0.2628674030303955, + "step": 157440 + }, + { + "epoch": 0.6759657573650001, + "grad_norm": 4.402871608734131, + "learning_rate": 3.254356993178859e-05, + "loss": 0.40744714736938475, + "step": 157450 + }, + { + "epoch": 0.67600868945502, + "grad_norm": 0.15432725846767426, + "learning_rate": 3.2539258211670966e-05, + "loss": 0.11367126703262329, + "step": 157460 + }, + { + "epoch": 0.6760516215450401, + "grad_norm": 0.004775689914822578, + "learning_rate": 3.253494649155334e-05, + "loss": 0.07618749141693115, + "step": 157470 + }, + { + "epoch": 0.6760945536350601, + "grad_norm": 0.0029706796631217003, + "learning_rate": 3.253063477143572e-05, + "loss": 0.3006477355957031, + "step": 157480 + }, + { + "epoch": 0.67613748572508, + "grad_norm": 4.908012866973877, + "learning_rate": 3.252632305131809e-05, + "loss": 0.341683554649353, + "step": 157490 + }, + { + "epoch": 0.6761804178151001, + "grad_norm": 0.03939557820558548, + "learning_rate": 3.252201133120047e-05, + "loss": 0.11358660459518433, + "step": 157500 + }, + { + "epoch": 0.6762233499051201, + "grad_norm": 2.5295541286468506, + "learning_rate": 3.2517699611082846e-05, + "loss": 0.23426496982574463, + "step": 157510 + }, + { + "epoch": 0.67626628199514, + "grad_norm": 0.016799015924334526, + "learning_rate": 3.251338789096522e-05, + "loss": 0.23719797134399415, + "step": 157520 + }, + { + "epoch": 0.6763092140851601, + "grad_norm": 6.704193592071533, + "learning_rate": 3.25090761708476e-05, + "loss": 0.12964043617248536, + "step": 157530 + }, + { + "epoch": 0.6763521461751801, + "grad_norm": 0.000664949300698936, + "learning_rate": 3.250476445072997e-05, + "loss": 0.19647494554519654, + "step": 157540 + }, + { + "epoch": 0.6763950782652001, + "grad_norm": 0.05981343239545822, + "learning_rate": 3.2500452730612355e-05, + "loss": 0.15086113214492797, + "step": 157550 + }, + { + "epoch": 0.6764380103552201, + "grad_norm": 0.008232577703893185, + "learning_rate": 3.249614101049473e-05, + "loss": 0.20063190460205077, + "step": 157560 + }, + { + "epoch": 0.6764809424452402, + "grad_norm": 0.0008475049980916083, + "learning_rate": 3.249182929037711e-05, + "loss": 0.11724573373794556, + "step": 157570 + }, + { + "epoch": 0.6765238745352601, + "grad_norm": 1.6179267168045044, + "learning_rate": 3.248751757025948e-05, + "loss": 0.13589624166488648, + "step": 157580 + }, + { + "epoch": 0.6765668066252801, + "grad_norm": 0.011852581985294819, + "learning_rate": 3.248320585014186e-05, + "loss": 0.2451772928237915, + "step": 157590 + }, + { + "epoch": 0.6766097387153002, + "grad_norm": 0.030088340863585472, + "learning_rate": 3.2478894130024235e-05, + "loss": 0.1790969967842102, + "step": 157600 + }, + { + "epoch": 0.6766526708053201, + "grad_norm": 0.09731682389974594, + "learning_rate": 3.247458240990661e-05, + "loss": 0.23853666782379152, + "step": 157610 + }, + { + "epoch": 0.6766956028953401, + "grad_norm": 0.006340175401419401, + "learning_rate": 3.247027068978898e-05, + "loss": 0.3634488105773926, + "step": 157620 + }, + { + "epoch": 0.6767385349853602, + "grad_norm": 5.072446823120117, + "learning_rate": 3.246595896967136e-05, + "loss": 0.18125200271606445, + "step": 157630 + }, + { + "epoch": 0.6767814670753801, + "grad_norm": 1.7390445470809937, + "learning_rate": 3.246164724955374e-05, + "loss": 0.1761768102645874, + "step": 157640 + }, + { + "epoch": 0.6768243991654002, + "grad_norm": 0.06036270782351494, + "learning_rate": 3.2457335529436115e-05, + "loss": 0.13214666843414308, + "step": 157650 + }, + { + "epoch": 0.6768673312554202, + "grad_norm": 1.2427871227264404, + "learning_rate": 3.245302380931849e-05, + "loss": 0.23174049854278564, + "step": 157660 + }, + { + "epoch": 0.6769102633454401, + "grad_norm": 0.008723437786102295, + "learning_rate": 3.244871208920087e-05, + "loss": 0.16431721448898315, + "step": 157670 + }, + { + "epoch": 0.6769531954354602, + "grad_norm": 0.0026647180784493685, + "learning_rate": 3.244440036908325e-05, + "loss": 0.25285816192626953, + "step": 157680 + }, + { + "epoch": 0.6769961275254802, + "grad_norm": 0.01261743251234293, + "learning_rate": 3.2440088648965624e-05, + "loss": 0.1764390230178833, + "step": 157690 + }, + { + "epoch": 0.6770390596155001, + "grad_norm": 0.04876869544386864, + "learning_rate": 3.2435776928847995e-05, + "loss": 0.18486921787261962, + "step": 157700 + }, + { + "epoch": 0.6770819917055202, + "grad_norm": 0.06611602008342743, + "learning_rate": 3.243146520873037e-05, + "loss": 0.16170352697372437, + "step": 157710 + }, + { + "epoch": 0.6771249237955402, + "grad_norm": 0.20194700360298157, + "learning_rate": 3.242715348861275e-05, + "loss": 0.14869037866592408, + "step": 157720 + }, + { + "epoch": 0.6771678558855602, + "grad_norm": 0.00841581355780363, + "learning_rate": 3.242284176849513e-05, + "loss": 0.1633516311645508, + "step": 157730 + }, + { + "epoch": 0.6772107879755802, + "grad_norm": 0.023939453065395355, + "learning_rate": 3.24185300483775e-05, + "loss": 0.23998985290527344, + "step": 157740 + }, + { + "epoch": 0.6772537200656003, + "grad_norm": 0.06220559775829315, + "learning_rate": 3.2414218328259875e-05, + "loss": 0.12186563014984131, + "step": 157750 + }, + { + "epoch": 0.6772966521556203, + "grad_norm": 0.10329548269510269, + "learning_rate": 3.240990660814225e-05, + "loss": 0.059828144311904904, + "step": 157760 + }, + { + "epoch": 0.6773395842456402, + "grad_norm": 0.0009203555528074503, + "learning_rate": 3.240559488802463e-05, + "loss": 0.10206557512283325, + "step": 157770 + }, + { + "epoch": 0.6773825163356603, + "grad_norm": 0.1713915914297104, + "learning_rate": 3.240128316790701e-05, + "loss": 0.2474388837814331, + "step": 157780 + }, + { + "epoch": 0.6774254484256803, + "grad_norm": 0.009260360151529312, + "learning_rate": 3.2396971447789384e-05, + "loss": 0.2259169340133667, + "step": 157790 + }, + { + "epoch": 0.6774683805157002, + "grad_norm": 0.004235635045915842, + "learning_rate": 3.239265972767176e-05, + "loss": 0.37403192520141604, + "step": 157800 + }, + { + "epoch": 0.6775113126057203, + "grad_norm": 8.215943336486816, + "learning_rate": 3.238834800755414e-05, + "loss": 0.23961963653564453, + "step": 157810 + }, + { + "epoch": 0.6775542446957403, + "grad_norm": 0.08245151489973068, + "learning_rate": 3.238403628743651e-05, + "loss": 0.240964674949646, + "step": 157820 + }, + { + "epoch": 0.6775971767857603, + "grad_norm": 2.935471534729004, + "learning_rate": 3.237972456731889e-05, + "loss": 0.06698665022850037, + "step": 157830 + }, + { + "epoch": 0.6776401088757803, + "grad_norm": 8.233935356140137, + "learning_rate": 3.2375412847201264e-05, + "loss": 0.2231219530105591, + "step": 157840 + }, + { + "epoch": 0.6776830409658003, + "grad_norm": 0.005562909878790379, + "learning_rate": 3.237110112708364e-05, + "loss": 0.11405689716339111, + "step": 157850 + }, + { + "epoch": 0.6777259730558203, + "grad_norm": 0.5877987146377563, + "learning_rate": 3.236678940696601e-05, + "loss": 0.11020293235778808, + "step": 157860 + }, + { + "epoch": 0.6777689051458403, + "grad_norm": 0.000158540831762366, + "learning_rate": 3.236247768684839e-05, + "loss": 0.17420880794525145, + "step": 157870 + }, + { + "epoch": 0.6778118372358604, + "grad_norm": 0.010848082602024078, + "learning_rate": 3.235816596673077e-05, + "loss": 0.20020055770874023, + "step": 157880 + }, + { + "epoch": 0.6778547693258803, + "grad_norm": 0.024501841515302658, + "learning_rate": 3.2353854246613144e-05, + "loss": 0.19236403703689575, + "step": 157890 + }, + { + "epoch": 0.6778977014159003, + "grad_norm": 0.08429605513811111, + "learning_rate": 3.234954252649552e-05, + "loss": 0.07690611481666565, + "step": 157900 + }, + { + "epoch": 0.6779406335059204, + "grad_norm": 0.006016223691403866, + "learning_rate": 3.23452308063779e-05, + "loss": 0.22260491847991942, + "step": 157910 + }, + { + "epoch": 0.6779835655959403, + "grad_norm": 0.17743821442127228, + "learning_rate": 3.2340919086260276e-05, + "loss": 0.09943286776542663, + "step": 157920 + }, + { + "epoch": 0.6780264976859603, + "grad_norm": 0.24222442507743835, + "learning_rate": 3.2336607366142653e-05, + "loss": 0.2702426195144653, + "step": 157930 + }, + { + "epoch": 0.6780694297759804, + "grad_norm": 0.15780700743198395, + "learning_rate": 3.233229564602503e-05, + "loss": 0.07441672682762146, + "step": 157940 + }, + { + "epoch": 0.6781123618660003, + "grad_norm": 1.1437824964523315, + "learning_rate": 3.23279839259074e-05, + "loss": 0.116599702835083, + "step": 157950 + }, + { + "epoch": 0.6781552939560204, + "grad_norm": 0.01496734656393528, + "learning_rate": 3.232367220578978e-05, + "loss": 0.27400927543640136, + "step": 157960 + }, + { + "epoch": 0.6781982260460404, + "grad_norm": 1.0968358516693115, + "learning_rate": 3.2319360485672156e-05, + "loss": 0.36545302867889407, + "step": 157970 + }, + { + "epoch": 0.6782411581360603, + "grad_norm": 2.342289924621582, + "learning_rate": 3.231504876555453e-05, + "loss": 0.2468874931335449, + "step": 157980 + }, + { + "epoch": 0.6782840902260804, + "grad_norm": 0.007133483421057463, + "learning_rate": 3.2310737045436904e-05, + "loss": 0.11157327890396118, + "step": 157990 + }, + { + "epoch": 0.6783270223161004, + "grad_norm": 0.019744692370295525, + "learning_rate": 3.230642532531928e-05, + "loss": 0.19470452070236205, + "step": 158000 + }, + { + "epoch": 0.6783270223161004, + "eval_loss": 0.3861476480960846, + "eval_runtime": 27.2611, + "eval_samples_per_second": 3.668, + "eval_steps_per_second": 3.668, + "step": 158000 + }, + { + "epoch": 0.6783699544061204, + "grad_norm": 0.47449058294296265, + "learning_rate": 3.230211360520166e-05, + "loss": 0.030623114109039305, + "step": 158010 + }, + { + "epoch": 0.6784128864961404, + "grad_norm": 0.3131827116012573, + "learning_rate": 3.2297801885084036e-05, + "loss": 0.22363739013671874, + "step": 158020 + }, + { + "epoch": 0.6784558185861604, + "grad_norm": 0.033570338040590286, + "learning_rate": 3.229349016496641e-05, + "loss": 0.007682143151760102, + "step": 158030 + }, + { + "epoch": 0.6784987506761804, + "grad_norm": 3.0705337524414062, + "learning_rate": 3.228917844484879e-05, + "loss": 0.3733278751373291, + "step": 158040 + }, + { + "epoch": 0.6785416827662004, + "grad_norm": 1.8934067487716675, + "learning_rate": 3.228486672473117e-05, + "loss": 0.26838254928588867, + "step": 158050 + }, + { + "epoch": 0.6785846148562205, + "grad_norm": 0.0044560618698596954, + "learning_rate": 3.2280555004613545e-05, + "loss": 0.08115120530128479, + "step": 158060 + }, + { + "epoch": 0.6786275469462404, + "grad_norm": 0.0790332779288292, + "learning_rate": 3.2276243284495916e-05, + "loss": 0.18160150051116944, + "step": 158070 + }, + { + "epoch": 0.6786704790362604, + "grad_norm": 0.0017745341174304485, + "learning_rate": 3.227193156437829e-05, + "loss": 0.29369025230407714, + "step": 158080 + }, + { + "epoch": 0.6787134111262805, + "grad_norm": 1.6305290460586548, + "learning_rate": 3.226761984426067e-05, + "loss": 0.14866819381713867, + "step": 158090 + }, + { + "epoch": 0.6787563432163004, + "grad_norm": 10.965187072753906, + "learning_rate": 3.226330812414305e-05, + "loss": 0.23119993209838868, + "step": 158100 + }, + { + "epoch": 0.6787992753063204, + "grad_norm": 0.0011797469342127442, + "learning_rate": 3.225899640402542e-05, + "loss": 0.3503007411956787, + "step": 158110 + }, + { + "epoch": 0.6788422073963405, + "grad_norm": 3.7308449745178223, + "learning_rate": 3.2254684683907796e-05, + "loss": 0.23552021980285645, + "step": 158120 + }, + { + "epoch": 0.6788851394863604, + "grad_norm": 0.07517192512750626, + "learning_rate": 3.225037296379017e-05, + "loss": 0.37005517482757566, + "step": 158130 + }, + { + "epoch": 0.6789280715763805, + "grad_norm": 1.7629700899124146, + "learning_rate": 3.224606124367256e-05, + "loss": 0.4034713268280029, + "step": 158140 + }, + { + "epoch": 0.6789710036664005, + "grad_norm": 0.6647354364395142, + "learning_rate": 3.224174952355493e-05, + "loss": 0.2033230781555176, + "step": 158150 + }, + { + "epoch": 0.6790139357564204, + "grad_norm": 0.19703158736228943, + "learning_rate": 3.2237437803437305e-05, + "loss": 0.1884385108947754, + "step": 158160 + }, + { + "epoch": 0.6790568678464405, + "grad_norm": 2.4684970378875732, + "learning_rate": 3.223312608331968e-05, + "loss": 0.2863666772842407, + "step": 158170 + }, + { + "epoch": 0.6790997999364605, + "grad_norm": 0.17179302871227264, + "learning_rate": 3.222881436320206e-05, + "loss": 0.1833629846572876, + "step": 158180 + }, + { + "epoch": 0.6791427320264806, + "grad_norm": 0.18861034512519836, + "learning_rate": 3.222450264308443e-05, + "loss": 0.36298208236694335, + "step": 158190 + }, + { + "epoch": 0.6791856641165005, + "grad_norm": 0.037184134125709534, + "learning_rate": 3.222019092296681e-05, + "loss": 0.17432732582092286, + "step": 158200 + }, + { + "epoch": 0.6792285962065205, + "grad_norm": 1.2532052993774414, + "learning_rate": 3.2215879202849185e-05, + "loss": 0.07648919224739074, + "step": 158210 + }, + { + "epoch": 0.6792715282965406, + "grad_norm": 0.02248220331966877, + "learning_rate": 3.221156748273156e-05, + "loss": 0.28692572116851806, + "step": 158220 + }, + { + "epoch": 0.6793144603865605, + "grad_norm": 1.3784197568893433, + "learning_rate": 3.220725576261394e-05, + "loss": 0.037394005060195926, + "step": 158230 + }, + { + "epoch": 0.6793573924765806, + "grad_norm": 0.009938391856849194, + "learning_rate": 3.220294404249631e-05, + "loss": 0.18498718738555908, + "step": 158240 + }, + { + "epoch": 0.6794003245666006, + "grad_norm": 0.050672680139541626, + "learning_rate": 3.2198632322378695e-05, + "loss": 0.2385341167449951, + "step": 158250 + }, + { + "epoch": 0.6794432566566205, + "grad_norm": 2.427584409713745, + "learning_rate": 3.219432060226107e-05, + "loss": 0.3927887439727783, + "step": 158260 + }, + { + "epoch": 0.6794861887466406, + "grad_norm": 0.007242992520332336, + "learning_rate": 3.219000888214345e-05, + "loss": 0.18903251886367797, + "step": 158270 + }, + { + "epoch": 0.6795291208366606, + "grad_norm": 2.674262046813965, + "learning_rate": 3.218569716202582e-05, + "loss": 0.3244601249694824, + "step": 158280 + }, + { + "epoch": 0.6795720529266805, + "grad_norm": 1.95919930934906, + "learning_rate": 3.21813854419082e-05, + "loss": 0.27033874988555906, + "step": 158290 + }, + { + "epoch": 0.6796149850167006, + "grad_norm": 0.0013277186080813408, + "learning_rate": 3.2177073721790574e-05, + "loss": 0.18444887399673462, + "step": 158300 + }, + { + "epoch": 0.6796579171067206, + "grad_norm": 0.05387435480952263, + "learning_rate": 3.217276200167295e-05, + "loss": 0.18853824138641356, + "step": 158310 + }, + { + "epoch": 0.6797008491967406, + "grad_norm": 0.7749758958816528, + "learning_rate": 3.216845028155532e-05, + "loss": 0.18170182704925536, + "step": 158320 + }, + { + "epoch": 0.6797437812867606, + "grad_norm": 5.2914791012881324e-05, + "learning_rate": 3.21641385614377e-05, + "loss": 0.17878755331039428, + "step": 158330 + }, + { + "epoch": 0.6797867133767806, + "grad_norm": 2.84869384765625, + "learning_rate": 3.215982684132008e-05, + "loss": 0.23383090496063233, + "step": 158340 + }, + { + "epoch": 0.6798296454668006, + "grad_norm": 3.1321818828582764, + "learning_rate": 3.2155515121202454e-05, + "loss": 0.07024050951004028, + "step": 158350 + }, + { + "epoch": 0.6798725775568206, + "grad_norm": 8.468925476074219, + "learning_rate": 3.215120340108483e-05, + "loss": 0.5075342655181885, + "step": 158360 + }, + { + "epoch": 0.6799155096468407, + "grad_norm": 5.578030109405518, + "learning_rate": 3.214689168096721e-05, + "loss": 0.4054515838623047, + "step": 158370 + }, + { + "epoch": 0.6799584417368606, + "grad_norm": 0.7275596857070923, + "learning_rate": 3.2142579960849586e-05, + "loss": 0.238731050491333, + "step": 158380 + }, + { + "epoch": 0.6800013738268806, + "grad_norm": 0.0905885100364685, + "learning_rate": 3.2138268240731964e-05, + "loss": 0.16746869087219238, + "step": 158390 + }, + { + "epoch": 0.6800443059169007, + "grad_norm": 1.944628357887268, + "learning_rate": 3.2133956520614334e-05, + "loss": 0.18926804065704345, + "step": 158400 + }, + { + "epoch": 0.6800872380069206, + "grad_norm": 5.562248229980469, + "learning_rate": 3.212964480049671e-05, + "loss": 0.4556564807891846, + "step": 158410 + }, + { + "epoch": 0.6801301700969407, + "grad_norm": 0.008871596306562424, + "learning_rate": 3.212533308037909e-05, + "loss": 0.19651058912277222, + "step": 158420 + }, + { + "epoch": 0.6801731021869607, + "grad_norm": 0.017508648335933685, + "learning_rate": 3.2121021360261466e-05, + "loss": 0.1843596339225769, + "step": 158430 + }, + { + "epoch": 0.6802160342769806, + "grad_norm": 0.009043751284480095, + "learning_rate": 3.211670964014384e-05, + "loss": 0.06310344934463501, + "step": 158440 + }, + { + "epoch": 0.6802589663670007, + "grad_norm": 0.10740286856889725, + "learning_rate": 3.2112397920026214e-05, + "loss": 0.2981158018112183, + "step": 158450 + }, + { + "epoch": 0.6803018984570207, + "grad_norm": 0.061360158026218414, + "learning_rate": 3.210808619990859e-05, + "loss": 0.07388191819190978, + "step": 158460 + }, + { + "epoch": 0.6803448305470406, + "grad_norm": 0.1522047519683838, + "learning_rate": 3.210377447979097e-05, + "loss": 0.3216759204864502, + "step": 158470 + }, + { + "epoch": 0.6803877626370607, + "grad_norm": 0.003306443803012371, + "learning_rate": 3.2099462759673346e-05, + "loss": 0.04231602251529694, + "step": 158480 + }, + { + "epoch": 0.6804306947270807, + "grad_norm": 0.002171689411625266, + "learning_rate": 3.2095151039555724e-05, + "loss": 0.08375156521797181, + "step": 158490 + }, + { + "epoch": 0.6804736268171007, + "grad_norm": 1.1639153957366943, + "learning_rate": 3.20908393194381e-05, + "loss": 0.26513402462005614, + "step": 158500 + }, + { + "epoch": 0.6805165589071207, + "grad_norm": 0.003427459392696619, + "learning_rate": 3.208652759932048e-05, + "loss": 0.18078973293304443, + "step": 158510 + }, + { + "epoch": 0.6805594909971407, + "grad_norm": 1.5218772888183594, + "learning_rate": 3.208221587920285e-05, + "loss": 0.22859303951263427, + "step": 158520 + }, + { + "epoch": 0.6806024230871607, + "grad_norm": 0.046713389456272125, + "learning_rate": 3.2077904159085226e-05, + "loss": 0.28482129573822024, + "step": 158530 + }, + { + "epoch": 0.6806453551771807, + "grad_norm": 0.009020074270665646, + "learning_rate": 3.2073592438967604e-05, + "loss": 0.16700432300567628, + "step": 158540 + }, + { + "epoch": 0.6806882872672008, + "grad_norm": 0.12690341472625732, + "learning_rate": 3.206928071884998e-05, + "loss": 0.17910512685775756, + "step": 158550 + }, + { + "epoch": 0.6807312193572207, + "grad_norm": 0.9299690127372742, + "learning_rate": 3.206496899873235e-05, + "loss": 0.1592942714691162, + "step": 158560 + }, + { + "epoch": 0.6807741514472407, + "grad_norm": 1.0478684902191162, + "learning_rate": 3.206065727861473e-05, + "loss": 0.17841386795043945, + "step": 158570 + }, + { + "epoch": 0.6808170835372608, + "grad_norm": 0.0024922348093241453, + "learning_rate": 3.2056345558497106e-05, + "loss": 0.4945104122161865, + "step": 158580 + }, + { + "epoch": 0.6808600156272807, + "grad_norm": 9.214879035949707, + "learning_rate": 3.2052033838379484e-05, + "loss": 0.20899596214294433, + "step": 158590 + }, + { + "epoch": 0.6809029477173008, + "grad_norm": 4.420780181884766, + "learning_rate": 3.204772211826186e-05, + "loss": 0.3694831371307373, + "step": 158600 + }, + { + "epoch": 0.6809458798073208, + "grad_norm": 0.012590976431965828, + "learning_rate": 3.204341039814424e-05, + "loss": 0.18441904783248902, + "step": 158610 + }, + { + "epoch": 0.6809888118973408, + "grad_norm": 0.0019417435396462679, + "learning_rate": 3.2039098678026616e-05, + "loss": 0.2644726514816284, + "step": 158620 + }, + { + "epoch": 0.6810317439873608, + "grad_norm": 0.4429931938648224, + "learning_rate": 3.203478695790899e-05, + "loss": 0.34392242431640624, + "step": 158630 + }, + { + "epoch": 0.6810746760773808, + "grad_norm": 1.5727390050888062, + "learning_rate": 3.203047523779137e-05, + "loss": 0.23203063011169434, + "step": 158640 + }, + { + "epoch": 0.6811176081674009, + "grad_norm": 5.987852573394775, + "learning_rate": 3.202616351767374e-05, + "loss": 0.08132092952728272, + "step": 158650 + }, + { + "epoch": 0.6811605402574208, + "grad_norm": 1.5135953426361084, + "learning_rate": 3.202185179755612e-05, + "loss": 0.22478418350219725, + "step": 158660 + }, + { + "epoch": 0.6812034723474408, + "grad_norm": 7.780333518981934, + "learning_rate": 3.2017540077438495e-05, + "loss": 0.3361856937408447, + "step": 158670 + }, + { + "epoch": 0.6812464044374609, + "grad_norm": 0.0940074622631073, + "learning_rate": 3.201322835732087e-05, + "loss": 0.16245073080062866, + "step": 158680 + }, + { + "epoch": 0.6812893365274808, + "grad_norm": 4.361866474151611, + "learning_rate": 3.2008916637203243e-05, + "loss": 0.29499890804290774, + "step": 158690 + }, + { + "epoch": 0.6813322686175008, + "grad_norm": 0.7107620239257812, + "learning_rate": 3.200460491708562e-05, + "loss": 0.16014331579208374, + "step": 158700 + }, + { + "epoch": 0.6813752007075209, + "grad_norm": 0.16708636283874512, + "learning_rate": 3.2000293196968e-05, + "loss": 0.07247481346130372, + "step": 158710 + }, + { + "epoch": 0.6814181327975408, + "grad_norm": 0.9394482970237732, + "learning_rate": 3.1995981476850375e-05, + "loss": 0.06600644588470458, + "step": 158720 + }, + { + "epoch": 0.6814610648875609, + "grad_norm": 0.15201954543590546, + "learning_rate": 3.199166975673275e-05, + "loss": 0.0971751093864441, + "step": 158730 + }, + { + "epoch": 0.6815039969775809, + "grad_norm": 0.19101858139038086, + "learning_rate": 3.198735803661513e-05, + "loss": 0.056906777620315555, + "step": 158740 + }, + { + "epoch": 0.6815469290676008, + "grad_norm": 30.30670738220215, + "learning_rate": 3.198304631649751e-05, + "loss": 0.17296806573867798, + "step": 158750 + }, + { + "epoch": 0.6815898611576209, + "grad_norm": 0.0005419242079369724, + "learning_rate": 3.1978734596379885e-05, + "loss": 0.0724807322025299, + "step": 158760 + }, + { + "epoch": 0.6816327932476409, + "grad_norm": 0.8574475049972534, + "learning_rate": 3.1974422876262255e-05, + "loss": 0.12815709114074708, + "step": 158770 + }, + { + "epoch": 0.6816757253376609, + "grad_norm": 0.031873419880867004, + "learning_rate": 3.197011115614463e-05, + "loss": 0.18840456008911133, + "step": 158780 + }, + { + "epoch": 0.6817186574276809, + "grad_norm": 0.0829717144370079, + "learning_rate": 3.196579943602701e-05, + "loss": 0.064266437292099, + "step": 158790 + }, + { + "epoch": 0.6817615895177009, + "grad_norm": 0.0006163608632050455, + "learning_rate": 3.196148771590939e-05, + "loss": 0.10775055885314941, + "step": 158800 + }, + { + "epoch": 0.6818045216077209, + "grad_norm": 0.6611578464508057, + "learning_rate": 3.195717599579176e-05, + "loss": 0.051128280162811277, + "step": 158810 + }, + { + "epoch": 0.6818474536977409, + "grad_norm": 2.4335217475891113, + "learning_rate": 3.1952864275674135e-05, + "loss": 0.262776780128479, + "step": 158820 + }, + { + "epoch": 0.681890385787761, + "grad_norm": 0.041229743510484695, + "learning_rate": 3.194855255555651e-05, + "loss": 0.09505161046981811, + "step": 158830 + }, + { + "epoch": 0.6819333178777809, + "grad_norm": 0.007038755342364311, + "learning_rate": 3.194424083543889e-05, + "loss": 0.20236282348632811, + "step": 158840 + }, + { + "epoch": 0.6819762499678009, + "grad_norm": 0.00763944536447525, + "learning_rate": 3.193992911532127e-05, + "loss": 0.23800394535064698, + "step": 158850 + }, + { + "epoch": 0.682019182057821, + "grad_norm": 0.0066480184905231, + "learning_rate": 3.1935617395203645e-05, + "loss": 0.1119657278060913, + "step": 158860 + }, + { + "epoch": 0.6820621141478409, + "grad_norm": 0.0007088402635417879, + "learning_rate": 3.193130567508602e-05, + "loss": 0.2503938913345337, + "step": 158870 + }, + { + "epoch": 0.682105046237861, + "grad_norm": 0.12080457806587219, + "learning_rate": 3.19269939549684e-05, + "loss": 0.1805747151374817, + "step": 158880 + }, + { + "epoch": 0.682147978327881, + "grad_norm": 0.06467478722333908, + "learning_rate": 3.192268223485077e-05, + "loss": 0.13013215065002443, + "step": 158890 + }, + { + "epoch": 0.6821909104179009, + "grad_norm": 1.5886485576629639, + "learning_rate": 3.191837051473315e-05, + "loss": 0.3788639783859253, + "step": 158900 + }, + { + "epoch": 0.682233842507921, + "grad_norm": 0.10984954237937927, + "learning_rate": 3.1914058794615525e-05, + "loss": 0.14202741384506226, + "step": 158910 + }, + { + "epoch": 0.682276774597941, + "grad_norm": 6.258497714996338, + "learning_rate": 3.19097470744979e-05, + "loss": 0.14315991401672362, + "step": 158920 + }, + { + "epoch": 0.6823197066879609, + "grad_norm": 0.2764085829257965, + "learning_rate": 3.190543535438027e-05, + "loss": 0.10486044883728027, + "step": 158930 + }, + { + "epoch": 0.682362638777981, + "grad_norm": 1.9766342639923096, + "learning_rate": 3.190112363426265e-05, + "loss": 0.19219329357147216, + "step": 158940 + }, + { + "epoch": 0.682405570868001, + "grad_norm": 0.08026780188083649, + "learning_rate": 3.189681191414503e-05, + "loss": 0.2659833192825317, + "step": 158950 + }, + { + "epoch": 0.682448502958021, + "grad_norm": 0.606878936290741, + "learning_rate": 3.189250019402741e-05, + "loss": 0.2334219694137573, + "step": 158960 + }, + { + "epoch": 0.682491435048041, + "grad_norm": 0.7083647847175598, + "learning_rate": 3.188818847390979e-05, + "loss": 0.1931910514831543, + "step": 158970 + }, + { + "epoch": 0.682534367138061, + "grad_norm": 2.9533469676971436, + "learning_rate": 3.188387675379216e-05, + "loss": 0.3448446035385132, + "step": 158980 + }, + { + "epoch": 0.682577299228081, + "grad_norm": 2.4527599811553955, + "learning_rate": 3.1879565033674537e-05, + "loss": 0.09024946093559265, + "step": 158990 + }, + { + "epoch": 0.682620231318101, + "grad_norm": 1.309434413909912, + "learning_rate": 3.1875253313556914e-05, + "loss": 0.418546724319458, + "step": 159000 + }, + { + "epoch": 0.682620231318101, + "eval_loss": 0.39745739102363586, + "eval_runtime": 27.161, + "eval_samples_per_second": 3.682, + "eval_steps_per_second": 3.682, + "step": 159000 + }, + { + "epoch": 0.682663163408121, + "grad_norm": 0.2863774597644806, + "learning_rate": 3.187094159343929e-05, + "loss": 0.1911346435546875, + "step": 159010 + }, + { + "epoch": 0.682706095498141, + "grad_norm": 0.32828912138938904, + "learning_rate": 3.186662987332166e-05, + "loss": 0.10439178943634034, + "step": 159020 + }, + { + "epoch": 0.682749027588161, + "grad_norm": 0.005384594667702913, + "learning_rate": 3.186231815320404e-05, + "loss": 0.08147462010383606, + "step": 159030 + }, + { + "epoch": 0.6827919596781811, + "grad_norm": 4.924872875213623, + "learning_rate": 3.1858006433086417e-05, + "loss": 0.21583118438720703, + "step": 159040 + }, + { + "epoch": 0.6828348917682011, + "grad_norm": 1.8141136169433594, + "learning_rate": 3.1853694712968794e-05, + "loss": 0.25055086612701416, + "step": 159050 + }, + { + "epoch": 0.682877823858221, + "grad_norm": 0.012755995616316795, + "learning_rate": 3.1849382992851164e-05, + "loss": 0.2830207109451294, + "step": 159060 + }, + { + "epoch": 0.6829207559482411, + "grad_norm": 0.0550185889005661, + "learning_rate": 3.184507127273355e-05, + "loss": 0.36090056896209716, + "step": 159070 + }, + { + "epoch": 0.6829636880382611, + "grad_norm": 0.001908186706714332, + "learning_rate": 3.1840759552615926e-05, + "loss": 0.2745044708251953, + "step": 159080 + }, + { + "epoch": 0.6830066201282811, + "grad_norm": 0.018973547965288162, + "learning_rate": 3.18364478324983e-05, + "loss": 0.09228085875511169, + "step": 159090 + }, + { + "epoch": 0.6830495522183011, + "grad_norm": 2.7169268131256104, + "learning_rate": 3.1832136112380674e-05, + "loss": 0.15420807600021363, + "step": 159100 + }, + { + "epoch": 0.6830924843083211, + "grad_norm": 0.002240829635411501, + "learning_rate": 3.182782439226305e-05, + "loss": 0.246453857421875, + "step": 159110 + }, + { + "epoch": 0.6831354163983411, + "grad_norm": 1.67829167842865, + "learning_rate": 3.182351267214543e-05, + "loss": 0.19712669849395753, + "step": 159120 + }, + { + "epoch": 0.6831783484883611, + "grad_norm": 1.507208228111267, + "learning_rate": 3.1819200952027806e-05, + "loss": 0.12660225629806518, + "step": 159130 + }, + { + "epoch": 0.6832212805783812, + "grad_norm": 0.0003549654793459922, + "learning_rate": 3.1814889231910176e-05, + "loss": 0.24141812324523926, + "step": 159140 + }, + { + "epoch": 0.6832642126684011, + "grad_norm": 1.0748794078826904, + "learning_rate": 3.1810577511792554e-05, + "loss": 0.20776724815368652, + "step": 159150 + }, + { + "epoch": 0.6833071447584211, + "grad_norm": 2.62322735786438, + "learning_rate": 3.180626579167493e-05, + "loss": 0.14376430511474608, + "step": 159160 + }, + { + "epoch": 0.6833500768484412, + "grad_norm": 1.435289978981018, + "learning_rate": 3.180195407155731e-05, + "loss": 0.2463693141937256, + "step": 159170 + }, + { + "epoch": 0.6833930089384611, + "grad_norm": 1.2902711629867554, + "learning_rate": 3.1797642351439686e-05, + "loss": 0.3314396858215332, + "step": 159180 + }, + { + "epoch": 0.6834359410284812, + "grad_norm": 0.006626356393098831, + "learning_rate": 3.179333063132206e-05, + "loss": 0.07337073683738708, + "step": 159190 + }, + { + "epoch": 0.6834788731185012, + "grad_norm": 0.0004718776617664844, + "learning_rate": 3.178901891120444e-05, + "loss": 0.21116044521331787, + "step": 159200 + }, + { + "epoch": 0.6835218052085211, + "grad_norm": 1.6235929727554321, + "learning_rate": 3.178470719108682e-05, + "loss": 0.23272418975830078, + "step": 159210 + }, + { + "epoch": 0.6835647372985412, + "grad_norm": 1.764551043510437, + "learning_rate": 3.178039547096919e-05, + "loss": 0.1310117721557617, + "step": 159220 + }, + { + "epoch": 0.6836076693885612, + "grad_norm": 0.8673880696296692, + "learning_rate": 3.1776083750851566e-05, + "loss": 0.33874988555908203, + "step": 159230 + }, + { + "epoch": 0.6836506014785811, + "grad_norm": 0.001663855859078467, + "learning_rate": 3.177177203073394e-05, + "loss": 0.1292450785636902, + "step": 159240 + }, + { + "epoch": 0.6836935335686012, + "grad_norm": 0.2805323302745819, + "learning_rate": 3.176746031061632e-05, + "loss": 0.2075819492340088, + "step": 159250 + }, + { + "epoch": 0.6837364656586212, + "grad_norm": 0.002389042405411601, + "learning_rate": 3.176314859049869e-05, + "loss": 0.07619832158088684, + "step": 159260 + }, + { + "epoch": 0.6837793977486412, + "grad_norm": 0.23299634456634521, + "learning_rate": 3.175883687038107e-05, + "loss": 0.1750471830368042, + "step": 159270 + }, + { + "epoch": 0.6838223298386612, + "grad_norm": 0.017031220719218254, + "learning_rate": 3.1754525150263446e-05, + "loss": 0.13732279539108277, + "step": 159280 + }, + { + "epoch": 0.6838652619286812, + "grad_norm": 0.0018903700402006507, + "learning_rate": 3.175021343014582e-05, + "loss": 0.2203477144241333, + "step": 159290 + }, + { + "epoch": 0.6839081940187012, + "grad_norm": 0.008383017033338547, + "learning_rate": 3.17459017100282e-05, + "loss": 0.1219887375831604, + "step": 159300 + }, + { + "epoch": 0.6839511261087212, + "grad_norm": 1.2795454263687134, + "learning_rate": 3.174158998991058e-05, + "loss": 0.2261826753616333, + "step": 159310 + }, + { + "epoch": 0.6839940581987413, + "grad_norm": 1.2634918689727783, + "learning_rate": 3.1737278269792955e-05, + "loss": 0.30649328231811523, + "step": 159320 + }, + { + "epoch": 0.6840369902887612, + "grad_norm": 0.940351128578186, + "learning_rate": 3.173296654967533e-05, + "loss": 0.24685354232788087, + "step": 159330 + }, + { + "epoch": 0.6840799223787812, + "grad_norm": 0.019773170351982117, + "learning_rate": 3.172865482955771e-05, + "loss": 0.08614250421524047, + "step": 159340 + }, + { + "epoch": 0.6841228544688013, + "grad_norm": 1.0007884502410889, + "learning_rate": 3.172434310944008e-05, + "loss": 0.41218295097351076, + "step": 159350 + }, + { + "epoch": 0.6841657865588212, + "grad_norm": 0.0006132782436907291, + "learning_rate": 3.172003138932246e-05, + "loss": 0.18524746894836425, + "step": 159360 + }, + { + "epoch": 0.6842087186488413, + "grad_norm": 0.13383154571056366, + "learning_rate": 3.1715719669204835e-05, + "loss": 0.061647999286651614, + "step": 159370 + }, + { + "epoch": 0.6842516507388613, + "grad_norm": 0.016351748257875443, + "learning_rate": 3.171140794908721e-05, + "loss": 0.045337098836898806, + "step": 159380 + }, + { + "epoch": 0.6842945828288812, + "grad_norm": 1.11026132106781, + "learning_rate": 3.170709622896958e-05, + "loss": 0.09521642923355103, + "step": 159390 + }, + { + "epoch": 0.6843375149189013, + "grad_norm": 7.038349628448486, + "learning_rate": 3.170278450885196e-05, + "loss": 0.27219247817993164, + "step": 159400 + }, + { + "epoch": 0.6843804470089213, + "grad_norm": 1.584343671798706, + "learning_rate": 3.169847278873434e-05, + "loss": 0.31961917877197266, + "step": 159410 + }, + { + "epoch": 0.6844233790989412, + "grad_norm": 1.0111151933670044, + "learning_rate": 3.1694161068616715e-05, + "loss": 0.16205734014511108, + "step": 159420 + }, + { + "epoch": 0.6844663111889613, + "grad_norm": 0.01199696958065033, + "learning_rate": 3.168984934849909e-05, + "loss": 0.2615689277648926, + "step": 159430 + }, + { + "epoch": 0.6845092432789813, + "grad_norm": 0.7997129559516907, + "learning_rate": 3.168553762838147e-05, + "loss": 0.07657995820045471, + "step": 159440 + }, + { + "epoch": 0.6845521753690013, + "grad_norm": 0.00100781733635813, + "learning_rate": 3.168122590826385e-05, + "loss": 0.10128631591796874, + "step": 159450 + }, + { + "epoch": 0.6845951074590213, + "grad_norm": 0.001099374727346003, + "learning_rate": 3.1676914188146224e-05, + "loss": 0.15639076232910157, + "step": 159460 + }, + { + "epoch": 0.6846380395490413, + "grad_norm": 0.0031978210899978876, + "learning_rate": 3.1672602468028595e-05, + "loss": 0.02175009250640869, + "step": 159470 + }, + { + "epoch": 0.6846809716390614, + "grad_norm": 0.04342164471745491, + "learning_rate": 3.166829074791097e-05, + "loss": 0.38820016384124756, + "step": 159480 + }, + { + "epoch": 0.6847239037290813, + "grad_norm": 0.8257866501808167, + "learning_rate": 3.166397902779335e-05, + "loss": 0.17148340940475465, + "step": 159490 + }, + { + "epoch": 0.6847668358191014, + "grad_norm": 0.8161950707435608, + "learning_rate": 3.165966730767573e-05, + "loss": 0.19095487594604493, + "step": 159500 + }, + { + "epoch": 0.6848097679091214, + "grad_norm": 0.9420151114463806, + "learning_rate": 3.16553555875581e-05, + "loss": 0.3118323802947998, + "step": 159510 + }, + { + "epoch": 0.6848526999991413, + "grad_norm": 40.907676696777344, + "learning_rate": 3.1651043867440475e-05, + "loss": 0.22367384433746337, + "step": 159520 + }, + { + "epoch": 0.6848956320891614, + "grad_norm": 0.04517137259244919, + "learning_rate": 3.164673214732285e-05, + "loss": 0.14464277029037476, + "step": 159530 + }, + { + "epoch": 0.6849385641791814, + "grad_norm": 1.6570888757705688, + "learning_rate": 3.164242042720523e-05, + "loss": 0.18562830686569215, + "step": 159540 + }, + { + "epoch": 0.6849814962692014, + "grad_norm": 2.1816341876983643, + "learning_rate": 3.163810870708761e-05, + "loss": 0.3160805940628052, + "step": 159550 + }, + { + "epoch": 0.6850244283592214, + "grad_norm": 0.47744184732437134, + "learning_rate": 3.1633796986969984e-05, + "loss": 0.13849132061004638, + "step": 159560 + }, + { + "epoch": 0.6850673604492414, + "grad_norm": 0.006263840477913618, + "learning_rate": 3.162948526685236e-05, + "loss": 0.33607680797576905, + "step": 159570 + }, + { + "epoch": 0.6851102925392614, + "grad_norm": 2.6430816650390625, + "learning_rate": 3.162517354673474e-05, + "loss": 0.48966164588928224, + "step": 159580 + }, + { + "epoch": 0.6851532246292814, + "grad_norm": 0.0013148311991244555, + "learning_rate": 3.162086182661711e-05, + "loss": 0.07859854102134704, + "step": 159590 + }, + { + "epoch": 0.6851961567193015, + "grad_norm": 0.1708020567893982, + "learning_rate": 3.161655010649949e-05, + "loss": 0.28932130336761475, + "step": 159600 + }, + { + "epoch": 0.6852390888093214, + "grad_norm": 0.0995602011680603, + "learning_rate": 3.1612238386381864e-05, + "loss": 0.04597091376781463, + "step": 159610 + }, + { + "epoch": 0.6852820208993414, + "grad_norm": 1.5684750080108643, + "learning_rate": 3.160792666626424e-05, + "loss": 0.28462212085723876, + "step": 159620 + }, + { + "epoch": 0.6853249529893615, + "grad_norm": 0.0007086934638209641, + "learning_rate": 3.160361494614661e-05, + "loss": 0.13424597978591918, + "step": 159630 + }, + { + "epoch": 0.6853678850793814, + "grad_norm": 2.4835119247436523, + "learning_rate": 3.159930322602899e-05, + "loss": 0.16594674587249755, + "step": 159640 + }, + { + "epoch": 0.6854108171694014, + "grad_norm": 1.9306057691574097, + "learning_rate": 3.159499150591137e-05, + "loss": 0.30010387897491453, + "step": 159650 + }, + { + "epoch": 0.6854537492594215, + "grad_norm": 0.3395925462245941, + "learning_rate": 3.159067978579375e-05, + "loss": 0.2560022592544556, + "step": 159660 + }, + { + "epoch": 0.6854966813494414, + "grad_norm": 0.00048148524365387857, + "learning_rate": 3.158636806567612e-05, + "loss": 0.073485267162323, + "step": 159670 + }, + { + "epoch": 0.6855396134394615, + "grad_norm": 1.5306675434112549, + "learning_rate": 3.15820563455585e-05, + "loss": 0.1691593885421753, + "step": 159680 + }, + { + "epoch": 0.6855825455294815, + "grad_norm": 7.120598793029785, + "learning_rate": 3.1577744625440876e-05, + "loss": 0.24635562896728516, + "step": 159690 + }, + { + "epoch": 0.6856254776195014, + "grad_norm": 0.6594388484954834, + "learning_rate": 3.1573432905323253e-05, + "loss": 0.09426384568214416, + "step": 159700 + }, + { + "epoch": 0.6856684097095215, + "grad_norm": 0.019663846120238304, + "learning_rate": 3.156912118520563e-05, + "loss": 0.0715424358844757, + "step": 159710 + }, + { + "epoch": 0.6857113417995415, + "grad_norm": 5.1663432121276855, + "learning_rate": 3.1564809465088e-05, + "loss": 0.39462015628814695, + "step": 159720 + }, + { + "epoch": 0.6857542738895614, + "grad_norm": 0.09658759087324142, + "learning_rate": 3.156049774497038e-05, + "loss": 0.16555283069610596, + "step": 159730 + }, + { + "epoch": 0.6857972059795815, + "grad_norm": 0.000354397197952494, + "learning_rate": 3.1556186024852756e-05, + "loss": 0.21570301055908203, + "step": 159740 + }, + { + "epoch": 0.6858401380696015, + "grad_norm": 8.785991668701172, + "learning_rate": 3.155187430473513e-05, + "loss": 0.27770447731018066, + "step": 159750 + }, + { + "epoch": 0.6858830701596215, + "grad_norm": 0.06055491045117378, + "learning_rate": 3.1547562584617504e-05, + "loss": 0.0837552785873413, + "step": 159760 + }, + { + "epoch": 0.6859260022496415, + "grad_norm": 0.2875801920890808, + "learning_rate": 3.154325086449989e-05, + "loss": 0.19612315893173218, + "step": 159770 + }, + { + "epoch": 0.6859689343396616, + "grad_norm": 0.024394122883677483, + "learning_rate": 3.1538939144382265e-05, + "loss": 0.23119087219238282, + "step": 159780 + }, + { + "epoch": 0.6860118664296815, + "grad_norm": 0.0021245151292532682, + "learning_rate": 3.153462742426464e-05, + "loss": 0.1430792450904846, + "step": 159790 + }, + { + "epoch": 0.6860547985197015, + "grad_norm": 0.04334768280386925, + "learning_rate": 3.153031570414701e-05, + "loss": 0.1303979992866516, + "step": 159800 + }, + { + "epoch": 0.6860977306097216, + "grad_norm": 0.020839013159275055, + "learning_rate": 3.152600398402939e-05, + "loss": 0.07669672966003419, + "step": 159810 + }, + { + "epoch": 0.6861406626997415, + "grad_norm": 1.1684037446975708, + "learning_rate": 3.152169226391177e-05, + "loss": 0.17646958827972412, + "step": 159820 + }, + { + "epoch": 0.6861835947897615, + "grad_norm": 0.00033721158979460597, + "learning_rate": 3.1517380543794145e-05, + "loss": 0.23716237545013427, + "step": 159830 + }, + { + "epoch": 0.6862265268797816, + "grad_norm": 0.6565494537353516, + "learning_rate": 3.1513068823676516e-05, + "loss": 0.24103212356567383, + "step": 159840 + }, + { + "epoch": 0.6862694589698015, + "grad_norm": 0.006676824763417244, + "learning_rate": 3.150875710355889e-05, + "loss": 0.14299737215042113, + "step": 159850 + }, + { + "epoch": 0.6863123910598216, + "grad_norm": 1.2873560190200806, + "learning_rate": 3.150444538344127e-05, + "loss": 0.39502582550048826, + "step": 159860 + }, + { + "epoch": 0.6863553231498416, + "grad_norm": 0.019151126965880394, + "learning_rate": 3.150013366332365e-05, + "loss": 0.001504859235137701, + "step": 159870 + }, + { + "epoch": 0.6863982552398615, + "grad_norm": 1.4346905946731567, + "learning_rate": 3.1495821943206025e-05, + "loss": 0.453825044631958, + "step": 159880 + }, + { + "epoch": 0.6864411873298816, + "grad_norm": 0.0022187780123203993, + "learning_rate": 3.14915102230884e-05, + "loss": 0.19337046146392822, + "step": 159890 + }, + { + "epoch": 0.6864841194199016, + "grad_norm": 0.5133907198905945, + "learning_rate": 3.148719850297078e-05, + "loss": 0.1229052186012268, + "step": 159900 + }, + { + "epoch": 0.6865270515099217, + "grad_norm": 0.06288864463567734, + "learning_rate": 3.148288678285316e-05, + "loss": 0.3862318754196167, + "step": 159910 + }, + { + "epoch": 0.6865699835999416, + "grad_norm": 0.23961742222309113, + "learning_rate": 3.147857506273553e-05, + "loss": 0.28116235733032224, + "step": 159920 + }, + { + "epoch": 0.6866129156899616, + "grad_norm": 2.730591058731079, + "learning_rate": 3.1474263342617905e-05, + "loss": 0.27672324180603025, + "step": 159930 + }, + { + "epoch": 0.6866558477799817, + "grad_norm": 0.0005314791342243552, + "learning_rate": 3.146995162250028e-05, + "loss": 0.33863320350646975, + "step": 159940 + }, + { + "epoch": 0.6866987798700016, + "grad_norm": 0.9973521828651428, + "learning_rate": 3.146563990238266e-05, + "loss": 0.19327343702316285, + "step": 159950 + }, + { + "epoch": 0.6867417119600217, + "grad_norm": 0.028170527890324593, + "learning_rate": 3.146132818226503e-05, + "loss": 0.26722302436828616, + "step": 159960 + }, + { + "epoch": 0.6867846440500417, + "grad_norm": 0.028313281014561653, + "learning_rate": 3.145701646214741e-05, + "loss": 0.16753727197647095, + "step": 159970 + }, + { + "epoch": 0.6868275761400616, + "grad_norm": 0.00603932561352849, + "learning_rate": 3.1452704742029785e-05, + "loss": 0.313533616065979, + "step": 159980 + }, + { + "epoch": 0.6868705082300817, + "grad_norm": 1.455635905265808, + "learning_rate": 3.144839302191216e-05, + "loss": 0.2084658622741699, + "step": 159990 + }, + { + "epoch": 0.6869134403201017, + "grad_norm": 0.003664513351395726, + "learning_rate": 3.144408130179454e-05, + "loss": 0.26113555431365965, + "step": 160000 + }, + { + "epoch": 0.6869134403201017, + "eval_loss": 0.3895556628704071, + "eval_runtime": 27.1269, + "eval_samples_per_second": 3.686, + "eval_steps_per_second": 3.686, + "step": 160000 + }, + { + "epoch": 0.6869563724101216, + "grad_norm": 1.3072751760482788, + "learning_rate": 3.143976958167692e-05, + "loss": 0.25968708992004397, + "step": 160010 + }, + { + "epoch": 0.6869993045001417, + "grad_norm": 0.5385197997093201, + "learning_rate": 3.1435457861559294e-05, + "loss": 0.13607220649719237, + "step": 160020 + }, + { + "epoch": 0.6870422365901617, + "grad_norm": 0.033412832766771317, + "learning_rate": 3.143114614144167e-05, + "loss": 0.23602588176727296, + "step": 160030 + }, + { + "epoch": 0.6870851686801817, + "grad_norm": 0.029198991134762764, + "learning_rate": 3.142683442132404e-05, + "loss": 0.12028307914733886, + "step": 160040 + }, + { + "epoch": 0.6871281007702017, + "grad_norm": 0.0036634765565395355, + "learning_rate": 3.142252270120642e-05, + "loss": 0.2307124137878418, + "step": 160050 + }, + { + "epoch": 0.6871710328602217, + "grad_norm": 8.274073600769043, + "learning_rate": 3.14182109810888e-05, + "loss": 0.3074865102767944, + "step": 160060 + }, + { + "epoch": 0.6872139649502417, + "grad_norm": 0.010077284649014473, + "learning_rate": 3.1413899260971174e-05, + "loss": 0.07915792465209961, + "step": 160070 + }, + { + "epoch": 0.6872568970402617, + "grad_norm": 0.02237793058156967, + "learning_rate": 3.140958754085355e-05, + "loss": 0.3373283624649048, + "step": 160080 + }, + { + "epoch": 0.6872998291302818, + "grad_norm": 0.01876627467572689, + "learning_rate": 3.140527582073592e-05, + "loss": 0.14437304735183715, + "step": 160090 + }, + { + "epoch": 0.6873427612203017, + "grad_norm": 0.08891065418720245, + "learning_rate": 3.14009641006183e-05, + "loss": 0.028795576095581053, + "step": 160100 + }, + { + "epoch": 0.6873856933103217, + "grad_norm": 0.6405884027481079, + "learning_rate": 3.139665238050068e-05, + "loss": 0.16790642738342285, + "step": 160110 + }, + { + "epoch": 0.6874286254003418, + "grad_norm": 2.350557804107666, + "learning_rate": 3.1392340660383054e-05, + "loss": 0.1628109931945801, + "step": 160120 + }, + { + "epoch": 0.6874715574903617, + "grad_norm": 0.002268231939524412, + "learning_rate": 3.138802894026543e-05, + "loss": 0.18723307847976683, + "step": 160130 + }, + { + "epoch": 0.6875144895803817, + "grad_norm": 0.004576073493808508, + "learning_rate": 3.138371722014781e-05, + "loss": 0.19961996078491212, + "step": 160140 + }, + { + "epoch": 0.6875574216704018, + "grad_norm": 6.669589996337891, + "learning_rate": 3.1379405500030186e-05, + "loss": 0.3290428400039673, + "step": 160150 + }, + { + "epoch": 0.6876003537604217, + "grad_norm": 0.938614010810852, + "learning_rate": 3.1375093779912564e-05, + "loss": 0.20669236183166503, + "step": 160160 + }, + { + "epoch": 0.6876432858504418, + "grad_norm": 0.08867359906435013, + "learning_rate": 3.1370782059794934e-05, + "loss": 0.13379579782485962, + "step": 160170 + }, + { + "epoch": 0.6876862179404618, + "grad_norm": 0.0012897284468635917, + "learning_rate": 3.136647033967731e-05, + "loss": 0.32950284481048586, + "step": 160180 + }, + { + "epoch": 0.6877291500304817, + "grad_norm": 0.14256958663463593, + "learning_rate": 3.136215861955969e-05, + "loss": 0.33088634014129636, + "step": 160190 + }, + { + "epoch": 0.6877720821205018, + "grad_norm": 0.0020969025790691376, + "learning_rate": 3.1357846899442066e-05, + "loss": 0.26999151706695557, + "step": 160200 + }, + { + "epoch": 0.6878150142105218, + "grad_norm": 0.6317549347877502, + "learning_rate": 3.135353517932444e-05, + "loss": 0.34250357151031496, + "step": 160210 + }, + { + "epoch": 0.6878579463005418, + "grad_norm": 1.39315927028656, + "learning_rate": 3.1349223459206814e-05, + "loss": 0.26374197006225586, + "step": 160220 + }, + { + "epoch": 0.6879008783905618, + "grad_norm": 0.11452850699424744, + "learning_rate": 3.134491173908919e-05, + "loss": 0.08184933662414551, + "step": 160230 + }, + { + "epoch": 0.6879438104805818, + "grad_norm": 1.91048264503479, + "learning_rate": 3.134060001897157e-05, + "loss": 0.10932408571243286, + "step": 160240 + }, + { + "epoch": 0.6879867425706018, + "grad_norm": 0.05805795639753342, + "learning_rate": 3.1336288298853946e-05, + "loss": 0.29509758949279785, + "step": 160250 + }, + { + "epoch": 0.6880296746606218, + "grad_norm": 0.13939379155635834, + "learning_rate": 3.1331976578736324e-05, + "loss": 0.1063469409942627, + "step": 160260 + }, + { + "epoch": 0.6880726067506419, + "grad_norm": 0.001417188672348857, + "learning_rate": 3.13276648586187e-05, + "loss": 0.0481675386428833, + "step": 160270 + }, + { + "epoch": 0.6881155388406618, + "grad_norm": 0.44573941826820374, + "learning_rate": 3.132335313850108e-05, + "loss": 0.1946892261505127, + "step": 160280 + }, + { + "epoch": 0.6881584709306818, + "grad_norm": 0.018087316304445267, + "learning_rate": 3.131904141838345e-05, + "loss": 0.08029451370239257, + "step": 160290 + }, + { + "epoch": 0.6882014030207019, + "grad_norm": 3.0328128337860107, + "learning_rate": 3.1314729698265826e-05, + "loss": 0.2553675651550293, + "step": 160300 + }, + { + "epoch": 0.6882443351107218, + "grad_norm": 0.018584132194519043, + "learning_rate": 3.1310417978148204e-05, + "loss": 0.15149282217025756, + "step": 160310 + }, + { + "epoch": 0.6882872672007418, + "grad_norm": 0.024891283363103867, + "learning_rate": 3.130610625803058e-05, + "loss": 0.2260596513748169, + "step": 160320 + }, + { + "epoch": 0.6883301992907619, + "grad_norm": 0.010092142969369888, + "learning_rate": 3.130179453791295e-05, + "loss": 0.3067698240280151, + "step": 160330 + }, + { + "epoch": 0.6883731313807819, + "grad_norm": 0.05609379708766937, + "learning_rate": 3.129748281779533e-05, + "loss": 0.007509586960077285, + "step": 160340 + }, + { + "epoch": 0.6884160634708019, + "grad_norm": 2.336320400238037, + "learning_rate": 3.1293171097677706e-05, + "loss": 0.14287641048431396, + "step": 160350 + }, + { + "epoch": 0.6884589955608219, + "grad_norm": 0.007429433986544609, + "learning_rate": 3.128885937756009e-05, + "loss": 0.28202841281890867, + "step": 160360 + }, + { + "epoch": 0.688501927650842, + "grad_norm": 0.31570228934288025, + "learning_rate": 3.128454765744246e-05, + "loss": 0.16258682012557985, + "step": 160370 + }, + { + "epoch": 0.6885448597408619, + "grad_norm": 0.2192019522190094, + "learning_rate": 3.128023593732484e-05, + "loss": 0.06812013983726502, + "step": 160380 + }, + { + "epoch": 0.6885877918308819, + "grad_norm": 1.0381423234939575, + "learning_rate": 3.1275924217207216e-05, + "loss": 0.1076446533203125, + "step": 160390 + }, + { + "epoch": 0.688630723920902, + "grad_norm": 0.008879567496478558, + "learning_rate": 3.127161249708959e-05, + "loss": 0.2584580183029175, + "step": 160400 + }, + { + "epoch": 0.6886736560109219, + "grad_norm": 7.12844705581665, + "learning_rate": 3.126730077697197e-05, + "loss": 0.059986865520477294, + "step": 160410 + }, + { + "epoch": 0.6887165881009419, + "grad_norm": 0.026752643287181854, + "learning_rate": 3.126298905685434e-05, + "loss": 0.24028337001800537, + "step": 160420 + }, + { + "epoch": 0.688759520190962, + "grad_norm": 0.056220006197690964, + "learning_rate": 3.125867733673672e-05, + "loss": 0.2983167409896851, + "step": 160430 + }, + { + "epoch": 0.6888024522809819, + "grad_norm": 0.2604270875453949, + "learning_rate": 3.1254365616619095e-05, + "loss": 0.0750252664089203, + "step": 160440 + }, + { + "epoch": 0.688845384371002, + "grad_norm": 0.009172854013741016, + "learning_rate": 3.125005389650147e-05, + "loss": 0.2441507339477539, + "step": 160450 + }, + { + "epoch": 0.688888316461022, + "grad_norm": 0.33372387290000916, + "learning_rate": 3.124574217638384e-05, + "loss": 0.32252767086029055, + "step": 160460 + }, + { + "epoch": 0.6889312485510419, + "grad_norm": 1.07737398147583, + "learning_rate": 3.124143045626623e-05, + "loss": 0.20390644073486328, + "step": 160470 + }, + { + "epoch": 0.688974180641062, + "grad_norm": 1.6875758171081543, + "learning_rate": 3.1237118736148605e-05, + "loss": 0.22852468490600586, + "step": 160480 + }, + { + "epoch": 0.689017112731082, + "grad_norm": 0.16039396822452545, + "learning_rate": 3.123280701603098e-05, + "loss": 0.22296838760375975, + "step": 160490 + }, + { + "epoch": 0.689060044821102, + "grad_norm": 1.185931921005249, + "learning_rate": 3.122849529591335e-05, + "loss": 0.08817307353019714, + "step": 160500 + }, + { + "epoch": 0.689102976911122, + "grad_norm": 8.558279991149902, + "learning_rate": 3.122418357579573e-05, + "loss": 0.22579166889190674, + "step": 160510 + }, + { + "epoch": 0.689145909001142, + "grad_norm": 0.5402048230171204, + "learning_rate": 3.121987185567811e-05, + "loss": 0.32658090591430666, + "step": 160520 + }, + { + "epoch": 0.689188841091162, + "grad_norm": 7.0292463302612305, + "learning_rate": 3.1215560135560485e-05, + "loss": 0.12179718017578126, + "step": 160530 + }, + { + "epoch": 0.689231773181182, + "grad_norm": 0.03717799857258797, + "learning_rate": 3.1211248415442855e-05, + "loss": 0.3219716787338257, + "step": 160540 + }, + { + "epoch": 0.689274705271202, + "grad_norm": 0.08582460135221481, + "learning_rate": 3.120693669532523e-05, + "loss": 0.3392592191696167, + "step": 160550 + }, + { + "epoch": 0.689317637361222, + "grad_norm": 0.05416666343808174, + "learning_rate": 3.120262497520761e-05, + "loss": 0.18559746742248534, + "step": 160560 + }, + { + "epoch": 0.689360569451242, + "grad_norm": 0.020222166553139687, + "learning_rate": 3.119831325508999e-05, + "loss": 0.10422813892364502, + "step": 160570 + }, + { + "epoch": 0.6894035015412621, + "grad_norm": 2.5958380699157715, + "learning_rate": 3.1194001534972365e-05, + "loss": 0.3050968170166016, + "step": 160580 + }, + { + "epoch": 0.689446433631282, + "grad_norm": 0.02333100326359272, + "learning_rate": 3.118968981485474e-05, + "loss": 0.36220335960388184, + "step": 160590 + }, + { + "epoch": 0.689489365721302, + "grad_norm": 0.003293287241831422, + "learning_rate": 3.118537809473712e-05, + "loss": 0.1430499792098999, + "step": 160600 + }, + { + "epoch": 0.6895322978113221, + "grad_norm": 0.003580573247745633, + "learning_rate": 3.11810663746195e-05, + "loss": 0.26941795349121095, + "step": 160610 + }, + { + "epoch": 0.689575229901342, + "grad_norm": 0.03517276421189308, + "learning_rate": 3.117675465450187e-05, + "loss": 0.2626699447631836, + "step": 160620 + }, + { + "epoch": 0.689618161991362, + "grad_norm": 0.8139095306396484, + "learning_rate": 3.1172442934384245e-05, + "loss": 0.06488630175590515, + "step": 160630 + }, + { + "epoch": 0.6896610940813821, + "grad_norm": 11.103382110595703, + "learning_rate": 3.116813121426662e-05, + "loss": 0.3776247501373291, + "step": 160640 + }, + { + "epoch": 0.689704026171402, + "grad_norm": 1.2519363164901733, + "learning_rate": 3.1163819494149e-05, + "loss": 0.11121902465820313, + "step": 160650 + }, + { + "epoch": 0.6897469582614221, + "grad_norm": 0.36693301796913147, + "learning_rate": 3.115950777403137e-05, + "loss": 0.2761913061141968, + "step": 160660 + }, + { + "epoch": 0.6897898903514421, + "grad_norm": 0.003073914907872677, + "learning_rate": 3.115519605391375e-05, + "loss": 0.053472626209259036, + "step": 160670 + }, + { + "epoch": 0.689832822441462, + "grad_norm": 0.05906624346971512, + "learning_rate": 3.1150884333796125e-05, + "loss": 0.24504506587982178, + "step": 160680 + }, + { + "epoch": 0.6898757545314821, + "grad_norm": 0.018624769523739815, + "learning_rate": 3.11465726136785e-05, + "loss": 0.16039087772369384, + "step": 160690 + }, + { + "epoch": 0.6899186866215021, + "grad_norm": 0.7570061683654785, + "learning_rate": 3.114226089356088e-05, + "loss": 0.1374683976173401, + "step": 160700 + }, + { + "epoch": 0.6899616187115221, + "grad_norm": 0.5951012372970581, + "learning_rate": 3.1137949173443257e-05, + "loss": 0.21923141479492186, + "step": 160710 + }, + { + "epoch": 0.6900045508015421, + "grad_norm": 0.004915403202176094, + "learning_rate": 3.1133637453325634e-05, + "loss": 0.3021175622940063, + "step": 160720 + }, + { + "epoch": 0.6900474828915621, + "grad_norm": 2.058332920074463, + "learning_rate": 3.112932573320801e-05, + "loss": 0.4457847595214844, + "step": 160730 + }, + { + "epoch": 0.6900904149815821, + "grad_norm": 1.1177704334259033, + "learning_rate": 3.112501401309038e-05, + "loss": 0.23458011150360109, + "step": 160740 + }, + { + "epoch": 0.6901333470716021, + "grad_norm": 0.007893427275121212, + "learning_rate": 3.112070229297276e-05, + "loss": 0.28530762195587156, + "step": 160750 + }, + { + "epoch": 0.6901762791616222, + "grad_norm": 0.0039029717445373535, + "learning_rate": 3.1116390572855137e-05, + "loss": 0.23412270545959474, + "step": 160760 + }, + { + "epoch": 0.6902192112516422, + "grad_norm": 0.0970078706741333, + "learning_rate": 3.1112078852737514e-05, + "loss": 0.2985024929046631, + "step": 160770 + }, + { + "epoch": 0.6902621433416621, + "grad_norm": 3.841125726699829, + "learning_rate": 3.110776713261989e-05, + "loss": 0.28366975784301757, + "step": 160780 + }, + { + "epoch": 0.6903050754316822, + "grad_norm": 0.0038798335008323193, + "learning_rate": 3.110345541250226e-05, + "loss": 0.27859447002410886, + "step": 160790 + }, + { + "epoch": 0.6903480075217022, + "grad_norm": 4.823430061340332, + "learning_rate": 3.109914369238464e-05, + "loss": 0.19317221641540527, + "step": 160800 + }, + { + "epoch": 0.6903909396117222, + "grad_norm": 4.753268718719482, + "learning_rate": 3.1094831972267016e-05, + "loss": 0.11289010047912598, + "step": 160810 + }, + { + "epoch": 0.6904338717017422, + "grad_norm": 5.022866249084473, + "learning_rate": 3.1090520252149394e-05, + "loss": 0.17281042337417601, + "step": 160820 + }, + { + "epoch": 0.6904768037917622, + "grad_norm": 0.7847058176994324, + "learning_rate": 3.108620853203177e-05, + "loss": 0.34594273567199707, + "step": 160830 + }, + { + "epoch": 0.6905197358817822, + "grad_norm": 1.2339727878570557, + "learning_rate": 3.108189681191415e-05, + "loss": 0.4348318576812744, + "step": 160840 + }, + { + "epoch": 0.6905626679718022, + "grad_norm": 0.024677403271198273, + "learning_rate": 3.1077585091796526e-05, + "loss": 0.00807729884982109, + "step": 160850 + }, + { + "epoch": 0.6906056000618223, + "grad_norm": 0.12931907176971436, + "learning_rate": 3.10732733716789e-05, + "loss": 0.11871621608734131, + "step": 160860 + }, + { + "epoch": 0.6906485321518422, + "grad_norm": 0.07335333526134491, + "learning_rate": 3.1068961651561274e-05, + "loss": 0.2547421932220459, + "step": 160870 + }, + { + "epoch": 0.6906914642418622, + "grad_norm": 0.05584121495485306, + "learning_rate": 3.106464993144365e-05, + "loss": 0.2417241096496582, + "step": 160880 + }, + { + "epoch": 0.6907343963318823, + "grad_norm": 0.0022020612377673388, + "learning_rate": 3.106033821132603e-05, + "loss": 0.05848243236541748, + "step": 160890 + }, + { + "epoch": 0.6907773284219022, + "grad_norm": 0.03212396800518036, + "learning_rate": 3.1056026491208406e-05, + "loss": 0.36734011173248293, + "step": 160900 + }, + { + "epoch": 0.6908202605119222, + "grad_norm": 1.3712433576583862, + "learning_rate": 3.1051714771090776e-05, + "loss": 0.11554309129714965, + "step": 160910 + }, + { + "epoch": 0.6908631926019423, + "grad_norm": 0.0002910851326305419, + "learning_rate": 3.1047403050973154e-05, + "loss": 0.3291846513748169, + "step": 160920 + }, + { + "epoch": 0.6909061246919622, + "grad_norm": 0.008376223966479301, + "learning_rate": 3.104309133085553e-05, + "loss": 0.17492603063583373, + "step": 160930 + }, + { + "epoch": 0.6909490567819823, + "grad_norm": 0.00044483650708571076, + "learning_rate": 3.103877961073791e-05, + "loss": 0.003586423024535179, + "step": 160940 + }, + { + "epoch": 0.6909919888720023, + "grad_norm": 0.6287297606468201, + "learning_rate": 3.1034467890620286e-05, + "loss": 0.19123516082763672, + "step": 160950 + }, + { + "epoch": 0.6910349209620222, + "grad_norm": 1.69691002368927, + "learning_rate": 3.103015617050266e-05, + "loss": 0.3056190490722656, + "step": 160960 + }, + { + "epoch": 0.6910778530520423, + "grad_norm": 0.202299565076828, + "learning_rate": 3.102584445038504e-05, + "loss": 0.3371596336364746, + "step": 160970 + }, + { + "epoch": 0.6911207851420623, + "grad_norm": 1.769765853881836, + "learning_rate": 3.102153273026742e-05, + "loss": 0.4470851898193359, + "step": 160980 + }, + { + "epoch": 0.6911637172320823, + "grad_norm": 0.14136166870594025, + "learning_rate": 3.101722101014979e-05, + "loss": 0.19823846817016602, + "step": 160990 + }, + { + "epoch": 0.6912066493221023, + "grad_norm": 1.6474438905715942, + "learning_rate": 3.1012909290032166e-05, + "loss": 0.2073206424713135, + "step": 161000 + }, + { + "epoch": 0.6912066493221023, + "eval_loss": 0.3861920237541199, + "eval_runtime": 27.2145, + "eval_samples_per_second": 3.675, + "eval_steps_per_second": 3.675, + "step": 161000 + }, + { + "epoch": 0.6912495814121223, + "grad_norm": 0.3103042542934418, + "learning_rate": 3.100859756991454e-05, + "loss": 0.09276053905487061, + "step": 161010 + }, + { + "epoch": 0.6912925135021423, + "grad_norm": 0.05611109361052513, + "learning_rate": 3.100428584979692e-05, + "loss": 0.24587397575378417, + "step": 161020 + }, + { + "epoch": 0.6913354455921623, + "grad_norm": 0.03766069933772087, + "learning_rate": 3.099997412967929e-05, + "loss": 0.2660505294799805, + "step": 161030 + }, + { + "epoch": 0.6913783776821824, + "grad_norm": 0.11256032437086105, + "learning_rate": 3.099566240956167e-05, + "loss": 0.06991714239120483, + "step": 161040 + }, + { + "epoch": 0.6914213097722023, + "grad_norm": 0.0912373960018158, + "learning_rate": 3.0991350689444046e-05, + "loss": 0.11046913862228394, + "step": 161050 + }, + { + "epoch": 0.6914642418622223, + "grad_norm": 0.043732017278671265, + "learning_rate": 3.098703896932642e-05, + "loss": 0.1306806683540344, + "step": 161060 + }, + { + "epoch": 0.6915071739522424, + "grad_norm": 0.0003347241145092994, + "learning_rate": 3.09827272492088e-05, + "loss": 0.16925837993621826, + "step": 161070 + }, + { + "epoch": 0.6915501060422623, + "grad_norm": 0.013059504330158234, + "learning_rate": 3.097841552909118e-05, + "loss": 0.08966315388679505, + "step": 161080 + }, + { + "epoch": 0.6915930381322823, + "grad_norm": 0.8090730309486389, + "learning_rate": 3.0974103808973555e-05, + "loss": 0.25440073013305664, + "step": 161090 + }, + { + "epoch": 0.6916359702223024, + "grad_norm": 0.026317503303289413, + "learning_rate": 3.096979208885593e-05, + "loss": 0.17350938320159912, + "step": 161100 + }, + { + "epoch": 0.6916789023123223, + "grad_norm": 0.006155726965516806, + "learning_rate": 3.09654803687383e-05, + "loss": 0.08406718969345092, + "step": 161110 + }, + { + "epoch": 0.6917218344023424, + "grad_norm": 0.000807679258286953, + "learning_rate": 3.096116864862068e-05, + "loss": 0.06873743534088135, + "step": 161120 + }, + { + "epoch": 0.6917647664923624, + "grad_norm": 0.05226420238614082, + "learning_rate": 3.095685692850306e-05, + "loss": 0.1460352897644043, + "step": 161130 + }, + { + "epoch": 0.6918076985823823, + "grad_norm": 0.020941512659192085, + "learning_rate": 3.0952545208385435e-05, + "loss": 0.15456979274749755, + "step": 161140 + }, + { + "epoch": 0.6918506306724024, + "grad_norm": 0.006885021924972534, + "learning_rate": 3.094823348826781e-05, + "loss": 0.12112609148025513, + "step": 161150 + }, + { + "epoch": 0.6918935627624224, + "grad_norm": 0.0010463733924552798, + "learning_rate": 3.094392176815018e-05, + "loss": 0.14944925308227539, + "step": 161160 + }, + { + "epoch": 0.6919364948524424, + "grad_norm": 0.0837843269109726, + "learning_rate": 3.093961004803256e-05, + "loss": 0.0708765983581543, + "step": 161170 + }, + { + "epoch": 0.6919794269424624, + "grad_norm": 0.0009495640988461673, + "learning_rate": 3.0935298327914944e-05, + "loss": 0.397504997253418, + "step": 161180 + }, + { + "epoch": 0.6920223590324824, + "grad_norm": 1.5892711877822876, + "learning_rate": 3.093098660779732e-05, + "loss": 0.27846102714538573, + "step": 161190 + }, + { + "epoch": 0.6920652911225025, + "grad_norm": 0.02117239683866501, + "learning_rate": 3.092667488767969e-05, + "loss": 0.1364277720451355, + "step": 161200 + }, + { + "epoch": 0.6921082232125224, + "grad_norm": 0.029707126319408417, + "learning_rate": 3.092236316756207e-05, + "loss": 0.09654564261436463, + "step": 161210 + }, + { + "epoch": 0.6921511553025425, + "grad_norm": 1.0642058849334717, + "learning_rate": 3.091805144744445e-05, + "loss": 0.20506763458251953, + "step": 161220 + }, + { + "epoch": 0.6921940873925625, + "grad_norm": 0.3429529368877411, + "learning_rate": 3.0913739727326824e-05, + "loss": 0.23607473373413085, + "step": 161230 + }, + { + "epoch": 0.6922370194825824, + "grad_norm": 0.008567404001951218, + "learning_rate": 3.0909428007209195e-05, + "loss": 0.18678658008575438, + "step": 161240 + }, + { + "epoch": 0.6922799515726025, + "grad_norm": 0.72314453125, + "learning_rate": 3.090511628709157e-05, + "loss": 0.14462425708770751, + "step": 161250 + }, + { + "epoch": 0.6923228836626225, + "grad_norm": 0.0002619328151922673, + "learning_rate": 3.090080456697395e-05, + "loss": 0.15907952785491944, + "step": 161260 + }, + { + "epoch": 0.6923658157526424, + "grad_norm": 0.003494508331641555, + "learning_rate": 3.089649284685633e-05, + "loss": 0.19829418659210205, + "step": 161270 + }, + { + "epoch": 0.6924087478426625, + "grad_norm": 0.01050402969121933, + "learning_rate": 3.08921811267387e-05, + "loss": 0.23870673179626464, + "step": 161280 + }, + { + "epoch": 0.6924516799326825, + "grad_norm": 2.830049514770508, + "learning_rate": 3.088786940662108e-05, + "loss": 0.3266881465911865, + "step": 161290 + }, + { + "epoch": 0.6924946120227025, + "grad_norm": 0.07396464049816132, + "learning_rate": 3.088355768650346e-05, + "loss": 0.24633843898773194, + "step": 161300 + }, + { + "epoch": 0.6925375441127225, + "grad_norm": 6.711921691894531, + "learning_rate": 3.0879245966385836e-05, + "loss": 0.3260725259780884, + "step": 161310 + }, + { + "epoch": 0.6925804762027425, + "grad_norm": 0.00216602417640388, + "learning_rate": 3.087493424626821e-05, + "loss": 0.152586030960083, + "step": 161320 + }, + { + "epoch": 0.6926234082927625, + "grad_norm": 0.004018913954496384, + "learning_rate": 3.0870622526150584e-05, + "loss": 0.17486083507537842, + "step": 161330 + }, + { + "epoch": 0.6926663403827825, + "grad_norm": 0.04566876217722893, + "learning_rate": 3.086631080603296e-05, + "loss": 0.11617728471755981, + "step": 161340 + }, + { + "epoch": 0.6927092724728026, + "grad_norm": 0.2247513383626938, + "learning_rate": 3.086199908591534e-05, + "loss": 0.23323283195495606, + "step": 161350 + }, + { + "epoch": 0.6927522045628225, + "grad_norm": 0.7604058980941772, + "learning_rate": 3.085768736579771e-05, + "loss": 0.4106475353240967, + "step": 161360 + }, + { + "epoch": 0.6927951366528425, + "grad_norm": 20.811613082885742, + "learning_rate": 3.085337564568009e-05, + "loss": 0.1343095064163208, + "step": 161370 + }, + { + "epoch": 0.6928380687428626, + "grad_norm": 1.6863338947296143, + "learning_rate": 3.0849063925562464e-05, + "loss": 0.2618619680404663, + "step": 161380 + }, + { + "epoch": 0.6928810008328825, + "grad_norm": 0.0029260574374347925, + "learning_rate": 3.084475220544484e-05, + "loss": 0.1627426862716675, + "step": 161390 + }, + { + "epoch": 0.6929239329229026, + "grad_norm": 0.00023793634318280965, + "learning_rate": 3.084044048532722e-05, + "loss": 0.2666466474533081, + "step": 161400 + }, + { + "epoch": 0.6929668650129226, + "grad_norm": 0.010093128308653831, + "learning_rate": 3.0836128765209596e-05, + "loss": 0.26630597114562987, + "step": 161410 + }, + { + "epoch": 0.6930097971029425, + "grad_norm": 0.001136020408011973, + "learning_rate": 3.0831817045091973e-05, + "loss": 0.24697518348693848, + "step": 161420 + }, + { + "epoch": 0.6930527291929626, + "grad_norm": 0.9408860802650452, + "learning_rate": 3.082750532497435e-05, + "loss": 0.16035972833633422, + "step": 161430 + }, + { + "epoch": 0.6930956612829826, + "grad_norm": 7.362483024597168, + "learning_rate": 3.082319360485672e-05, + "loss": 0.1600419044494629, + "step": 161440 + }, + { + "epoch": 0.6931385933730025, + "grad_norm": 0.0010307239135727286, + "learning_rate": 3.08188818847391e-05, + "loss": 0.2767223119735718, + "step": 161450 + }, + { + "epoch": 0.6931815254630226, + "grad_norm": 1.1499717235565186, + "learning_rate": 3.0814570164621476e-05, + "loss": 0.3421752691268921, + "step": 161460 + }, + { + "epoch": 0.6932244575530426, + "grad_norm": 1.340930700302124, + "learning_rate": 3.081025844450385e-05, + "loss": 0.3829795360565186, + "step": 161470 + }, + { + "epoch": 0.6932673896430626, + "grad_norm": 0.5305355787277222, + "learning_rate": 3.0805946724386224e-05, + "loss": 0.32126965522766116, + "step": 161480 + }, + { + "epoch": 0.6933103217330826, + "grad_norm": 0.019384825602173805, + "learning_rate": 3.08016350042686e-05, + "loss": 0.2005154609680176, + "step": 161490 + }, + { + "epoch": 0.6933532538231026, + "grad_norm": 0.0003718382795341313, + "learning_rate": 3.079732328415098e-05, + "loss": 0.19061292409896852, + "step": 161500 + }, + { + "epoch": 0.6933961859131226, + "grad_norm": 0.5108389854431152, + "learning_rate": 3.0793011564033356e-05, + "loss": 0.25514309406280516, + "step": 161510 + }, + { + "epoch": 0.6934391180031426, + "grad_norm": 0.6739466190338135, + "learning_rate": 3.078869984391573e-05, + "loss": 0.2486048936843872, + "step": 161520 + }, + { + "epoch": 0.6934820500931627, + "grad_norm": 1.4415817260742188, + "learning_rate": 3.078438812379811e-05, + "loss": 0.35487942695617675, + "step": 161530 + }, + { + "epoch": 0.6935249821831826, + "grad_norm": 0.013796456158161163, + "learning_rate": 3.078007640368049e-05, + "loss": 0.1332213282585144, + "step": 161540 + }, + { + "epoch": 0.6935679142732026, + "grad_norm": 0.011534093879163265, + "learning_rate": 3.0775764683562865e-05, + "loss": 0.12506383657455444, + "step": 161550 + }, + { + "epoch": 0.6936108463632227, + "grad_norm": 0.444654256105423, + "learning_rate": 3.077145296344524e-05, + "loss": 0.018143628537654877, + "step": 161560 + }, + { + "epoch": 0.6936537784532426, + "grad_norm": 0.0323002003133297, + "learning_rate": 3.076714124332761e-05, + "loss": 0.12150543928146362, + "step": 161570 + }, + { + "epoch": 0.6936967105432627, + "grad_norm": 0.0209288839250803, + "learning_rate": 3.076282952320999e-05, + "loss": 0.4645984172821045, + "step": 161580 + }, + { + "epoch": 0.6937396426332827, + "grad_norm": 0.04016595706343651, + "learning_rate": 3.075851780309237e-05, + "loss": 0.21200146675109863, + "step": 161590 + }, + { + "epoch": 0.6937825747233026, + "grad_norm": 2.0530660152435303, + "learning_rate": 3.0754206082974745e-05, + "loss": 0.24847280979156494, + "step": 161600 + }, + { + "epoch": 0.6938255068133227, + "grad_norm": 0.0006301469402387738, + "learning_rate": 3.0749894362857116e-05, + "loss": 0.21240553855895997, + "step": 161610 + }, + { + "epoch": 0.6938684389033427, + "grad_norm": 0.0044422089122235775, + "learning_rate": 3.074558264273949e-05, + "loss": 0.2668626070022583, + "step": 161620 + }, + { + "epoch": 0.6939113709933628, + "grad_norm": 0.0022322386503219604, + "learning_rate": 3.074127092262187e-05, + "loss": 0.25018165111541746, + "step": 161630 + }, + { + "epoch": 0.6939543030833827, + "grad_norm": 0.0028129578568041325, + "learning_rate": 3.073695920250425e-05, + "loss": 0.11671936511993408, + "step": 161640 + }, + { + "epoch": 0.6939972351734027, + "grad_norm": 0.033121053129434586, + "learning_rate": 3.0732647482386625e-05, + "loss": 0.2637017726898193, + "step": 161650 + }, + { + "epoch": 0.6940401672634228, + "grad_norm": 0.0012332494370639324, + "learning_rate": 3.0728335762269e-05, + "loss": 0.25103752613067626, + "step": 161660 + }, + { + "epoch": 0.6940830993534427, + "grad_norm": 5.106101036071777, + "learning_rate": 3.072402404215138e-05, + "loss": 0.27963852882385254, + "step": 161670 + }, + { + "epoch": 0.6941260314434627, + "grad_norm": 0.5057955384254456, + "learning_rate": 3.071971232203376e-05, + "loss": 0.22827911376953125, + "step": 161680 + }, + { + "epoch": 0.6941689635334828, + "grad_norm": 2.2521936893463135, + "learning_rate": 3.071540060191613e-05, + "loss": 0.13674638271331788, + "step": 161690 + }, + { + "epoch": 0.6942118956235027, + "grad_norm": 0.2217772901058197, + "learning_rate": 3.0711088881798505e-05, + "loss": 0.06654155850410462, + "step": 161700 + }, + { + "epoch": 0.6942548277135228, + "grad_norm": 0.98880535364151, + "learning_rate": 3.070677716168088e-05, + "loss": 0.19069713354110718, + "step": 161710 + }, + { + "epoch": 0.6942977598035428, + "grad_norm": 1.4045045375823975, + "learning_rate": 3.070246544156326e-05, + "loss": 0.18224425315856935, + "step": 161720 + }, + { + "epoch": 0.6943406918935627, + "grad_norm": 0.017259785905480385, + "learning_rate": 3.069815372144563e-05, + "loss": 0.14208518266677855, + "step": 161730 + }, + { + "epoch": 0.6943836239835828, + "grad_norm": 0.004367508925497532, + "learning_rate": 3.069384200132801e-05, + "loss": 0.1599483847618103, + "step": 161740 + }, + { + "epoch": 0.6944265560736028, + "grad_norm": 0.0010708282934501767, + "learning_rate": 3.0689530281210385e-05, + "loss": 0.06576325893402099, + "step": 161750 + }, + { + "epoch": 0.6944694881636228, + "grad_norm": 9.624493598937988, + "learning_rate": 3.068521856109276e-05, + "loss": 0.22715427875518798, + "step": 161760 + }, + { + "epoch": 0.6945124202536428, + "grad_norm": 0.0009245334658771753, + "learning_rate": 3.068090684097514e-05, + "loss": 0.41392016410827637, + "step": 161770 + }, + { + "epoch": 0.6945553523436628, + "grad_norm": 0.07319210469722748, + "learning_rate": 3.067659512085752e-05, + "loss": 0.1870087742805481, + "step": 161780 + }, + { + "epoch": 0.6945982844336828, + "grad_norm": 0.010038390755653381, + "learning_rate": 3.0672283400739894e-05, + "loss": 0.4162933826446533, + "step": 161790 + }, + { + "epoch": 0.6946412165237028, + "grad_norm": 1.0667264461517334, + "learning_rate": 3.066797168062227e-05, + "loss": 0.36954782009124754, + "step": 161800 + }, + { + "epoch": 0.6946841486137229, + "grad_norm": 0.09390226751565933, + "learning_rate": 3.066365996050464e-05, + "loss": 0.08525258898735047, + "step": 161810 + }, + { + "epoch": 0.6947270807037428, + "grad_norm": 1.0057406425476074, + "learning_rate": 3.065934824038702e-05, + "loss": 0.10366959571838379, + "step": 161820 + }, + { + "epoch": 0.6947700127937628, + "grad_norm": 0.027829967439174652, + "learning_rate": 3.06550365202694e-05, + "loss": 0.13686974048614503, + "step": 161830 + }, + { + "epoch": 0.6948129448837829, + "grad_norm": 0.48255249857902527, + "learning_rate": 3.0650724800151774e-05, + "loss": 0.2699114799499512, + "step": 161840 + }, + { + "epoch": 0.6948558769738028, + "grad_norm": 0.5585467219352722, + "learning_rate": 3.0646413080034145e-05, + "loss": 0.16011971235275269, + "step": 161850 + }, + { + "epoch": 0.6948988090638228, + "grad_norm": 0.003796887816861272, + "learning_rate": 3.064210135991652e-05, + "loss": 0.21151573657989503, + "step": 161860 + }, + { + "epoch": 0.6949417411538429, + "grad_norm": 0.0015773115446791053, + "learning_rate": 3.06377896397989e-05, + "loss": 0.30381500720977783, + "step": 161870 + }, + { + "epoch": 0.6949846732438628, + "grad_norm": 1.4874831438064575, + "learning_rate": 3.0633477919681284e-05, + "loss": 0.12889543771743775, + "step": 161880 + }, + { + "epoch": 0.6950276053338829, + "grad_norm": 4.193164825439453, + "learning_rate": 3.062916619956366e-05, + "loss": 0.17410954236984252, + "step": 161890 + }, + { + "epoch": 0.6950705374239029, + "grad_norm": 1.0373023748397827, + "learning_rate": 3.062485447944603e-05, + "loss": 0.3082738399505615, + "step": 161900 + }, + { + "epoch": 0.6951134695139228, + "grad_norm": 0.0028177013155072927, + "learning_rate": 3.062054275932841e-05, + "loss": 0.3014000654220581, + "step": 161910 + }, + { + "epoch": 0.6951564016039429, + "grad_norm": 0.821558952331543, + "learning_rate": 3.0616231039210786e-05, + "loss": 0.18940749168395996, + "step": 161920 + }, + { + "epoch": 0.6951993336939629, + "grad_norm": 0.30685821175575256, + "learning_rate": 3.0611919319093164e-05, + "loss": 0.3193447828292847, + "step": 161930 + }, + { + "epoch": 0.6952422657839828, + "grad_norm": 0.001213685842230916, + "learning_rate": 3.0607607598975534e-05, + "loss": 0.1531369924545288, + "step": 161940 + }, + { + "epoch": 0.6952851978740029, + "grad_norm": 0.2140471488237381, + "learning_rate": 3.060329587885791e-05, + "loss": 0.254035210609436, + "step": 161950 + }, + { + "epoch": 0.6953281299640229, + "grad_norm": 1.7509942054748535, + "learning_rate": 3.059898415874029e-05, + "loss": 0.23533205986022948, + "step": 161960 + }, + { + "epoch": 0.6953710620540429, + "grad_norm": 2.06317400932312, + "learning_rate": 3.0594672438622666e-05, + "loss": 0.1791067361831665, + "step": 161970 + }, + { + "epoch": 0.6954139941440629, + "grad_norm": 0.04516097903251648, + "learning_rate": 3.059036071850504e-05, + "loss": 0.17306737899780272, + "step": 161980 + }, + { + "epoch": 0.695456926234083, + "grad_norm": 0.0025410563684999943, + "learning_rate": 3.058604899838742e-05, + "loss": 0.2030165672302246, + "step": 161990 + }, + { + "epoch": 0.6954998583241029, + "grad_norm": 1.482028603553772, + "learning_rate": 3.05817372782698e-05, + "loss": 0.4031516551971436, + "step": 162000 + }, + { + "epoch": 0.6954998583241029, + "eval_loss": 0.38559821248054504, + "eval_runtime": 27.2501, + "eval_samples_per_second": 3.67, + "eval_steps_per_second": 3.67, + "step": 162000 + }, + { + "epoch": 0.6955427904141229, + "grad_norm": 0.06098842993378639, + "learning_rate": 3.0577425558152176e-05, + "loss": 0.032109972834587094, + "step": 162010 + }, + { + "epoch": 0.695585722504143, + "grad_norm": 0.001314207329414785, + "learning_rate": 3.0573113838034546e-05, + "loss": 0.42228131294250487, + "step": 162020 + }, + { + "epoch": 0.6956286545941629, + "grad_norm": 0.01732785813510418, + "learning_rate": 3.0568802117916924e-05, + "loss": 0.25459225177764894, + "step": 162030 + }, + { + "epoch": 0.6956715866841829, + "grad_norm": 6.963688850402832, + "learning_rate": 3.05644903977993e-05, + "loss": 0.23578083515167236, + "step": 162040 + }, + { + "epoch": 0.695714518774203, + "grad_norm": 1.8500609397888184, + "learning_rate": 3.056017867768168e-05, + "loss": 0.08068374395370484, + "step": 162050 + }, + { + "epoch": 0.695757450864223, + "grad_norm": 3.6719369888305664, + "learning_rate": 3.055586695756405e-05, + "loss": 0.2566834926605225, + "step": 162060 + }, + { + "epoch": 0.695800382954243, + "grad_norm": 0.011841950006783009, + "learning_rate": 3.0551555237446426e-05, + "loss": 0.12200218439102173, + "step": 162070 + }, + { + "epoch": 0.695843315044263, + "grad_norm": 0.5013278126716614, + "learning_rate": 3.0547243517328804e-05, + "loss": 0.11271430253982544, + "step": 162080 + }, + { + "epoch": 0.695886247134283, + "grad_norm": 0.0028883975464850664, + "learning_rate": 3.054293179721118e-05, + "loss": 0.06581767201423645, + "step": 162090 + }, + { + "epoch": 0.695929179224303, + "grad_norm": 0.05332833155989647, + "learning_rate": 3.053862007709356e-05, + "loss": 0.12176322937011719, + "step": 162100 + }, + { + "epoch": 0.695972111314323, + "grad_norm": 0.029225388541817665, + "learning_rate": 3.0534308356975936e-05, + "loss": 0.29567220211029055, + "step": 162110 + }, + { + "epoch": 0.6960150434043431, + "grad_norm": 0.010390534065663815, + "learning_rate": 3.052999663685831e-05, + "loss": 0.2578840732574463, + "step": 162120 + }, + { + "epoch": 0.696057975494363, + "grad_norm": 2.247405767440796, + "learning_rate": 3.052568491674069e-05, + "loss": 0.461102819442749, + "step": 162130 + }, + { + "epoch": 0.696100907584383, + "grad_norm": 0.010135611519217491, + "learning_rate": 3.052137319662306e-05, + "loss": 0.278436279296875, + "step": 162140 + }, + { + "epoch": 0.6961438396744031, + "grad_norm": 0.01691882126033306, + "learning_rate": 3.0517061476505438e-05, + "loss": 0.11886712312698364, + "step": 162150 + }, + { + "epoch": 0.696186771764423, + "grad_norm": 0.030704345554113388, + "learning_rate": 3.0512749756387815e-05, + "loss": 0.3023359775543213, + "step": 162160 + }, + { + "epoch": 0.696229703854443, + "grad_norm": 0.002497435314580798, + "learning_rate": 3.0508438036270193e-05, + "loss": 0.22855567932128906, + "step": 162170 + }, + { + "epoch": 0.6962726359444631, + "grad_norm": 0.034782905131578445, + "learning_rate": 3.0504126316152563e-05, + "loss": 0.06216605305671692, + "step": 162180 + }, + { + "epoch": 0.696315568034483, + "grad_norm": 1.0647518634796143, + "learning_rate": 3.049981459603494e-05, + "loss": 0.1632169008255005, + "step": 162190 + }, + { + "epoch": 0.6963585001245031, + "grad_norm": 0.20471031963825226, + "learning_rate": 3.049550287591732e-05, + "loss": 0.17721099853515626, + "step": 162200 + }, + { + "epoch": 0.6964014322145231, + "grad_norm": 0.0030015218071639538, + "learning_rate": 3.04911911557997e-05, + "loss": 0.2211029052734375, + "step": 162210 + }, + { + "epoch": 0.696444364304543, + "grad_norm": 1.2218440771102905, + "learning_rate": 3.048687943568207e-05, + "loss": 0.21433238983154296, + "step": 162220 + }, + { + "epoch": 0.6964872963945631, + "grad_norm": 1.407080888748169, + "learning_rate": 3.0482567715564447e-05, + "loss": 0.23432455062866211, + "step": 162230 + }, + { + "epoch": 0.6965302284845831, + "grad_norm": 0.1319386512041092, + "learning_rate": 3.0478255995446824e-05, + "loss": 0.17873740196228027, + "step": 162240 + }, + { + "epoch": 0.6965731605746031, + "grad_norm": 0.0010449601104483008, + "learning_rate": 3.04739442753292e-05, + "loss": 0.14610795974731444, + "step": 162250 + }, + { + "epoch": 0.6966160926646231, + "grad_norm": 1.0889495611190796, + "learning_rate": 3.046963255521158e-05, + "loss": 0.23132104873657228, + "step": 162260 + }, + { + "epoch": 0.6966590247546431, + "grad_norm": 3.6569557189941406, + "learning_rate": 3.0465320835093953e-05, + "loss": 0.25479922294616697, + "step": 162270 + }, + { + "epoch": 0.6967019568446631, + "grad_norm": 0.13837526738643646, + "learning_rate": 3.046100911497633e-05, + "loss": 0.023555827140808106, + "step": 162280 + }, + { + "epoch": 0.6967448889346831, + "grad_norm": 0.11677303165197372, + "learning_rate": 3.0456697394858707e-05, + "loss": 0.035855191946029666, + "step": 162290 + }, + { + "epoch": 0.6967878210247032, + "grad_norm": 0.03607289493083954, + "learning_rate": 3.0452385674741085e-05, + "loss": 0.3567406892776489, + "step": 162300 + }, + { + "epoch": 0.6968307531147231, + "grad_norm": 0.022890016436576843, + "learning_rate": 3.044807395462346e-05, + "loss": 0.08604545593261718, + "step": 162310 + }, + { + "epoch": 0.6968736852047431, + "grad_norm": 0.02981598488986492, + "learning_rate": 3.0443762234505836e-05, + "loss": 0.19815583229064943, + "step": 162320 + }, + { + "epoch": 0.6969166172947632, + "grad_norm": 0.316649854183197, + "learning_rate": 3.0439450514388213e-05, + "loss": 0.1974259853363037, + "step": 162330 + }, + { + "epoch": 0.6969595493847831, + "grad_norm": 2.9811651706695557, + "learning_rate": 3.043513879427059e-05, + "loss": 0.1774838924407959, + "step": 162340 + }, + { + "epoch": 0.6970024814748031, + "grad_norm": 0.007071362342685461, + "learning_rate": 3.043082707415296e-05, + "loss": 0.09746151566505432, + "step": 162350 + }, + { + "epoch": 0.6970454135648232, + "grad_norm": 4.316941738128662, + "learning_rate": 3.042651535403534e-05, + "loss": 0.30870752334594725, + "step": 162360 + }, + { + "epoch": 0.6970883456548431, + "grad_norm": 0.0259851086884737, + "learning_rate": 3.0422203633917716e-05, + "loss": 0.2086500644683838, + "step": 162370 + }, + { + "epoch": 0.6971312777448632, + "grad_norm": 0.08933248370885849, + "learning_rate": 3.0417891913800097e-05, + "loss": 0.30362553596496583, + "step": 162380 + }, + { + "epoch": 0.6971742098348832, + "grad_norm": 0.010057899169623852, + "learning_rate": 3.0413580193682467e-05, + "loss": 0.11705089807510376, + "step": 162390 + }, + { + "epoch": 0.6972171419249031, + "grad_norm": 0.01879618689417839, + "learning_rate": 3.0409268473564845e-05, + "loss": 0.3392336845397949, + "step": 162400 + }, + { + "epoch": 0.6972600740149232, + "grad_norm": 74.72895050048828, + "learning_rate": 3.0404956753447222e-05, + "loss": 0.23954296112060547, + "step": 162410 + }, + { + "epoch": 0.6973030061049432, + "grad_norm": 0.0661221295595169, + "learning_rate": 3.04006450333296e-05, + "loss": 0.3313016176223755, + "step": 162420 + }, + { + "epoch": 0.6973459381949632, + "grad_norm": 3.2034785747528076, + "learning_rate": 3.0396333313211973e-05, + "loss": 0.202050518989563, + "step": 162430 + }, + { + "epoch": 0.6973888702849832, + "grad_norm": 0.010310985147953033, + "learning_rate": 3.039202159309435e-05, + "loss": 0.3512110233306885, + "step": 162440 + }, + { + "epoch": 0.6974318023750032, + "grad_norm": 3.286404848098755, + "learning_rate": 3.0387709872976728e-05, + "loss": 0.30471627712249755, + "step": 162450 + }, + { + "epoch": 0.6974747344650232, + "grad_norm": 0.011127809062600136, + "learning_rate": 3.0383398152859105e-05, + "loss": 0.0021067624911665916, + "step": 162460 + }, + { + "epoch": 0.6975176665550432, + "grad_norm": 0.2516888976097107, + "learning_rate": 3.0379086432741476e-05, + "loss": 0.18555980920791626, + "step": 162470 + }, + { + "epoch": 0.6975605986450633, + "grad_norm": 4.618213653564453, + "learning_rate": 3.0374774712623853e-05, + "loss": 0.3885476112365723, + "step": 162480 + }, + { + "epoch": 0.6976035307350833, + "grad_norm": 0.1642121523618698, + "learning_rate": 3.0370462992506234e-05, + "loss": 0.23192245960235597, + "step": 162490 + }, + { + "epoch": 0.6976464628251032, + "grad_norm": 0.015001763589680195, + "learning_rate": 3.036615127238861e-05, + "loss": 0.2386481523513794, + "step": 162500 + }, + { + "epoch": 0.6976893949151233, + "grad_norm": 0.03155812993645668, + "learning_rate": 3.0361839552270982e-05, + "loss": 0.20554816722869873, + "step": 162510 + }, + { + "epoch": 0.6977323270051433, + "grad_norm": 0.04460495710372925, + "learning_rate": 3.035752783215336e-05, + "loss": 0.23647263050079345, + "step": 162520 + }, + { + "epoch": 0.6977752590951632, + "grad_norm": 0.040947332978248596, + "learning_rate": 3.0353216112035737e-05, + "loss": 0.23655667304992675, + "step": 162530 + }, + { + "epoch": 0.6978181911851833, + "grad_norm": 0.008029299788177013, + "learning_rate": 3.0348904391918114e-05, + "loss": 0.158699893951416, + "step": 162540 + }, + { + "epoch": 0.6978611232752033, + "grad_norm": 0.03955543786287308, + "learning_rate": 3.0344592671800488e-05, + "loss": 0.09176256060600281, + "step": 162550 + }, + { + "epoch": 0.6979040553652233, + "grad_norm": 0.03155898675322533, + "learning_rate": 3.0340280951682865e-05, + "loss": 0.2126624345779419, + "step": 162560 + }, + { + "epoch": 0.6979469874552433, + "grad_norm": 0.03072783350944519, + "learning_rate": 3.0335969231565242e-05, + "loss": 0.21906230449676514, + "step": 162570 + }, + { + "epoch": 0.6979899195452633, + "grad_norm": 0.2748047411441803, + "learning_rate": 3.033165751144762e-05, + "loss": 0.17269535064697267, + "step": 162580 + }, + { + "epoch": 0.6980328516352833, + "grad_norm": 0.2810547947883606, + "learning_rate": 3.032734579132999e-05, + "loss": 0.21564557552337646, + "step": 162590 + }, + { + "epoch": 0.6980757837253033, + "grad_norm": 2.9127063751220703, + "learning_rate": 3.032303407121237e-05, + "loss": 0.1324462652206421, + "step": 162600 + }, + { + "epoch": 0.6981187158153234, + "grad_norm": 1.284143328666687, + "learning_rate": 3.031872235109475e-05, + "loss": 0.42905235290527344, + "step": 162610 + }, + { + "epoch": 0.6981616479053433, + "grad_norm": 0.05410167947411537, + "learning_rate": 3.0314410630977126e-05, + "loss": 0.19309380054473876, + "step": 162620 + }, + { + "epoch": 0.6982045799953633, + "grad_norm": 0.13504809141159058, + "learning_rate": 3.0310098910859503e-05, + "loss": 0.09252834916114808, + "step": 162630 + }, + { + "epoch": 0.6982475120853834, + "grad_norm": 0.024992559105157852, + "learning_rate": 3.0305787190741874e-05, + "loss": 0.23650457859039306, + "step": 162640 + }, + { + "epoch": 0.6982904441754033, + "grad_norm": 6.187001705169678, + "learning_rate": 3.030147547062425e-05, + "loss": 0.19541068077087403, + "step": 162650 + }, + { + "epoch": 0.6983333762654234, + "grad_norm": 2.498851776123047, + "learning_rate": 3.029716375050663e-05, + "loss": 0.37836170196533203, + "step": 162660 + }, + { + "epoch": 0.6983763083554434, + "grad_norm": 0.027705468237400055, + "learning_rate": 3.0292852030389006e-05, + "loss": 0.2952697277069092, + "step": 162670 + }, + { + "epoch": 0.6984192404454633, + "grad_norm": 0.02884257212281227, + "learning_rate": 3.028854031027138e-05, + "loss": 0.14661755561828613, + "step": 162680 + }, + { + "epoch": 0.6984621725354834, + "grad_norm": 0.21641422808170319, + "learning_rate": 3.0284228590153757e-05, + "loss": 0.2717902421951294, + "step": 162690 + }, + { + "epoch": 0.6985051046255034, + "grad_norm": 1.3049355745315552, + "learning_rate": 3.0279916870036134e-05, + "loss": 0.17167997360229492, + "step": 162700 + }, + { + "epoch": 0.6985480367155233, + "grad_norm": 1.1670953035354614, + "learning_rate": 3.0275605149918512e-05, + "loss": 0.23196604251861572, + "step": 162710 + }, + { + "epoch": 0.6985909688055434, + "grad_norm": 0.0794382318854332, + "learning_rate": 3.0271293429800886e-05, + "loss": 0.2593263626098633, + "step": 162720 + }, + { + "epoch": 0.6986339008955634, + "grad_norm": 4.211735248565674, + "learning_rate": 3.0266981709683263e-05, + "loss": 0.3013274669647217, + "step": 162730 + }, + { + "epoch": 0.6986768329855834, + "grad_norm": 19.782581329345703, + "learning_rate": 3.026266998956564e-05, + "loss": 0.18726621866226195, + "step": 162740 + }, + { + "epoch": 0.6987197650756034, + "grad_norm": 2.8106091022491455, + "learning_rate": 3.0258358269448018e-05, + "loss": 0.2839569330215454, + "step": 162750 + }, + { + "epoch": 0.6987626971656234, + "grad_norm": 0.14473241567611694, + "learning_rate": 3.0254046549330388e-05, + "loss": 0.22602901458740235, + "step": 162760 + }, + { + "epoch": 0.6988056292556434, + "grad_norm": 0.009015548974275589, + "learning_rate": 3.0249734829212766e-05, + "loss": 0.21210157871246338, + "step": 162770 + }, + { + "epoch": 0.6988485613456634, + "grad_norm": 0.007358007598668337, + "learning_rate": 3.0245423109095143e-05, + "loss": 0.1613713264465332, + "step": 162780 + }, + { + "epoch": 0.6988914934356835, + "grad_norm": 0.7762153744697571, + "learning_rate": 3.0241111388977524e-05, + "loss": 0.23849725723266602, + "step": 162790 + }, + { + "epoch": 0.6989344255257034, + "grad_norm": 0.025210915133357048, + "learning_rate": 3.0236799668859894e-05, + "loss": 0.1408442735671997, + "step": 162800 + }, + { + "epoch": 0.6989773576157234, + "grad_norm": 0.2695859372615814, + "learning_rate": 3.023248794874227e-05, + "loss": 0.20804314613342284, + "step": 162810 + }, + { + "epoch": 0.6990202897057435, + "grad_norm": 0.34144577383995056, + "learning_rate": 3.022817622862465e-05, + "loss": 0.23827857971191407, + "step": 162820 + }, + { + "epoch": 0.6990632217957634, + "grad_norm": 0.05939861387014389, + "learning_rate": 3.0223864508507026e-05, + "loss": 0.16040401458740233, + "step": 162830 + }, + { + "epoch": 0.6991061538857835, + "grad_norm": 0.03952369838953018, + "learning_rate": 3.02195527883894e-05, + "loss": 0.04338504374027252, + "step": 162840 + }, + { + "epoch": 0.6991490859758035, + "grad_norm": 0.06770238280296326, + "learning_rate": 3.0215241068271778e-05, + "loss": 0.35053086280822754, + "step": 162850 + }, + { + "epoch": 0.6991920180658234, + "grad_norm": 0.04769308120012283, + "learning_rate": 3.0210929348154155e-05, + "loss": 0.17852011919021607, + "step": 162860 + }, + { + "epoch": 0.6992349501558435, + "grad_norm": 0.1503300666809082, + "learning_rate": 3.0206617628036532e-05, + "loss": 0.17389656305313111, + "step": 162870 + }, + { + "epoch": 0.6992778822458635, + "grad_norm": 23.386436462402344, + "learning_rate": 3.0202305907918903e-05, + "loss": 0.25399277210235593, + "step": 162880 + }, + { + "epoch": 0.6993208143358834, + "grad_norm": 0.028125956654548645, + "learning_rate": 3.019799418780128e-05, + "loss": 0.08183012008666993, + "step": 162890 + }, + { + "epoch": 0.6993637464259035, + "grad_norm": 0.020639831200242043, + "learning_rate": 3.019368246768366e-05, + "loss": 0.19469664096832276, + "step": 162900 + }, + { + "epoch": 0.6994066785159235, + "grad_norm": 0.17118969559669495, + "learning_rate": 3.0189370747566038e-05, + "loss": 0.2829993724822998, + "step": 162910 + }, + { + "epoch": 0.6994496106059436, + "grad_norm": 0.006604107096791267, + "learning_rate": 3.018505902744841e-05, + "loss": 0.3266103267669678, + "step": 162920 + }, + { + "epoch": 0.6994925426959635, + "grad_norm": 0.004010102711617947, + "learning_rate": 3.0180747307330786e-05, + "loss": 0.11663607358932496, + "step": 162930 + }, + { + "epoch": 0.6995354747859835, + "grad_norm": 23.561052322387695, + "learning_rate": 3.0176435587213164e-05, + "loss": 0.04907224178314209, + "step": 162940 + }, + { + "epoch": 0.6995784068760036, + "grad_norm": 0.003062853356823325, + "learning_rate": 3.017212386709554e-05, + "loss": 0.06884866952896118, + "step": 162950 + }, + { + "epoch": 0.6996213389660235, + "grad_norm": 1.0447180271148682, + "learning_rate": 3.0167812146977918e-05, + "loss": 0.23694450855255128, + "step": 162960 + }, + { + "epoch": 0.6996642710560436, + "grad_norm": 0.02204667404294014, + "learning_rate": 3.0163500426860292e-05, + "loss": 0.11371309757232666, + "step": 162970 + }, + { + "epoch": 0.6997072031460636, + "grad_norm": 1.6962567567825317, + "learning_rate": 3.015918870674267e-05, + "loss": 0.253522253036499, + "step": 162980 + }, + { + "epoch": 0.6997501352360835, + "grad_norm": 2.769549608230591, + "learning_rate": 3.0154876986625047e-05, + "loss": 0.18700562715530394, + "step": 162990 + }, + { + "epoch": 0.6997930673261036, + "grad_norm": 0.029071137309074402, + "learning_rate": 3.0150565266507424e-05, + "loss": 0.17749787569046022, + "step": 163000 + }, + { + "epoch": 0.6997930673261036, + "eval_loss": 0.3839460015296936, + "eval_runtime": 27.2931, + "eval_samples_per_second": 3.664, + "eval_steps_per_second": 3.664, + "step": 163000 + }, + { + "epoch": 0.6998359994161236, + "grad_norm": 0.04307461529970169, + "learning_rate": 3.0146253546389798e-05, + "loss": 0.2530116319656372, + "step": 163010 + }, + { + "epoch": 0.6998789315061436, + "grad_norm": 0.6724159717559814, + "learning_rate": 3.0141941826272175e-05, + "loss": 0.2557330846786499, + "step": 163020 + }, + { + "epoch": 0.6999218635961636, + "grad_norm": 0.05646286904811859, + "learning_rate": 3.0137630106154553e-05, + "loss": 0.2898242473602295, + "step": 163030 + }, + { + "epoch": 0.6999647956861836, + "grad_norm": 3.105821371078491, + "learning_rate": 3.013331838603693e-05, + "loss": 0.13781187534332276, + "step": 163040 + }, + { + "epoch": 0.7000077277762036, + "grad_norm": 0.054522983729839325, + "learning_rate": 3.01290066659193e-05, + "loss": 0.37929842472076414, + "step": 163050 + }, + { + "epoch": 0.7000506598662236, + "grad_norm": 0.006386724766343832, + "learning_rate": 3.0124694945801678e-05, + "loss": 0.3429419040679932, + "step": 163060 + }, + { + "epoch": 0.7000935919562437, + "grad_norm": 0.09468874335289001, + "learning_rate": 3.0120383225684055e-05, + "loss": 0.26384339332580564, + "step": 163070 + }, + { + "epoch": 0.7001365240462636, + "grad_norm": 1.3094730377197266, + "learning_rate": 3.0116071505566433e-05, + "loss": 0.17248300313949586, + "step": 163080 + }, + { + "epoch": 0.7001794561362836, + "grad_norm": 0.6385529041290283, + "learning_rate": 3.0111759785448807e-05, + "loss": 0.15869448184967042, + "step": 163090 + }, + { + "epoch": 0.7002223882263037, + "grad_norm": 0.006541683804243803, + "learning_rate": 3.0107448065331184e-05, + "loss": 0.2500231504440308, + "step": 163100 + }, + { + "epoch": 0.7002653203163236, + "grad_norm": 0.0062957098707556725, + "learning_rate": 3.010313634521356e-05, + "loss": 0.14104583263397216, + "step": 163110 + }, + { + "epoch": 0.7003082524063436, + "grad_norm": 0.039266157895326614, + "learning_rate": 3.009882462509594e-05, + "loss": 0.26926708221435547, + "step": 163120 + }, + { + "epoch": 0.7003511844963637, + "grad_norm": 11.471166610717773, + "learning_rate": 3.0094512904978313e-05, + "loss": 0.2930644989013672, + "step": 163130 + }, + { + "epoch": 0.7003941165863836, + "grad_norm": 5.168342590332031, + "learning_rate": 3.009020118486069e-05, + "loss": 0.3205289363861084, + "step": 163140 + }, + { + "epoch": 0.7004370486764037, + "grad_norm": 0.34781506657600403, + "learning_rate": 3.0085889464743067e-05, + "loss": 0.17421427965164185, + "step": 163150 + }, + { + "epoch": 0.7004799807664237, + "grad_norm": 0.009231721051037312, + "learning_rate": 3.0081577744625445e-05, + "loss": 0.22379372119903565, + "step": 163160 + }, + { + "epoch": 0.7005229128564436, + "grad_norm": 2.9101412296295166, + "learning_rate": 3.0077266024507815e-05, + "loss": 0.2867826700210571, + "step": 163170 + }, + { + "epoch": 0.7005658449464637, + "grad_norm": 0.15839970111846924, + "learning_rate": 3.0072954304390193e-05, + "loss": 0.2264240264892578, + "step": 163180 + }, + { + "epoch": 0.7006087770364837, + "grad_norm": 0.0323498360812664, + "learning_rate": 3.006864258427257e-05, + "loss": 0.00971728190779686, + "step": 163190 + }, + { + "epoch": 0.7006517091265037, + "grad_norm": 0.19851140677928925, + "learning_rate": 3.006433086415495e-05, + "loss": 0.25482997894287107, + "step": 163200 + }, + { + "epoch": 0.7006946412165237, + "grad_norm": 0.19752870500087738, + "learning_rate": 3.006001914403732e-05, + "loss": 0.3481988668441772, + "step": 163210 + }, + { + "epoch": 0.7007375733065437, + "grad_norm": 3.4968249797821045, + "learning_rate": 3.00557074239197e-05, + "loss": 0.29238917827606203, + "step": 163220 + }, + { + "epoch": 0.7007805053965637, + "grad_norm": 0.020744258537888527, + "learning_rate": 3.0051395703802076e-05, + "loss": 0.06940485835075379, + "step": 163230 + }, + { + "epoch": 0.7008234374865837, + "grad_norm": 0.6456360816955566, + "learning_rate": 3.0047083983684453e-05, + "loss": 0.42883834838867185, + "step": 163240 + }, + { + "epoch": 0.7008663695766038, + "grad_norm": 1.1091424226760864, + "learning_rate": 3.0042772263566827e-05, + "loss": 0.16743401288986207, + "step": 163250 + }, + { + "epoch": 0.7009093016666237, + "grad_norm": 0.24076159298419952, + "learning_rate": 3.0038460543449205e-05, + "loss": 0.11621925830841065, + "step": 163260 + }, + { + "epoch": 0.7009522337566437, + "grad_norm": 0.013866151683032513, + "learning_rate": 3.0034148823331582e-05, + "loss": 0.330096960067749, + "step": 163270 + }, + { + "epoch": 0.7009951658466638, + "grad_norm": 3.419060468673706, + "learning_rate": 3.002983710321396e-05, + "loss": 0.3908973693847656, + "step": 163280 + }, + { + "epoch": 0.7010380979366837, + "grad_norm": 0.012446640059351921, + "learning_rate": 3.002552538309633e-05, + "loss": 0.34545049667358396, + "step": 163290 + }, + { + "epoch": 0.7010810300267037, + "grad_norm": 1.4316107034683228, + "learning_rate": 3.0021213662978707e-05, + "loss": 0.22633516788482666, + "step": 163300 + }, + { + "epoch": 0.7011239621167238, + "grad_norm": 0.06977680325508118, + "learning_rate": 3.0016901942861088e-05, + "loss": 0.004347345978021622, + "step": 163310 + }, + { + "epoch": 0.7011668942067437, + "grad_norm": 1.5991872549057007, + "learning_rate": 3.0012590222743465e-05, + "loss": 0.12896416187286378, + "step": 163320 + }, + { + "epoch": 0.7012098262967638, + "grad_norm": 1.6064382791519165, + "learning_rate": 3.0008278502625843e-05, + "loss": 0.45462779998779296, + "step": 163330 + }, + { + "epoch": 0.7012527583867838, + "grad_norm": 2.761544942855835, + "learning_rate": 3.0003966782508213e-05, + "loss": 0.2084115743637085, + "step": 163340 + }, + { + "epoch": 0.7012956904768038, + "grad_norm": 3.6172492504119873, + "learning_rate": 2.999965506239059e-05, + "loss": 0.20417706966400145, + "step": 163350 + }, + { + "epoch": 0.7013386225668238, + "grad_norm": 47.53818893432617, + "learning_rate": 2.9995343342272968e-05, + "loss": 0.18387155532836913, + "step": 163360 + }, + { + "epoch": 0.7013815546568438, + "grad_norm": 0.009035330265760422, + "learning_rate": 2.9991031622155345e-05, + "loss": 0.03579406440258026, + "step": 163370 + }, + { + "epoch": 0.7014244867468639, + "grad_norm": 0.04270453378558159, + "learning_rate": 2.998671990203772e-05, + "loss": 0.1977940797805786, + "step": 163380 + }, + { + "epoch": 0.7014674188368838, + "grad_norm": 1.1373462677001953, + "learning_rate": 2.9982408181920097e-05, + "loss": 0.18321282863616944, + "step": 163390 + }, + { + "epoch": 0.7015103509269038, + "grad_norm": 0.001747044501826167, + "learning_rate": 2.9978096461802474e-05, + "loss": 0.21532964706420898, + "step": 163400 + }, + { + "epoch": 0.7015532830169239, + "grad_norm": 5.351308345794678, + "learning_rate": 2.997378474168485e-05, + "loss": 0.22291159629821777, + "step": 163410 + }, + { + "epoch": 0.7015962151069438, + "grad_norm": 2.718691349029541, + "learning_rate": 2.9969473021567225e-05, + "loss": 0.1568952441215515, + "step": 163420 + }, + { + "epoch": 0.7016391471969639, + "grad_norm": 0.00798012875020504, + "learning_rate": 2.9965161301449603e-05, + "loss": 0.16636523008346557, + "step": 163430 + }, + { + "epoch": 0.7016820792869839, + "grad_norm": 0.000884895445778966, + "learning_rate": 2.996084958133198e-05, + "loss": 0.032444655895233154, + "step": 163440 + }, + { + "epoch": 0.7017250113770038, + "grad_norm": 31.404062271118164, + "learning_rate": 2.9956537861214357e-05, + "loss": 0.2867364645004272, + "step": 163450 + }, + { + "epoch": 0.7017679434670239, + "grad_norm": 1.8071774244308472, + "learning_rate": 2.9952226141096728e-05, + "loss": 0.17658989429473876, + "step": 163460 + }, + { + "epoch": 0.7018108755570439, + "grad_norm": 0.2980644702911377, + "learning_rate": 2.9947914420979105e-05, + "loss": 0.17964842319488525, + "step": 163470 + }, + { + "epoch": 0.7018538076470638, + "grad_norm": 53.753273010253906, + "learning_rate": 2.9943602700861482e-05, + "loss": 0.22252166271209717, + "step": 163480 + }, + { + "epoch": 0.7018967397370839, + "grad_norm": 8.756794929504395, + "learning_rate": 2.9939290980743863e-05, + "loss": 0.18342812061309816, + "step": 163490 + }, + { + "epoch": 0.7019396718271039, + "grad_norm": 1.8001806735992432, + "learning_rate": 2.9934979260626234e-05, + "loss": 0.2673120737075806, + "step": 163500 + }, + { + "epoch": 0.7019826039171239, + "grad_norm": 0.003850214881822467, + "learning_rate": 2.993066754050861e-05, + "loss": 0.2851763486862183, + "step": 163510 + }, + { + "epoch": 0.7020255360071439, + "grad_norm": 0.010906051844358444, + "learning_rate": 2.992635582039099e-05, + "loss": 0.24270126819610596, + "step": 163520 + }, + { + "epoch": 0.702068468097164, + "grad_norm": 1.0036766529083252, + "learning_rate": 2.9922044100273366e-05, + "loss": 0.19612650871276854, + "step": 163530 + }, + { + "epoch": 0.7021114001871839, + "grad_norm": 0.01582266017794609, + "learning_rate": 2.991773238015574e-05, + "loss": 0.2539534568786621, + "step": 163540 + }, + { + "epoch": 0.7021543322772039, + "grad_norm": 0.5005191564559937, + "learning_rate": 2.9913420660038117e-05, + "loss": 0.1888645648956299, + "step": 163550 + }, + { + "epoch": 0.702197264367224, + "grad_norm": 0.14598479866981506, + "learning_rate": 2.9909108939920494e-05, + "loss": 0.3390333890914917, + "step": 163560 + }, + { + "epoch": 0.7022401964572439, + "grad_norm": 1.5303585529327393, + "learning_rate": 2.9904797219802872e-05, + "loss": 0.4039153099060059, + "step": 163570 + }, + { + "epoch": 0.7022831285472639, + "grad_norm": 0.0033237277530133724, + "learning_rate": 2.9900485499685242e-05, + "loss": 0.3125044822692871, + "step": 163580 + }, + { + "epoch": 0.702326060637284, + "grad_norm": 0.015881020575761795, + "learning_rate": 2.989617377956762e-05, + "loss": 0.307576847076416, + "step": 163590 + }, + { + "epoch": 0.7023689927273039, + "grad_norm": 0.028332769870758057, + "learning_rate": 2.989186205945e-05, + "loss": 0.1998500108718872, + "step": 163600 + }, + { + "epoch": 0.702411924817324, + "grad_norm": 1.9299815893173218, + "learning_rate": 2.9887550339332378e-05, + "loss": 0.24326274394989014, + "step": 163610 + }, + { + "epoch": 0.702454856907344, + "grad_norm": 0.03395046293735504, + "learning_rate": 2.988323861921475e-05, + "loss": 0.43087282180786135, + "step": 163620 + }, + { + "epoch": 0.7024977889973639, + "grad_norm": 0.9291477799415588, + "learning_rate": 2.9878926899097126e-05, + "loss": 0.24743344783782958, + "step": 163630 + }, + { + "epoch": 0.702540721087384, + "grad_norm": 4.53275203704834, + "learning_rate": 2.9874615178979503e-05, + "loss": 0.25386340618133546, + "step": 163640 + }, + { + "epoch": 0.702583653177404, + "grad_norm": 0.40106382966041565, + "learning_rate": 2.987030345886188e-05, + "loss": 0.3923983573913574, + "step": 163650 + }, + { + "epoch": 0.7026265852674239, + "grad_norm": 0.26700788736343384, + "learning_rate": 2.9865991738744254e-05, + "loss": 0.1837749719619751, + "step": 163660 + }, + { + "epoch": 0.702669517357444, + "grad_norm": 1.212033987045288, + "learning_rate": 2.986168001862663e-05, + "loss": 0.412782096862793, + "step": 163670 + }, + { + "epoch": 0.702712449447464, + "grad_norm": 1.8219190835952759, + "learning_rate": 2.985736829850901e-05, + "loss": 0.3716761350631714, + "step": 163680 + }, + { + "epoch": 0.702755381537484, + "grad_norm": 0.006801880896091461, + "learning_rate": 2.9853056578391386e-05, + "loss": 0.19434766769409179, + "step": 163690 + }, + { + "epoch": 0.702798313627504, + "grad_norm": 0.06828784197568893, + "learning_rate": 2.9848744858273764e-05, + "loss": 0.007297980785369873, + "step": 163700 + }, + { + "epoch": 0.702841245717524, + "grad_norm": 0.14776858687400818, + "learning_rate": 2.9844433138156138e-05, + "loss": 0.1427057147026062, + "step": 163710 + }, + { + "epoch": 0.702884177807544, + "grad_norm": 0.23514142632484436, + "learning_rate": 2.9840121418038515e-05, + "loss": 0.07293086647987365, + "step": 163720 + }, + { + "epoch": 0.702927109897564, + "grad_norm": 4.044814109802246, + "learning_rate": 2.9835809697920892e-05, + "loss": 0.26663668155670167, + "step": 163730 + }, + { + "epoch": 0.7029700419875841, + "grad_norm": 0.8512552380561829, + "learning_rate": 2.983149797780327e-05, + "loss": 0.14922486543655394, + "step": 163740 + }, + { + "epoch": 0.703012974077604, + "grad_norm": 0.0908672958612442, + "learning_rate": 2.982718625768564e-05, + "loss": 0.15927162170410156, + "step": 163750 + }, + { + "epoch": 0.703055906167624, + "grad_norm": 0.006831469014286995, + "learning_rate": 2.9822874537568018e-05, + "loss": 0.03169746696949005, + "step": 163760 + }, + { + "epoch": 0.7030988382576441, + "grad_norm": 1.0652174949645996, + "learning_rate": 2.9818562817450395e-05, + "loss": 0.3184578657150269, + "step": 163770 + }, + { + "epoch": 0.7031417703476641, + "grad_norm": 0.0035700947046279907, + "learning_rate": 2.9814251097332772e-05, + "loss": 0.06982010006904601, + "step": 163780 + }, + { + "epoch": 0.703184702437684, + "grad_norm": 0.1739826202392578, + "learning_rate": 2.9809939377215146e-05, + "loss": 0.27461676597595214, + "step": 163790 + }, + { + "epoch": 0.7032276345277041, + "grad_norm": 2.1455235481262207, + "learning_rate": 2.9805627657097524e-05, + "loss": 0.14346520900726317, + "step": 163800 + }, + { + "epoch": 0.7032705666177241, + "grad_norm": 3.9834342002868652, + "learning_rate": 2.98013159369799e-05, + "loss": 0.22382516860961915, + "step": 163810 + }, + { + "epoch": 0.7033134987077441, + "grad_norm": 1.1752283573150635, + "learning_rate": 2.9797004216862278e-05, + "loss": 0.17417526245117188, + "step": 163820 + }, + { + "epoch": 0.7033564307977641, + "grad_norm": 0.03304734826087952, + "learning_rate": 2.9792692496744652e-05, + "loss": 0.19004673957824708, + "step": 163830 + }, + { + "epoch": 0.7033993628877842, + "grad_norm": 0.0051128193736076355, + "learning_rate": 2.978838077662703e-05, + "loss": 0.18268777132034303, + "step": 163840 + }, + { + "epoch": 0.7034422949778041, + "grad_norm": 0.16571545600891113, + "learning_rate": 2.9784069056509407e-05, + "loss": 0.17011449337005616, + "step": 163850 + }, + { + "epoch": 0.7034852270678241, + "grad_norm": 3.0325210094451904, + "learning_rate": 2.9779757336391784e-05, + "loss": 0.38422060012817383, + "step": 163860 + }, + { + "epoch": 0.7035281591578442, + "grad_norm": 1.36028254032135, + "learning_rate": 2.9775445616274155e-05, + "loss": 0.09718289971351624, + "step": 163870 + }, + { + "epoch": 0.7035710912478641, + "grad_norm": 0.07002713531255722, + "learning_rate": 2.9771133896156532e-05, + "loss": 0.1471462607383728, + "step": 163880 + }, + { + "epoch": 0.7036140233378841, + "grad_norm": 0.015247693285346031, + "learning_rate": 2.976682217603891e-05, + "loss": 0.22470765113830565, + "step": 163890 + }, + { + "epoch": 0.7036569554279042, + "grad_norm": 3.642178535461426, + "learning_rate": 2.976251045592129e-05, + "loss": 0.12147370576858521, + "step": 163900 + }, + { + "epoch": 0.7036998875179241, + "grad_norm": 0.21306264400482178, + "learning_rate": 2.975819873580366e-05, + "loss": 0.14276522397994995, + "step": 163910 + }, + { + "epoch": 0.7037428196079442, + "grad_norm": 0.025422273203730583, + "learning_rate": 2.9753887015686038e-05, + "loss": 0.20691957473754882, + "step": 163920 + }, + { + "epoch": 0.7037857516979642, + "grad_norm": 1.892135500907898, + "learning_rate": 2.9749575295568415e-05, + "loss": 0.3095592260360718, + "step": 163930 + }, + { + "epoch": 0.7038286837879841, + "grad_norm": 0.9915587902069092, + "learning_rate": 2.9745263575450793e-05, + "loss": 0.16103274822235109, + "step": 163940 + }, + { + "epoch": 0.7038716158780042, + "grad_norm": 0.003843039972707629, + "learning_rate": 2.9740951855333167e-05, + "loss": 0.05715287327766418, + "step": 163950 + }, + { + "epoch": 0.7039145479680242, + "grad_norm": 0.7359153628349304, + "learning_rate": 2.9736640135215544e-05, + "loss": 0.11851122379302978, + "step": 163960 + }, + { + "epoch": 0.7039574800580441, + "grad_norm": 0.02368875965476036, + "learning_rate": 2.973232841509792e-05, + "loss": 0.12844005823135377, + "step": 163970 + }, + { + "epoch": 0.7040004121480642, + "grad_norm": 0.0908665657043457, + "learning_rate": 2.97280166949803e-05, + "loss": 0.20799777507781983, + "step": 163980 + }, + { + "epoch": 0.7040433442380842, + "grad_norm": 31.04422378540039, + "learning_rate": 2.972370497486267e-05, + "loss": 0.18210945129394532, + "step": 163990 + }, + { + "epoch": 0.7040862763281042, + "grad_norm": 0.05075189843773842, + "learning_rate": 2.9719393254745047e-05, + "loss": 0.09743213057518005, + "step": 164000 + }, + { + "epoch": 0.7040862763281042, + "eval_loss": 0.3779441714286804, + "eval_runtime": 27.1446, + "eval_samples_per_second": 3.684, + "eval_steps_per_second": 3.684, + "step": 164000 + }, + { + "epoch": 0.7041292084181242, + "grad_norm": 1.5645118951797485, + "learning_rate": 2.9715081534627427e-05, + "loss": 0.25287203788757323, + "step": 164010 + }, + { + "epoch": 0.7041721405081443, + "grad_norm": 0.00030088808853179216, + "learning_rate": 2.9710769814509805e-05, + "loss": 0.16287697553634645, + "step": 164020 + }, + { + "epoch": 0.7042150725981642, + "grad_norm": 6.047826766967773, + "learning_rate": 2.9706458094392175e-05, + "loss": 0.20500681400299073, + "step": 164030 + }, + { + "epoch": 0.7042580046881842, + "grad_norm": 0.08400680869817734, + "learning_rate": 2.9702146374274553e-05, + "loss": 0.29985120296478274, + "step": 164040 + }, + { + "epoch": 0.7043009367782043, + "grad_norm": 0.006796732544898987, + "learning_rate": 2.969783465415693e-05, + "loss": 0.2870039463043213, + "step": 164050 + }, + { + "epoch": 0.7043438688682242, + "grad_norm": 0.008046969771385193, + "learning_rate": 2.9693522934039307e-05, + "loss": 0.3308976888656616, + "step": 164060 + }, + { + "epoch": 0.7043868009582442, + "grad_norm": 2.2312533855438232, + "learning_rate": 2.9689211213921685e-05, + "loss": 0.40290260314941406, + "step": 164070 + }, + { + "epoch": 0.7044297330482643, + "grad_norm": 0.42959392070770264, + "learning_rate": 2.968489949380406e-05, + "loss": 0.06268092393875122, + "step": 164080 + }, + { + "epoch": 0.7044726651382842, + "grad_norm": 0.004218485672026873, + "learning_rate": 2.9680587773686436e-05, + "loss": 0.2813071012496948, + "step": 164090 + }, + { + "epoch": 0.7045155972283043, + "grad_norm": 0.001964650582522154, + "learning_rate": 2.9676276053568813e-05, + "loss": 0.28354315757751464, + "step": 164100 + }, + { + "epoch": 0.7045585293183243, + "grad_norm": 0.9442742466926575, + "learning_rate": 2.967196433345119e-05, + "loss": 0.2996357917785645, + "step": 164110 + }, + { + "epoch": 0.7046014614083442, + "grad_norm": 0.15573832392692566, + "learning_rate": 2.9667652613333565e-05, + "loss": 0.10072107315063476, + "step": 164120 + }, + { + "epoch": 0.7046443934983643, + "grad_norm": 1.8645374774932861, + "learning_rate": 2.9663340893215942e-05, + "loss": 0.28838138580322265, + "step": 164130 + }, + { + "epoch": 0.7046873255883843, + "grad_norm": 0.3522595167160034, + "learning_rate": 2.965902917309832e-05, + "loss": 0.2901811122894287, + "step": 164140 + }, + { + "epoch": 0.7047302576784042, + "grad_norm": 7.661362648010254, + "learning_rate": 2.9654717452980697e-05, + "loss": 0.35709872245788576, + "step": 164150 + }, + { + "epoch": 0.7047731897684243, + "grad_norm": 0.27700191736221313, + "learning_rate": 2.9650405732863067e-05, + "loss": 0.15413910150527954, + "step": 164160 + }, + { + "epoch": 0.7048161218584443, + "grad_norm": 0.004466890823096037, + "learning_rate": 2.9646094012745445e-05, + "loss": 0.03817626535892486, + "step": 164170 + }, + { + "epoch": 0.7048590539484643, + "grad_norm": 0.9056572318077087, + "learning_rate": 2.9641782292627822e-05, + "loss": 0.13778855800628662, + "step": 164180 + }, + { + "epoch": 0.7049019860384843, + "grad_norm": 0.007168032694607973, + "learning_rate": 2.96374705725102e-05, + "loss": 0.1839970111846924, + "step": 164190 + }, + { + "epoch": 0.7049449181285043, + "grad_norm": 0.019991997629404068, + "learning_rate": 2.9633158852392573e-05, + "loss": 0.26103713512420657, + "step": 164200 + }, + { + "epoch": 0.7049878502185244, + "grad_norm": 0.290790319442749, + "learning_rate": 2.962884713227495e-05, + "loss": 0.24262683391571044, + "step": 164210 + }, + { + "epoch": 0.7050307823085443, + "grad_norm": 7.077462196350098, + "learning_rate": 2.9624535412157328e-05, + "loss": 0.3445000648498535, + "step": 164220 + }, + { + "epoch": 0.7050737143985644, + "grad_norm": 2.3658647537231445, + "learning_rate": 2.9620223692039705e-05, + "loss": 0.34368832111358644, + "step": 164230 + }, + { + "epoch": 0.7051166464885844, + "grad_norm": 0.002537068212404847, + "learning_rate": 2.961591197192208e-05, + "loss": 0.1858936071395874, + "step": 164240 + }, + { + "epoch": 0.7051595785786043, + "grad_norm": 0.002385231666266918, + "learning_rate": 2.9611600251804457e-05, + "loss": 0.22579362392425537, + "step": 164250 + }, + { + "epoch": 0.7052025106686244, + "grad_norm": 1.203508734703064, + "learning_rate": 2.9607288531686834e-05, + "loss": 0.27742347717285154, + "step": 164260 + }, + { + "epoch": 0.7052454427586444, + "grad_norm": 0.0387476347386837, + "learning_rate": 2.960297681156921e-05, + "loss": 0.3718020677566528, + "step": 164270 + }, + { + "epoch": 0.7052883748486644, + "grad_norm": 0.006723400205373764, + "learning_rate": 2.9598665091451582e-05, + "loss": 0.10956581830978393, + "step": 164280 + }, + { + "epoch": 0.7053313069386844, + "grad_norm": 4.690652370452881, + "learning_rate": 2.959435337133396e-05, + "loss": 0.24487228393554689, + "step": 164290 + }, + { + "epoch": 0.7053742390287044, + "grad_norm": 0.00888009648770094, + "learning_rate": 2.9590041651216336e-05, + "loss": 0.06979911923408508, + "step": 164300 + }, + { + "epoch": 0.7054171711187244, + "grad_norm": 1.0406445264816284, + "learning_rate": 2.9585729931098717e-05, + "loss": 0.2905172348022461, + "step": 164310 + }, + { + "epoch": 0.7054601032087444, + "grad_norm": 0.5761302709579468, + "learning_rate": 2.9581418210981088e-05, + "loss": 0.34133856296539306, + "step": 164320 + }, + { + "epoch": 0.7055030352987645, + "grad_norm": 0.08215140551328659, + "learning_rate": 2.9577106490863465e-05, + "loss": 0.31227014064788816, + "step": 164330 + }, + { + "epoch": 0.7055459673887844, + "grad_norm": 7.857476234436035, + "learning_rate": 2.9572794770745842e-05, + "loss": 0.22481787204742432, + "step": 164340 + }, + { + "epoch": 0.7055888994788044, + "grad_norm": 1.1321688890457153, + "learning_rate": 2.956848305062822e-05, + "loss": 0.161775803565979, + "step": 164350 + }, + { + "epoch": 0.7056318315688245, + "grad_norm": 0.003534104209393263, + "learning_rate": 2.9564171330510594e-05, + "loss": 0.2884944677352905, + "step": 164360 + }, + { + "epoch": 0.7056747636588444, + "grad_norm": 0.03924204409122467, + "learning_rate": 2.955985961039297e-05, + "loss": 0.23131952285766602, + "step": 164370 + }, + { + "epoch": 0.7057176957488644, + "grad_norm": 0.055216234177351, + "learning_rate": 2.955554789027535e-05, + "loss": 0.09212759137153625, + "step": 164380 + }, + { + "epoch": 0.7057606278388845, + "grad_norm": 1.711653709411621, + "learning_rate": 2.9551236170157726e-05, + "loss": 0.20180110931396483, + "step": 164390 + }, + { + "epoch": 0.7058035599289044, + "grad_norm": 2.030550479888916, + "learning_rate": 2.9546924450040096e-05, + "loss": 0.30741727352142334, + "step": 164400 + }, + { + "epoch": 0.7058464920189245, + "grad_norm": 0.010752196423709393, + "learning_rate": 2.9542612729922474e-05, + "loss": 0.09185991883277893, + "step": 164410 + }, + { + "epoch": 0.7058894241089445, + "grad_norm": 0.0016691704513505101, + "learning_rate": 2.9538301009804854e-05, + "loss": 0.1287990927696228, + "step": 164420 + }, + { + "epoch": 0.7059323561989644, + "grad_norm": 0.04343206062912941, + "learning_rate": 2.9533989289687232e-05, + "loss": 0.07160269618034362, + "step": 164430 + }, + { + "epoch": 0.7059752882889845, + "grad_norm": 0.0019043717766180634, + "learning_rate": 2.952967756956961e-05, + "loss": 0.12070275545120239, + "step": 164440 + }, + { + "epoch": 0.7060182203790045, + "grad_norm": 4.755942344665527, + "learning_rate": 2.952536584945198e-05, + "loss": 0.29552769660949707, + "step": 164450 + }, + { + "epoch": 0.7060611524690245, + "grad_norm": 0.00850929506123066, + "learning_rate": 2.9521054129334357e-05, + "loss": 0.25412487983703613, + "step": 164460 + }, + { + "epoch": 0.7061040845590445, + "grad_norm": 0.4611488878726959, + "learning_rate": 2.9516742409216734e-05, + "loss": 0.09284462332725525, + "step": 164470 + }, + { + "epoch": 0.7061470166490645, + "grad_norm": 1.1242001056671143, + "learning_rate": 2.9512430689099112e-05, + "loss": 0.3816138744354248, + "step": 164480 + }, + { + "epoch": 0.7061899487390845, + "grad_norm": 0.011605273000895977, + "learning_rate": 2.9508118968981486e-05, + "loss": 0.14563461542129516, + "step": 164490 + }, + { + "epoch": 0.7062328808291045, + "grad_norm": 3.635684013366699, + "learning_rate": 2.9503807248863863e-05, + "loss": 0.39076852798461914, + "step": 164500 + }, + { + "epoch": 0.7062758129191246, + "grad_norm": 0.005340252537280321, + "learning_rate": 2.949949552874624e-05, + "loss": 0.26224353313446047, + "step": 164510 + }, + { + "epoch": 0.7063187450091445, + "grad_norm": 0.03700922802090645, + "learning_rate": 2.9495183808628618e-05, + "loss": 0.11134896278381348, + "step": 164520 + }, + { + "epoch": 0.7063616770991645, + "grad_norm": 1.8511571884155273, + "learning_rate": 2.949087208851099e-05, + "loss": 0.17045905590057372, + "step": 164530 + }, + { + "epoch": 0.7064046091891846, + "grad_norm": 0.010725017637014389, + "learning_rate": 2.948656036839337e-05, + "loss": 0.35302841663360596, + "step": 164540 + }, + { + "epoch": 0.7064475412792045, + "grad_norm": 0.00597262941300869, + "learning_rate": 2.9482248648275746e-05, + "loss": 0.14913132190704345, + "step": 164550 + }, + { + "epoch": 0.7064904733692245, + "grad_norm": 0.0034178998321294785, + "learning_rate": 2.9477936928158124e-05, + "loss": 0.23776566982269287, + "step": 164560 + }, + { + "epoch": 0.7065334054592446, + "grad_norm": 0.009466869756579399, + "learning_rate": 2.9473625208040494e-05, + "loss": 0.14307208061218263, + "step": 164570 + }, + { + "epoch": 0.7065763375492645, + "grad_norm": 0.08489704132080078, + "learning_rate": 2.946931348792287e-05, + "loss": 0.1949027180671692, + "step": 164580 + }, + { + "epoch": 0.7066192696392846, + "grad_norm": 0.03956615552306175, + "learning_rate": 2.946500176780525e-05, + "loss": 0.13790332078933715, + "step": 164590 + }, + { + "epoch": 0.7066622017293046, + "grad_norm": 0.038982268422842026, + "learning_rate": 2.946069004768763e-05, + "loss": 0.33584163188934324, + "step": 164600 + }, + { + "epoch": 0.7067051338193245, + "grad_norm": 5.0026421546936035, + "learning_rate": 2.945637832757e-05, + "loss": 0.313620924949646, + "step": 164610 + }, + { + "epoch": 0.7067480659093446, + "grad_norm": 1.726048469543457, + "learning_rate": 2.9452066607452378e-05, + "loss": 0.3042426109313965, + "step": 164620 + }, + { + "epoch": 0.7067909979993646, + "grad_norm": 0.07862383872270584, + "learning_rate": 2.9447754887334755e-05, + "loss": 0.1545378804206848, + "step": 164630 + }, + { + "epoch": 0.7068339300893847, + "grad_norm": 0.019337479025125504, + "learning_rate": 2.9443443167217132e-05, + "loss": 0.24584414958953857, + "step": 164640 + }, + { + "epoch": 0.7068768621794046, + "grad_norm": 0.00637492910027504, + "learning_rate": 2.9439131447099506e-05, + "loss": 0.1771503210067749, + "step": 164650 + }, + { + "epoch": 0.7069197942694246, + "grad_norm": 0.0154104707762599, + "learning_rate": 2.9434819726981884e-05, + "loss": 0.2208927631378174, + "step": 164660 + }, + { + "epoch": 0.7069627263594447, + "grad_norm": 0.013810674659907818, + "learning_rate": 2.943050800686426e-05, + "loss": 0.07610102891921997, + "step": 164670 + }, + { + "epoch": 0.7070056584494646, + "grad_norm": 0.026029305532574654, + "learning_rate": 2.9426196286746638e-05, + "loss": 0.16726453304290773, + "step": 164680 + }, + { + "epoch": 0.7070485905394847, + "grad_norm": 1.9912234544754028, + "learning_rate": 2.942188456662901e-05, + "loss": 0.2336212396621704, + "step": 164690 + }, + { + "epoch": 0.7070915226295047, + "grad_norm": 0.11577221751213074, + "learning_rate": 2.9417572846511386e-05, + "loss": 0.29100732803344725, + "step": 164700 + }, + { + "epoch": 0.7071344547195246, + "grad_norm": 1.0063371658325195, + "learning_rate": 2.9413261126393767e-05, + "loss": 0.241862154006958, + "step": 164710 + }, + { + "epoch": 0.7071773868095447, + "grad_norm": 1.699659824371338, + "learning_rate": 2.9408949406276144e-05, + "loss": 0.3465231657028198, + "step": 164720 + }, + { + "epoch": 0.7072203188995647, + "grad_norm": 0.2365957647562027, + "learning_rate": 2.9404637686158515e-05, + "loss": 0.05553056001663208, + "step": 164730 + }, + { + "epoch": 0.7072632509895846, + "grad_norm": 0.8947508931159973, + "learning_rate": 2.9400325966040892e-05, + "loss": 0.1890372157096863, + "step": 164740 + }, + { + "epoch": 0.7073061830796047, + "grad_norm": 0.0036553533282130957, + "learning_rate": 2.939601424592327e-05, + "loss": 0.19252634048461914, + "step": 164750 + }, + { + "epoch": 0.7073491151696247, + "grad_norm": 0.005008448846638203, + "learning_rate": 2.9391702525805647e-05, + "loss": 0.07748492956161498, + "step": 164760 + }, + { + "epoch": 0.7073920472596447, + "grad_norm": 0.14192329347133636, + "learning_rate": 2.938739080568802e-05, + "loss": 0.006033249571919442, + "step": 164770 + }, + { + "epoch": 0.7074349793496647, + "grad_norm": 8.749017715454102, + "learning_rate": 2.9383079085570398e-05, + "loss": 0.5570028305053711, + "step": 164780 + }, + { + "epoch": 0.7074779114396847, + "grad_norm": 0.09582618623971939, + "learning_rate": 2.9378767365452775e-05, + "loss": 0.18232868909835814, + "step": 164790 + }, + { + "epoch": 0.7075208435297047, + "grad_norm": 0.010088094510138035, + "learning_rate": 2.9374455645335153e-05, + "loss": 0.2660231590270996, + "step": 164800 + }, + { + "epoch": 0.7075637756197247, + "grad_norm": 0.026392368599772453, + "learning_rate": 2.937014392521753e-05, + "loss": 0.0775251030921936, + "step": 164810 + }, + { + "epoch": 0.7076067077097448, + "grad_norm": 0.1605832576751709, + "learning_rate": 2.9365832205099904e-05, + "loss": 0.08925783634185791, + "step": 164820 + }, + { + "epoch": 0.7076496397997647, + "grad_norm": 0.0047328840009868145, + "learning_rate": 2.936152048498228e-05, + "loss": 0.3366368293762207, + "step": 164830 + }, + { + "epoch": 0.7076925718897847, + "grad_norm": 0.08522465825080872, + "learning_rate": 2.935720876486466e-05, + "loss": 0.30194315910339353, + "step": 164840 + }, + { + "epoch": 0.7077355039798048, + "grad_norm": 0.0030174916610121727, + "learning_rate": 2.9352897044747036e-05, + "loss": 0.0655434787273407, + "step": 164850 + }, + { + "epoch": 0.7077784360698247, + "grad_norm": 2.143808603286743, + "learning_rate": 2.9348585324629407e-05, + "loss": 0.32743024826049805, + "step": 164860 + }, + { + "epoch": 0.7078213681598448, + "grad_norm": 2.470440626144409, + "learning_rate": 2.9344273604511784e-05, + "loss": 0.22581710815429687, + "step": 164870 + }, + { + "epoch": 0.7078643002498648, + "grad_norm": 0.007416535634547472, + "learning_rate": 2.933996188439416e-05, + "loss": 0.29442901611328126, + "step": 164880 + }, + { + "epoch": 0.7079072323398847, + "grad_norm": 0.2843577265739441, + "learning_rate": 2.933565016427654e-05, + "loss": 0.21048979759216307, + "step": 164890 + }, + { + "epoch": 0.7079501644299048, + "grad_norm": 0.0796031728386879, + "learning_rate": 2.9331338444158913e-05, + "loss": 0.09109994173049926, + "step": 164900 + }, + { + "epoch": 0.7079930965199248, + "grad_norm": 0.017023451626300812, + "learning_rate": 2.932702672404129e-05, + "loss": 0.21580660343170166, + "step": 164910 + }, + { + "epoch": 0.7080360286099447, + "grad_norm": 0.06896127760410309, + "learning_rate": 2.9322715003923667e-05, + "loss": 0.05191414952278137, + "step": 164920 + }, + { + "epoch": 0.7080789606999648, + "grad_norm": 2.6581945419311523, + "learning_rate": 2.9318403283806045e-05, + "loss": 0.2914731025695801, + "step": 164930 + }, + { + "epoch": 0.7081218927899848, + "grad_norm": 1.7635499238967896, + "learning_rate": 2.931409156368842e-05, + "loss": 0.2414705753326416, + "step": 164940 + }, + { + "epoch": 0.7081648248800048, + "grad_norm": 0.1912590116262436, + "learning_rate": 2.9309779843570796e-05, + "loss": 0.14573431015014648, + "step": 164950 + }, + { + "epoch": 0.7082077569700248, + "grad_norm": 0.06119944900274277, + "learning_rate": 2.9305468123453173e-05, + "loss": 0.22457616329193114, + "step": 164960 + }, + { + "epoch": 0.7082506890600448, + "grad_norm": 3.581524133682251, + "learning_rate": 2.930115640333555e-05, + "loss": 0.37599098682403564, + "step": 164970 + }, + { + "epoch": 0.7082936211500648, + "grad_norm": 0.3666439950466156, + "learning_rate": 2.929684468321792e-05, + "loss": 0.1905382513999939, + "step": 164980 + }, + { + "epoch": 0.7083365532400848, + "grad_norm": 1.3603236675262451, + "learning_rate": 2.92925329631003e-05, + "loss": 0.314105224609375, + "step": 164990 + }, + { + "epoch": 0.7083794853301049, + "grad_norm": 1.4144307374954224, + "learning_rate": 2.9288221242982676e-05, + "loss": 0.29831576347351074, + "step": 165000 + }, + { + "epoch": 0.7083794853301049, + "eval_loss": 0.37844017148017883, + "eval_runtime": 27.115, + "eval_samples_per_second": 3.688, + "eval_steps_per_second": 3.688, + "step": 165000 + }, + { + "epoch": 0.7084224174201248, + "grad_norm": 2.283473253250122, + "learning_rate": 2.9283909522865057e-05, + "loss": 0.07131168842315674, + "step": 165010 + }, + { + "epoch": 0.7084653495101448, + "grad_norm": 0.03843050077557564, + "learning_rate": 2.9279597802747427e-05, + "loss": 0.27201735973358154, + "step": 165020 + }, + { + "epoch": 0.7085082816001649, + "grad_norm": 0.00798146240413189, + "learning_rate": 2.9275286082629805e-05, + "loss": 0.0913933277130127, + "step": 165030 + }, + { + "epoch": 0.7085512136901848, + "grad_norm": 0.03475451096892357, + "learning_rate": 2.9270974362512182e-05, + "loss": 0.1362240195274353, + "step": 165040 + }, + { + "epoch": 0.7085941457802049, + "grad_norm": 0.04747939482331276, + "learning_rate": 2.926666264239456e-05, + "loss": 0.07424071431159973, + "step": 165050 + }, + { + "epoch": 0.7086370778702249, + "grad_norm": 0.11417294293642044, + "learning_rate": 2.9262350922276933e-05, + "loss": 0.07617428302764892, + "step": 165060 + }, + { + "epoch": 0.7086800099602449, + "grad_norm": 0.034629106521606445, + "learning_rate": 2.925803920215931e-05, + "loss": 0.36686632633209226, + "step": 165070 + }, + { + "epoch": 0.7087229420502649, + "grad_norm": 0.13259123265743256, + "learning_rate": 2.9253727482041688e-05, + "loss": 0.1826852798461914, + "step": 165080 + }, + { + "epoch": 0.7087658741402849, + "grad_norm": 2.0491228103637695, + "learning_rate": 2.9249415761924065e-05, + "loss": 0.09049310684204101, + "step": 165090 + }, + { + "epoch": 0.708808806230305, + "grad_norm": 0.0009569579851813614, + "learning_rate": 2.9245104041806436e-05, + "loss": 0.026717782020568848, + "step": 165100 + }, + { + "epoch": 0.7088517383203249, + "grad_norm": 0.18775348365306854, + "learning_rate": 2.9240792321688813e-05, + "loss": 0.19738540649414063, + "step": 165110 + }, + { + "epoch": 0.7088946704103449, + "grad_norm": 0.0023286493960767984, + "learning_rate": 2.9236480601571194e-05, + "loss": 0.22296249866485596, + "step": 165120 + }, + { + "epoch": 0.708937602500365, + "grad_norm": 1.2228485345840454, + "learning_rate": 2.923216888145357e-05, + "loss": 0.19773554801940918, + "step": 165130 + }, + { + "epoch": 0.7089805345903849, + "grad_norm": 0.07914045453071594, + "learning_rate": 2.922785716133595e-05, + "loss": 0.18020875453948976, + "step": 165140 + }, + { + "epoch": 0.709023466680405, + "grad_norm": 0.04731407389044762, + "learning_rate": 2.922354544121832e-05, + "loss": 0.09294969439506531, + "step": 165150 + }, + { + "epoch": 0.709066398770425, + "grad_norm": 1.9331458806991577, + "learning_rate": 2.9219233721100696e-05, + "loss": 0.15570180416107177, + "step": 165160 + }, + { + "epoch": 0.7091093308604449, + "grad_norm": 0.001042968942783773, + "learning_rate": 2.9214922000983074e-05, + "loss": 0.18585129976272582, + "step": 165170 + }, + { + "epoch": 0.709152262950465, + "grad_norm": 0.039528731256723404, + "learning_rate": 2.921061028086545e-05, + "loss": 0.1763898491859436, + "step": 165180 + }, + { + "epoch": 0.709195195040485, + "grad_norm": 0.2774352729320526, + "learning_rate": 2.9206298560747825e-05, + "loss": 0.38413708209991454, + "step": 165190 + }, + { + "epoch": 0.7092381271305049, + "grad_norm": 0.30526450276374817, + "learning_rate": 2.9201986840630202e-05, + "loss": 0.09335086345672608, + "step": 165200 + }, + { + "epoch": 0.709281059220525, + "grad_norm": 0.009867934510111809, + "learning_rate": 2.919767512051258e-05, + "loss": 0.09516159892082214, + "step": 165210 + }, + { + "epoch": 0.709323991310545, + "grad_norm": 1.502676010131836, + "learning_rate": 2.9193363400394957e-05, + "loss": 0.3276165723800659, + "step": 165220 + }, + { + "epoch": 0.709366923400565, + "grad_norm": 0.23743776977062225, + "learning_rate": 2.918905168027733e-05, + "loss": 0.29257543087005616, + "step": 165230 + }, + { + "epoch": 0.709409855490585, + "grad_norm": 0.02360161580145359, + "learning_rate": 2.918473996015971e-05, + "loss": 0.2478564500808716, + "step": 165240 + }, + { + "epoch": 0.709452787580605, + "grad_norm": 0.02930424176156521, + "learning_rate": 2.9180428240042086e-05, + "loss": 0.049641406536102294, + "step": 165250 + }, + { + "epoch": 0.709495719670625, + "grad_norm": 1.6508783102035522, + "learning_rate": 2.9176116519924463e-05, + "loss": 0.269071102142334, + "step": 165260 + }, + { + "epoch": 0.709538651760645, + "grad_norm": 1.4661870002746582, + "learning_rate": 2.9171804799806834e-05, + "loss": 0.33371987342834475, + "step": 165270 + }, + { + "epoch": 0.709581583850665, + "grad_norm": 4.903928756713867, + "learning_rate": 2.916749307968921e-05, + "loss": 0.3156555652618408, + "step": 165280 + }, + { + "epoch": 0.709624515940685, + "grad_norm": 0.002806802047416568, + "learning_rate": 2.916318135957159e-05, + "loss": 0.2036449432373047, + "step": 165290 + }, + { + "epoch": 0.709667448030705, + "grad_norm": 3.3624939918518066, + "learning_rate": 2.9158869639453966e-05, + "loss": 0.3532872200012207, + "step": 165300 + }, + { + "epoch": 0.7097103801207251, + "grad_norm": 1.2719906568527222, + "learning_rate": 2.915455791933634e-05, + "loss": 0.3986253023147583, + "step": 165310 + }, + { + "epoch": 0.709753312210745, + "grad_norm": 1.8043235540390015, + "learning_rate": 2.9150246199218717e-05, + "loss": 0.0949524998664856, + "step": 165320 + }, + { + "epoch": 0.709796244300765, + "grad_norm": 0.05128329247236252, + "learning_rate": 2.9145934479101094e-05, + "loss": 0.09327793717384339, + "step": 165330 + }, + { + "epoch": 0.7098391763907851, + "grad_norm": 0.5460872650146484, + "learning_rate": 2.9141622758983472e-05, + "loss": 0.3752920389175415, + "step": 165340 + }, + { + "epoch": 0.709882108480805, + "grad_norm": 0.19574710726737976, + "learning_rate": 2.9137311038865846e-05, + "loss": 0.20312044620513917, + "step": 165350 + }, + { + "epoch": 0.7099250405708251, + "grad_norm": 47.620269775390625, + "learning_rate": 2.9132999318748223e-05, + "loss": 0.19353847503662108, + "step": 165360 + }, + { + "epoch": 0.7099679726608451, + "grad_norm": 0.05232425034046173, + "learning_rate": 2.91286875986306e-05, + "loss": 0.08786578178405761, + "step": 165370 + }, + { + "epoch": 0.710010904750865, + "grad_norm": 2.8413853645324707, + "learning_rate": 2.9124375878512978e-05, + "loss": 0.3481892108917236, + "step": 165380 + }, + { + "epoch": 0.7100538368408851, + "grad_norm": 0.023912513628602028, + "learning_rate": 2.9120064158395348e-05, + "loss": 0.10534037351608276, + "step": 165390 + }, + { + "epoch": 0.7100967689309051, + "grad_norm": 0.3025215268135071, + "learning_rate": 2.9115752438277726e-05, + "loss": 0.14979887008666992, + "step": 165400 + }, + { + "epoch": 0.710139701020925, + "grad_norm": 0.001987382536754012, + "learning_rate": 2.9111440718160103e-05, + "loss": 0.1420094847679138, + "step": 165410 + }, + { + "epoch": 0.7101826331109451, + "grad_norm": 2.2093968391418457, + "learning_rate": 2.9107128998042484e-05, + "loss": 0.35331382751464846, + "step": 165420 + }, + { + "epoch": 0.7102255652009651, + "grad_norm": 1.1964010000228882, + "learning_rate": 2.9102817277924854e-05, + "loss": 0.09625420570373536, + "step": 165430 + }, + { + "epoch": 0.7102684972909851, + "grad_norm": 0.9831269979476929, + "learning_rate": 2.909850555780723e-05, + "loss": 0.3276521682739258, + "step": 165440 + }, + { + "epoch": 0.7103114293810051, + "grad_norm": 15.865203857421875, + "learning_rate": 2.909419383768961e-05, + "loss": 0.16975079774856566, + "step": 165450 + }, + { + "epoch": 0.7103543614710252, + "grad_norm": 0.058152079582214355, + "learning_rate": 2.9089882117571986e-05, + "loss": 0.18160277605056763, + "step": 165460 + }, + { + "epoch": 0.7103972935610451, + "grad_norm": 0.013438289985060692, + "learning_rate": 2.908557039745436e-05, + "loss": 0.2488858461380005, + "step": 165470 + }, + { + "epoch": 0.7104402256510651, + "grad_norm": 1.8803297281265259, + "learning_rate": 2.9081258677336738e-05, + "loss": 0.12351391315460206, + "step": 165480 + }, + { + "epoch": 0.7104831577410852, + "grad_norm": 1.87388277053833, + "learning_rate": 2.9076946957219115e-05, + "loss": 0.386000919342041, + "step": 165490 + }, + { + "epoch": 0.7105260898311052, + "grad_norm": 1.3773218393325806, + "learning_rate": 2.9072635237101492e-05, + "loss": 0.2971598386764526, + "step": 165500 + }, + { + "epoch": 0.7105690219211251, + "grad_norm": 3.765920877456665, + "learning_rate": 2.906832351698387e-05, + "loss": 0.22443222999572754, + "step": 165510 + }, + { + "epoch": 0.7106119540111452, + "grad_norm": 0.012355818413197994, + "learning_rate": 2.906401179686624e-05, + "loss": 0.29129648208618164, + "step": 165520 + }, + { + "epoch": 0.7106548861011652, + "grad_norm": 0.0010706628672778606, + "learning_rate": 2.905970007674862e-05, + "loss": 0.00830424502491951, + "step": 165530 + }, + { + "epoch": 0.7106978181911852, + "grad_norm": 0.15785139799118042, + "learning_rate": 2.9055388356630998e-05, + "loss": 0.12713322639465333, + "step": 165540 + }, + { + "epoch": 0.7107407502812052, + "grad_norm": 0.6758171319961548, + "learning_rate": 2.9051076636513376e-05, + "loss": 0.09500041007995605, + "step": 165550 + }, + { + "epoch": 0.7107836823712252, + "grad_norm": 1.2952488660812378, + "learning_rate": 2.9046764916395746e-05, + "loss": 0.19718393087387084, + "step": 165560 + }, + { + "epoch": 0.7108266144612452, + "grad_norm": 0.0009880687575787306, + "learning_rate": 2.9042453196278124e-05, + "loss": 0.19167803525924682, + "step": 165570 + }, + { + "epoch": 0.7108695465512652, + "grad_norm": 1.7319165468215942, + "learning_rate": 2.90381414761605e-05, + "loss": 0.314529275894165, + "step": 165580 + }, + { + "epoch": 0.7109124786412853, + "grad_norm": 3.060222625732422, + "learning_rate": 2.9033829756042878e-05, + "loss": 0.1463207483291626, + "step": 165590 + }, + { + "epoch": 0.7109554107313052, + "grad_norm": 0.038725271821022034, + "learning_rate": 2.9029518035925252e-05, + "loss": 0.004723150655627251, + "step": 165600 + }, + { + "epoch": 0.7109983428213252, + "grad_norm": 0.01974060758948326, + "learning_rate": 2.902520631580763e-05, + "loss": 0.05358384251594543, + "step": 165610 + }, + { + "epoch": 0.7110412749113453, + "grad_norm": 0.04186383634805679, + "learning_rate": 2.9020894595690007e-05, + "loss": 0.40696163177490235, + "step": 165620 + }, + { + "epoch": 0.7110842070013652, + "grad_norm": 0.003584572346881032, + "learning_rate": 2.9016582875572384e-05, + "loss": 0.06248798370361328, + "step": 165630 + }, + { + "epoch": 0.7111271390913853, + "grad_norm": 0.0991358608007431, + "learning_rate": 2.9012271155454758e-05, + "loss": 0.17465416193008423, + "step": 165640 + }, + { + "epoch": 0.7111700711814053, + "grad_norm": 6.179220676422119, + "learning_rate": 2.9007959435337135e-05, + "loss": 0.09142645001411438, + "step": 165650 + }, + { + "epoch": 0.7112130032714252, + "grad_norm": 1.848460078239441, + "learning_rate": 2.9003647715219513e-05, + "loss": 0.34997422695159913, + "step": 165660 + }, + { + "epoch": 0.7112559353614453, + "grad_norm": 0.0017918201629072428, + "learning_rate": 2.899933599510189e-05, + "loss": 0.2364954710006714, + "step": 165670 + }, + { + "epoch": 0.7112988674514653, + "grad_norm": 0.001793616102077067, + "learning_rate": 2.899502427498426e-05, + "loss": 0.41597414016723633, + "step": 165680 + }, + { + "epoch": 0.7113417995414852, + "grad_norm": 5.766404628753662, + "learning_rate": 2.8990712554866638e-05, + "loss": 0.2759052038192749, + "step": 165690 + }, + { + "epoch": 0.7113847316315053, + "grad_norm": 7.550206184387207, + "learning_rate": 2.8986400834749015e-05, + "loss": 0.4413724422454834, + "step": 165700 + }, + { + "epoch": 0.7114276637215253, + "grad_norm": 0.03798282518982887, + "learning_rate": 2.8982089114631393e-05, + "loss": 0.25815589427948, + "step": 165710 + }, + { + "epoch": 0.7114705958115453, + "grad_norm": 1.352439284324646, + "learning_rate": 2.8977777394513767e-05, + "loss": 0.32106575965881345, + "step": 165720 + }, + { + "epoch": 0.7115135279015653, + "grad_norm": 0.06647659838199615, + "learning_rate": 2.8973465674396144e-05, + "loss": 0.1770651936531067, + "step": 165730 + }, + { + "epoch": 0.7115564599915853, + "grad_norm": 0.002012177137658, + "learning_rate": 2.896915395427852e-05, + "loss": 0.0860788345336914, + "step": 165740 + }, + { + "epoch": 0.7115993920816053, + "grad_norm": 0.09346366673707962, + "learning_rate": 2.89648422341609e-05, + "loss": 0.3049763679504395, + "step": 165750 + }, + { + "epoch": 0.7116423241716253, + "grad_norm": 0.029406042769551277, + "learning_rate": 2.8960530514043273e-05, + "loss": 0.12645034790039061, + "step": 165760 + }, + { + "epoch": 0.7116852562616454, + "grad_norm": 0.024466650560498238, + "learning_rate": 2.895621879392565e-05, + "loss": 0.1472999095916748, + "step": 165770 + }, + { + "epoch": 0.7117281883516653, + "grad_norm": 1.3930857181549072, + "learning_rate": 2.8951907073808027e-05, + "loss": 0.11919002532958985, + "step": 165780 + }, + { + "epoch": 0.7117711204416853, + "grad_norm": 0.050165556371212006, + "learning_rate": 2.8947595353690405e-05, + "loss": 0.09300928115844727, + "step": 165790 + }, + { + "epoch": 0.7118140525317054, + "grad_norm": 0.05946086719632149, + "learning_rate": 2.8943283633572775e-05, + "loss": 0.21979830265045167, + "step": 165800 + }, + { + "epoch": 0.7118569846217253, + "grad_norm": 0.49015337228775024, + "learning_rate": 2.8938971913455153e-05, + "loss": 0.14640580415725707, + "step": 165810 + }, + { + "epoch": 0.7118999167117454, + "grad_norm": 0.01640521176159382, + "learning_rate": 2.893466019333753e-05, + "loss": 0.26482203006744387, + "step": 165820 + }, + { + "epoch": 0.7119428488017654, + "grad_norm": 0.7002899050712585, + "learning_rate": 2.893034847321991e-05, + "loss": 0.19143946170806886, + "step": 165830 + }, + { + "epoch": 0.7119857808917853, + "grad_norm": 3.6404550075531006, + "learning_rate": 2.892603675310228e-05, + "loss": 0.2002251148223877, + "step": 165840 + }, + { + "epoch": 0.7120287129818054, + "grad_norm": 1.8261346817016602, + "learning_rate": 2.892172503298466e-05, + "loss": 0.16150326728820802, + "step": 165850 + }, + { + "epoch": 0.7120716450718254, + "grad_norm": 0.020639711990952492, + "learning_rate": 2.8917413312867036e-05, + "loss": 0.3025749683380127, + "step": 165860 + }, + { + "epoch": 0.7121145771618453, + "grad_norm": 2.7759740352630615, + "learning_rate": 2.8913101592749413e-05, + "loss": 0.24660980701446533, + "step": 165870 + }, + { + "epoch": 0.7121575092518654, + "grad_norm": 0.053507398813962936, + "learning_rate": 2.890878987263179e-05, + "loss": 0.1749052047729492, + "step": 165880 + }, + { + "epoch": 0.7122004413418854, + "grad_norm": 2.1792287826538086, + "learning_rate": 2.8904478152514165e-05, + "loss": 0.2586753129959106, + "step": 165890 + }, + { + "epoch": 0.7122433734319054, + "grad_norm": 0.00457863649353385, + "learning_rate": 2.8900166432396542e-05, + "loss": 0.24560930728912353, + "step": 165900 + }, + { + "epoch": 0.7122863055219254, + "grad_norm": 5.7979912757873535, + "learning_rate": 2.889585471227892e-05, + "loss": 0.08135615587234497, + "step": 165910 + }, + { + "epoch": 0.7123292376119454, + "grad_norm": 0.07997957617044449, + "learning_rate": 2.8891542992161297e-05, + "loss": 0.14295884370803832, + "step": 165920 + }, + { + "epoch": 0.7123721697019655, + "grad_norm": 0.05502615496516228, + "learning_rate": 2.8887231272043667e-05, + "loss": 0.15717185735702516, + "step": 165930 + }, + { + "epoch": 0.7124151017919854, + "grad_norm": 0.10012836754322052, + "learning_rate": 2.8882919551926048e-05, + "loss": 0.09158955216407776, + "step": 165940 + }, + { + "epoch": 0.7124580338820055, + "grad_norm": 0.03289559483528137, + "learning_rate": 2.8878607831808425e-05, + "loss": 0.17381430864334108, + "step": 165950 + }, + { + "epoch": 0.7125009659720255, + "grad_norm": 8.307626724243164, + "learning_rate": 2.8874296111690803e-05, + "loss": 0.293519926071167, + "step": 165960 + }, + { + "epoch": 0.7125438980620454, + "grad_norm": 0.008939497172832489, + "learning_rate": 2.8869984391573173e-05, + "loss": 0.009337369352579117, + "step": 165970 + }, + { + "epoch": 0.7125868301520655, + "grad_norm": 2.4467132091522217, + "learning_rate": 2.886567267145555e-05, + "loss": 0.31867618560791017, + "step": 165980 + }, + { + "epoch": 0.7126297622420855, + "grad_norm": 6.164371967315674, + "learning_rate": 2.8861360951337928e-05, + "loss": 0.3132692098617554, + "step": 165990 + }, + { + "epoch": 0.7126726943321054, + "grad_norm": 0.002331892494112253, + "learning_rate": 2.8857049231220305e-05, + "loss": 0.08976657390594482, + "step": 166000 + }, + { + "epoch": 0.7126726943321054, + "eval_loss": 0.3927363455295563, + "eval_runtime": 27.1077, + "eval_samples_per_second": 3.689, + "eval_steps_per_second": 3.689, + "step": 166000 + }, + { + "epoch": 0.7127156264221255, + "grad_norm": 0.13965459167957306, + "learning_rate": 2.885273751110268e-05, + "loss": 0.19368773698806763, + "step": 166010 + }, + { + "epoch": 0.7127585585121455, + "grad_norm": 0.08737999945878983, + "learning_rate": 2.8848425790985057e-05, + "loss": 0.2531174421310425, + "step": 166020 + }, + { + "epoch": 0.7128014906021655, + "grad_norm": 0.1749696582555771, + "learning_rate": 2.8844114070867434e-05, + "loss": 0.18520824909210204, + "step": 166030 + }, + { + "epoch": 0.7128444226921855, + "grad_norm": 0.03882657736539841, + "learning_rate": 2.883980235074981e-05, + "loss": 0.270503306388855, + "step": 166040 + }, + { + "epoch": 0.7128873547822056, + "grad_norm": 2.385289192199707, + "learning_rate": 2.8835490630632185e-05, + "loss": 0.21199922561645507, + "step": 166050 + }, + { + "epoch": 0.7129302868722255, + "grad_norm": 0.26386263966560364, + "learning_rate": 2.8831178910514562e-05, + "loss": 0.07489157319068909, + "step": 166060 + }, + { + "epoch": 0.7129732189622455, + "grad_norm": 0.4560233950614929, + "learning_rate": 2.882686719039694e-05, + "loss": 0.13342225551605225, + "step": 166070 + }, + { + "epoch": 0.7130161510522656, + "grad_norm": 0.0036526834592223167, + "learning_rate": 2.8822555470279317e-05, + "loss": 0.15702961683273314, + "step": 166080 + }, + { + "epoch": 0.7130590831422855, + "grad_norm": 1.793468952178955, + "learning_rate": 2.8818243750161688e-05, + "loss": 0.31508579254150393, + "step": 166090 + }, + { + "epoch": 0.7131020152323055, + "grad_norm": 1.5543371438980103, + "learning_rate": 2.8813932030044065e-05, + "loss": 0.10623435974121094, + "step": 166100 + }, + { + "epoch": 0.7131449473223256, + "grad_norm": 2.2663276195526123, + "learning_rate": 2.8809620309926442e-05, + "loss": 0.28469769954681395, + "step": 166110 + }, + { + "epoch": 0.7131878794123455, + "grad_norm": 2.3514790534973145, + "learning_rate": 2.8805308589808823e-05, + "loss": 0.28321788311004636, + "step": 166120 + }, + { + "epoch": 0.7132308115023656, + "grad_norm": 0.03398464620113373, + "learning_rate": 2.8800996869691194e-05, + "loss": 0.18461548089981078, + "step": 166130 + }, + { + "epoch": 0.7132737435923856, + "grad_norm": 1.426744818687439, + "learning_rate": 2.879668514957357e-05, + "loss": 0.2839787244796753, + "step": 166140 + }, + { + "epoch": 0.7133166756824055, + "grad_norm": 0.770117998123169, + "learning_rate": 2.879237342945595e-05, + "loss": 0.20739870071411132, + "step": 166150 + }, + { + "epoch": 0.7133596077724256, + "grad_norm": 0.002078356221318245, + "learning_rate": 2.8788061709338326e-05, + "loss": 0.1516830563545227, + "step": 166160 + }, + { + "epoch": 0.7134025398624456, + "grad_norm": 0.013807138428092003, + "learning_rate": 2.87837499892207e-05, + "loss": 0.10811216831207275, + "step": 166170 + }, + { + "epoch": 0.7134454719524655, + "grad_norm": 2.072737216949463, + "learning_rate": 2.8779438269103077e-05, + "loss": 0.16044397354125978, + "step": 166180 + }, + { + "epoch": 0.7134884040424856, + "grad_norm": 0.0021444791927933693, + "learning_rate": 2.8775126548985454e-05, + "loss": 0.16015149354934693, + "step": 166190 + }, + { + "epoch": 0.7135313361325056, + "grad_norm": 0.3627009689807892, + "learning_rate": 2.8770814828867832e-05, + "loss": 0.09903419613838196, + "step": 166200 + }, + { + "epoch": 0.7135742682225256, + "grad_norm": 0.26428207755088806, + "learning_rate": 2.8766503108750202e-05, + "loss": 0.07487475872039795, + "step": 166210 + }, + { + "epoch": 0.7136172003125456, + "grad_norm": 0.0420398935675621, + "learning_rate": 2.876219138863258e-05, + "loss": 0.017874059081077576, + "step": 166220 + }, + { + "epoch": 0.7136601324025657, + "grad_norm": 0.0023280770983546972, + "learning_rate": 2.875787966851496e-05, + "loss": 0.04525628387928009, + "step": 166230 + }, + { + "epoch": 0.7137030644925856, + "grad_norm": 0.670574426651001, + "learning_rate": 2.8753567948397338e-05, + "loss": 0.2393495559692383, + "step": 166240 + }, + { + "epoch": 0.7137459965826056, + "grad_norm": 0.6433839797973633, + "learning_rate": 2.8749256228279715e-05, + "loss": 0.19196040630340577, + "step": 166250 + }, + { + "epoch": 0.7137889286726257, + "grad_norm": 1.1744970083236694, + "learning_rate": 2.8744944508162086e-05, + "loss": 0.17194961309432982, + "step": 166260 + }, + { + "epoch": 0.7138318607626456, + "grad_norm": 0.07919386029243469, + "learning_rate": 2.8740632788044463e-05, + "loss": 0.3788050651550293, + "step": 166270 + }, + { + "epoch": 0.7138747928526656, + "grad_norm": 0.006081217434257269, + "learning_rate": 2.873632106792684e-05, + "loss": 0.2530497074127197, + "step": 166280 + }, + { + "epoch": 0.7139177249426857, + "grad_norm": 0.006908032111823559, + "learning_rate": 2.8732009347809218e-05, + "loss": 0.1852771282196045, + "step": 166290 + }, + { + "epoch": 0.7139606570327056, + "grad_norm": 8.477197647094727, + "learning_rate": 2.872769762769159e-05, + "loss": 0.18325682878494262, + "step": 166300 + }, + { + "epoch": 0.7140035891227257, + "grad_norm": 0.004288307391107082, + "learning_rate": 2.872338590757397e-05, + "loss": 0.31393725872039796, + "step": 166310 + }, + { + "epoch": 0.7140465212127457, + "grad_norm": 0.6741523742675781, + "learning_rate": 2.8719074187456346e-05, + "loss": 0.08937577605247497, + "step": 166320 + }, + { + "epoch": 0.7140894533027656, + "grad_norm": 0.008980872109532356, + "learning_rate": 2.8714762467338724e-05, + "loss": 0.26209454536437987, + "step": 166330 + }, + { + "epoch": 0.7141323853927857, + "grad_norm": 7.205010414123535, + "learning_rate": 2.8710450747221098e-05, + "loss": 0.30361227989196776, + "step": 166340 + }, + { + "epoch": 0.7141753174828057, + "grad_norm": 0.00631917966529727, + "learning_rate": 2.8706139027103475e-05, + "loss": 0.060678571462631226, + "step": 166350 + }, + { + "epoch": 0.7142182495728258, + "grad_norm": 3.5636608600616455, + "learning_rate": 2.8701827306985852e-05, + "loss": 0.20951218605041505, + "step": 166360 + }, + { + "epoch": 0.7142611816628457, + "grad_norm": 4.244123935699463, + "learning_rate": 2.869751558686823e-05, + "loss": 0.0760481059551239, + "step": 166370 + }, + { + "epoch": 0.7143041137528657, + "grad_norm": 0.03286031633615494, + "learning_rate": 2.86932038667506e-05, + "loss": 0.0744013249874115, + "step": 166380 + }, + { + "epoch": 0.7143470458428858, + "grad_norm": 0.016204357147216797, + "learning_rate": 2.8688892146632978e-05, + "loss": 0.0846706509590149, + "step": 166390 + }, + { + "epoch": 0.7143899779329057, + "grad_norm": 0.04532900080084801, + "learning_rate": 2.8684580426515355e-05, + "loss": 0.1527009963989258, + "step": 166400 + }, + { + "epoch": 0.7144329100229257, + "grad_norm": 0.5035681128501892, + "learning_rate": 2.8680268706397732e-05, + "loss": 0.44489264488220215, + "step": 166410 + }, + { + "epoch": 0.7144758421129458, + "grad_norm": 0.1924525499343872, + "learning_rate": 2.8675956986280106e-05, + "loss": 0.15166373252868653, + "step": 166420 + }, + { + "epoch": 0.7145187742029657, + "grad_norm": 0.011877103708684444, + "learning_rate": 2.8671645266162484e-05, + "loss": 0.062304210662841794, + "step": 166430 + }, + { + "epoch": 0.7145617062929858, + "grad_norm": 0.0012645673705264926, + "learning_rate": 2.866733354604486e-05, + "loss": 0.15559821128845214, + "step": 166440 + }, + { + "epoch": 0.7146046383830058, + "grad_norm": 0.17878887057304382, + "learning_rate": 2.8663021825927238e-05, + "loss": 0.33294291496276857, + "step": 166450 + }, + { + "epoch": 0.7146475704730257, + "grad_norm": 2.671096086502075, + "learning_rate": 2.8658710105809612e-05, + "loss": 0.5303155422210694, + "step": 166460 + }, + { + "epoch": 0.7146905025630458, + "grad_norm": 0.07435199618339539, + "learning_rate": 2.865439838569199e-05, + "loss": 0.12739841938018798, + "step": 166470 + }, + { + "epoch": 0.7147334346530658, + "grad_norm": 0.1150457039475441, + "learning_rate": 2.8650086665574367e-05, + "loss": 0.21583008766174316, + "step": 166480 + }, + { + "epoch": 0.7147763667430858, + "grad_norm": 0.08370175957679749, + "learning_rate": 2.8645774945456744e-05, + "loss": 0.07304016947746277, + "step": 166490 + }, + { + "epoch": 0.7148192988331058, + "grad_norm": 0.4779481887817383, + "learning_rate": 2.8641463225339115e-05, + "loss": 0.07035287618637084, + "step": 166500 + }, + { + "epoch": 0.7148622309231258, + "grad_norm": 0.027747897431254387, + "learning_rate": 2.8637151505221492e-05, + "loss": 0.15281134843826294, + "step": 166510 + }, + { + "epoch": 0.7149051630131458, + "grad_norm": 1.3348201513290405, + "learning_rate": 2.863283978510387e-05, + "loss": 0.23680105209350585, + "step": 166520 + }, + { + "epoch": 0.7149480951031658, + "grad_norm": 2.2081685066223145, + "learning_rate": 2.862852806498625e-05, + "loss": 0.20437359809875488, + "step": 166530 + }, + { + "epoch": 0.7149910271931859, + "grad_norm": 0.09051236510276794, + "learning_rate": 2.862421634486862e-05, + "loss": 0.11743265390396118, + "step": 166540 + }, + { + "epoch": 0.7150339592832058, + "grad_norm": 0.0025285957381129265, + "learning_rate": 2.8619904624750998e-05, + "loss": 0.10214493274688721, + "step": 166550 + }, + { + "epoch": 0.7150768913732258, + "grad_norm": 0.006848174147307873, + "learning_rate": 2.8615592904633375e-05, + "loss": 0.09545931220054626, + "step": 166560 + }, + { + "epoch": 0.7151198234632459, + "grad_norm": 2.7628366947174072, + "learning_rate": 2.8611281184515753e-05, + "loss": 0.37620038986206056, + "step": 166570 + }, + { + "epoch": 0.7151627555532658, + "grad_norm": 0.002312214346602559, + "learning_rate": 2.8606969464398127e-05, + "loss": 0.3035972356796265, + "step": 166580 + }, + { + "epoch": 0.7152056876432858, + "grad_norm": 0.008850247599184513, + "learning_rate": 2.8602657744280504e-05, + "loss": 0.21454875469207763, + "step": 166590 + }, + { + "epoch": 0.7152486197333059, + "grad_norm": 3.854825973510742, + "learning_rate": 2.859834602416288e-05, + "loss": 0.29339096546173093, + "step": 166600 + }, + { + "epoch": 0.7152915518233258, + "grad_norm": 0.004344264045357704, + "learning_rate": 2.859403430404526e-05, + "loss": 0.2869076728820801, + "step": 166610 + }, + { + "epoch": 0.7153344839133459, + "grad_norm": 1.5101947784423828, + "learning_rate": 2.8589722583927636e-05, + "loss": 0.26342000961303713, + "step": 166620 + }, + { + "epoch": 0.7153774160033659, + "grad_norm": 0.17990095913410187, + "learning_rate": 2.8585410863810007e-05, + "loss": 0.2827131986618042, + "step": 166630 + }, + { + "epoch": 0.7154203480933858, + "grad_norm": 0.019368786364793777, + "learning_rate": 2.8581099143692387e-05, + "loss": 0.16794705390930176, + "step": 166640 + }, + { + "epoch": 0.7154632801834059, + "grad_norm": 0.004532191436737776, + "learning_rate": 2.8576787423574765e-05, + "loss": 0.24334888458251952, + "step": 166650 + }, + { + "epoch": 0.7155062122734259, + "grad_norm": 0.006376985460519791, + "learning_rate": 2.8572475703457142e-05, + "loss": 0.2586236000061035, + "step": 166660 + }, + { + "epoch": 0.7155491443634459, + "grad_norm": 0.15656788647174835, + "learning_rate": 2.8568163983339513e-05, + "loss": 0.23142714500427247, + "step": 166670 + }, + { + "epoch": 0.7155920764534659, + "grad_norm": 0.019105859100818634, + "learning_rate": 2.856385226322189e-05, + "loss": 0.18950977325439453, + "step": 166680 + }, + { + "epoch": 0.7156350085434859, + "grad_norm": 0.009699470363557339, + "learning_rate": 2.8559540543104267e-05, + "loss": 0.4047813892364502, + "step": 166690 + }, + { + "epoch": 0.7156779406335059, + "grad_norm": 0.13277097046375275, + "learning_rate": 2.8555228822986645e-05, + "loss": 0.06402959227561951, + "step": 166700 + }, + { + "epoch": 0.7157208727235259, + "grad_norm": 2.5981411933898926, + "learning_rate": 2.855091710286902e-05, + "loss": 0.36272387504577636, + "step": 166710 + }, + { + "epoch": 0.715763804813546, + "grad_norm": 0.5918976068496704, + "learning_rate": 2.8546605382751396e-05, + "loss": 0.22860181331634521, + "step": 166720 + }, + { + "epoch": 0.7158067369035659, + "grad_norm": 0.04410221800208092, + "learning_rate": 2.8542293662633773e-05, + "loss": 0.24485437870025634, + "step": 166730 + }, + { + "epoch": 0.7158496689935859, + "grad_norm": 0.009131097234785557, + "learning_rate": 2.853798194251615e-05, + "loss": 0.060953164100646974, + "step": 166740 + }, + { + "epoch": 0.715892601083606, + "grad_norm": 0.011584590189158916, + "learning_rate": 2.8533670222398525e-05, + "loss": 0.28687229156494143, + "step": 166750 + }, + { + "epoch": 0.7159355331736259, + "grad_norm": 3.6411337852478027, + "learning_rate": 2.8529358502280902e-05, + "loss": 0.3922334432601929, + "step": 166760 + }, + { + "epoch": 0.715978465263646, + "grad_norm": 0.13134698569774628, + "learning_rate": 2.852504678216328e-05, + "loss": 0.17828294038772582, + "step": 166770 + }, + { + "epoch": 0.716021397353666, + "grad_norm": 0.009131759405136108, + "learning_rate": 2.8520735062045657e-05, + "loss": 0.2095499277114868, + "step": 166780 + }, + { + "epoch": 0.716064329443686, + "grad_norm": 0.007369739469140768, + "learning_rate": 2.8516423341928027e-05, + "loss": 0.19556423425674438, + "step": 166790 + }, + { + "epoch": 0.716107261533706, + "grad_norm": 1.3228790760040283, + "learning_rate": 2.8512111621810405e-05, + "loss": 0.14940618276596068, + "step": 166800 + }, + { + "epoch": 0.716150193623726, + "grad_norm": 2.1382687091827393, + "learning_rate": 2.8507799901692782e-05, + "loss": 0.08745411634445191, + "step": 166810 + }, + { + "epoch": 0.716193125713746, + "grad_norm": 2.0986907482147217, + "learning_rate": 2.850348818157516e-05, + "loss": 0.18943517208099364, + "step": 166820 + }, + { + "epoch": 0.716236057803766, + "grad_norm": 0.01945311203598976, + "learning_rate": 2.8499176461457533e-05, + "loss": 0.2173239231109619, + "step": 166830 + }, + { + "epoch": 0.716278989893786, + "grad_norm": 0.037285711616277695, + "learning_rate": 2.849486474133991e-05, + "loss": 0.06609262228012085, + "step": 166840 + }, + { + "epoch": 0.7163219219838061, + "grad_norm": 5.157741546630859, + "learning_rate": 2.8490553021222288e-05, + "loss": 0.22957515716552734, + "step": 166850 + }, + { + "epoch": 0.716364854073826, + "grad_norm": 0.5119006037712097, + "learning_rate": 2.8486241301104665e-05, + "loss": 0.1565529465675354, + "step": 166860 + }, + { + "epoch": 0.716407786163846, + "grad_norm": 0.3631255030632019, + "learning_rate": 2.848192958098704e-05, + "loss": 0.050896257162094116, + "step": 166870 + }, + { + "epoch": 0.7164507182538661, + "grad_norm": 2.3672053813934326, + "learning_rate": 2.8477617860869417e-05, + "loss": 0.24461157321929933, + "step": 166880 + }, + { + "epoch": 0.716493650343886, + "grad_norm": 0.41185131669044495, + "learning_rate": 2.8473306140751794e-05, + "loss": 0.24481289386749266, + "step": 166890 + }, + { + "epoch": 0.716536582433906, + "grad_norm": 0.0012526832288131118, + "learning_rate": 2.846899442063417e-05, + "loss": 0.007355506718158722, + "step": 166900 + }, + { + "epoch": 0.7165795145239261, + "grad_norm": 1.2811726331710815, + "learning_rate": 2.8464682700516542e-05, + "loss": 0.10153491497039795, + "step": 166910 + }, + { + "epoch": 0.716622446613946, + "grad_norm": 0.5895275473594666, + "learning_rate": 2.846037098039892e-05, + "loss": 0.18279372453689574, + "step": 166920 + }, + { + "epoch": 0.7166653787039661, + "grad_norm": 0.0027312906458973885, + "learning_rate": 2.8456059260281296e-05, + "loss": 0.19183812141418458, + "step": 166930 + }, + { + "epoch": 0.7167083107939861, + "grad_norm": 0.29518353939056396, + "learning_rate": 2.8451747540163677e-05, + "loss": 0.05575355291366577, + "step": 166940 + }, + { + "epoch": 0.716751242884006, + "grad_norm": 0.8758601546287537, + "learning_rate": 2.8447435820046048e-05, + "loss": 0.22281897068023682, + "step": 166950 + }, + { + "epoch": 0.7167941749740261, + "grad_norm": 0.1582789421081543, + "learning_rate": 2.8443124099928425e-05, + "loss": 0.16287997961044312, + "step": 166960 + }, + { + "epoch": 0.7168371070640461, + "grad_norm": 4.892996311187744, + "learning_rate": 2.8438812379810802e-05, + "loss": 0.4806065082550049, + "step": 166970 + }, + { + "epoch": 0.7168800391540661, + "grad_norm": 0.44628673791885376, + "learning_rate": 2.843450065969318e-05, + "loss": 0.26183133125305175, + "step": 166980 + }, + { + "epoch": 0.7169229712440861, + "grad_norm": 0.02352777309715748, + "learning_rate": 2.8430188939575557e-05, + "loss": 0.2532444953918457, + "step": 166990 + }, + { + "epoch": 0.7169659033341061, + "grad_norm": 0.03716174140572548, + "learning_rate": 2.842587721945793e-05, + "loss": 0.08634965419769287, + "step": 167000 + }, + { + "epoch": 0.7169659033341061, + "eval_loss": 0.3958474099636078, + "eval_runtime": 27.1467, + "eval_samples_per_second": 3.684, + "eval_steps_per_second": 3.684, + "step": 167000 + }, + { + "epoch": 0.7170088354241261, + "grad_norm": 0.0032118239905685186, + "learning_rate": 2.842156549934031e-05, + "loss": 0.10042769908905029, + "step": 167010 + }, + { + "epoch": 0.7170517675141461, + "grad_norm": 0.02116282656788826, + "learning_rate": 2.8417253779222686e-05, + "loss": 0.12349098920822144, + "step": 167020 + }, + { + "epoch": 0.7170946996041662, + "grad_norm": 0.1639430820941925, + "learning_rate": 2.8412942059105063e-05, + "loss": 0.4242201328277588, + "step": 167030 + }, + { + "epoch": 0.7171376316941861, + "grad_norm": 0.9448293447494507, + "learning_rate": 2.8408630338987434e-05, + "loss": 0.2647895574569702, + "step": 167040 + }, + { + "epoch": 0.7171805637842061, + "grad_norm": 0.019060397520661354, + "learning_rate": 2.8404318618869814e-05, + "loss": 0.27194387912750245, + "step": 167050 + }, + { + "epoch": 0.7172234958742262, + "grad_norm": 1.0550918579101562, + "learning_rate": 2.8400006898752192e-05, + "loss": 0.02531105875968933, + "step": 167060 + }, + { + "epoch": 0.7172664279642461, + "grad_norm": 0.12561658024787903, + "learning_rate": 2.839569517863457e-05, + "loss": 0.19044885635375977, + "step": 167070 + }, + { + "epoch": 0.7173093600542662, + "grad_norm": 5.141531944274902, + "learning_rate": 2.839138345851694e-05, + "loss": 0.3282967805862427, + "step": 167080 + }, + { + "epoch": 0.7173522921442862, + "grad_norm": 0.06956620514392853, + "learning_rate": 2.8387071738399317e-05, + "loss": 0.12983059883117676, + "step": 167090 + }, + { + "epoch": 0.7173952242343061, + "grad_norm": 0.19691401720046997, + "learning_rate": 2.8382760018281694e-05, + "loss": 0.20943801403045653, + "step": 167100 + }, + { + "epoch": 0.7174381563243262, + "grad_norm": 0.09076624363660812, + "learning_rate": 2.837844829816407e-05, + "loss": 0.17342876195907592, + "step": 167110 + }, + { + "epoch": 0.7174810884143462, + "grad_norm": 0.10245295614004135, + "learning_rate": 2.8374136578046446e-05, + "loss": 0.06132156252861023, + "step": 167120 + }, + { + "epoch": 0.7175240205043661, + "grad_norm": 1.7344605922698975, + "learning_rate": 2.8369824857928823e-05, + "loss": 0.2766872882843018, + "step": 167130 + }, + { + "epoch": 0.7175669525943862, + "grad_norm": 0.07946517318487167, + "learning_rate": 2.83655131378112e-05, + "loss": 0.1935239315032959, + "step": 167140 + }, + { + "epoch": 0.7176098846844062, + "grad_norm": 0.002123386599123478, + "learning_rate": 2.8361201417693578e-05, + "loss": 0.3540096998214722, + "step": 167150 + }, + { + "epoch": 0.7176528167744262, + "grad_norm": 0.0639406368136406, + "learning_rate": 2.835688969757595e-05, + "loss": 0.2834771633148193, + "step": 167160 + }, + { + "epoch": 0.7176957488644462, + "grad_norm": 7.1676177978515625, + "learning_rate": 2.835257797745833e-05, + "loss": 0.14616410732269286, + "step": 167170 + }, + { + "epoch": 0.7177386809544662, + "grad_norm": 0.38563501834869385, + "learning_rate": 2.8348266257340706e-05, + "loss": 0.22452714443206787, + "step": 167180 + }, + { + "epoch": 0.7177816130444862, + "grad_norm": 0.043906208127737045, + "learning_rate": 2.8343954537223084e-05, + "loss": 0.10431742668151855, + "step": 167190 + }, + { + "epoch": 0.7178245451345062, + "grad_norm": 0.1342371553182602, + "learning_rate": 2.8339642817105454e-05, + "loss": 0.20088469982147217, + "step": 167200 + }, + { + "epoch": 0.7178674772245263, + "grad_norm": 0.13105660676956177, + "learning_rate": 2.833533109698783e-05, + "loss": 0.19219967126846313, + "step": 167210 + }, + { + "epoch": 0.7179104093145463, + "grad_norm": 0.023251548409461975, + "learning_rate": 2.833101937687021e-05, + "loss": 0.14668450355529786, + "step": 167220 + }, + { + "epoch": 0.7179533414045662, + "grad_norm": 0.038987189531326294, + "learning_rate": 2.832670765675259e-05, + "loss": 0.22458341121673583, + "step": 167230 + }, + { + "epoch": 0.7179962734945863, + "grad_norm": 1.144334077835083, + "learning_rate": 2.832239593663496e-05, + "loss": 0.412811803817749, + "step": 167240 + }, + { + "epoch": 0.7180392055846063, + "grad_norm": 1.0596046447753906, + "learning_rate": 2.8318084216517338e-05, + "loss": 0.29309654235839844, + "step": 167250 + }, + { + "epoch": 0.7180821376746263, + "grad_norm": 0.005872590467333794, + "learning_rate": 2.8313772496399715e-05, + "loss": 0.17906821966171266, + "step": 167260 + }, + { + "epoch": 0.7181250697646463, + "grad_norm": 0.005974804516881704, + "learning_rate": 2.8309460776282092e-05, + "loss": 0.05527445077896118, + "step": 167270 + }, + { + "epoch": 0.7181680018546663, + "grad_norm": 0.6632001399993896, + "learning_rate": 2.8305149056164466e-05, + "loss": 0.16342580318450928, + "step": 167280 + }, + { + "epoch": 0.7182109339446863, + "grad_norm": 1.1039843559265137, + "learning_rate": 2.8300837336046844e-05, + "loss": 0.1806264877319336, + "step": 167290 + }, + { + "epoch": 0.7182538660347063, + "grad_norm": 3.905797243118286, + "learning_rate": 2.829652561592922e-05, + "loss": 0.24257402420043944, + "step": 167300 + }, + { + "epoch": 0.7182967981247264, + "grad_norm": 1.0418668985366821, + "learning_rate": 2.8292213895811598e-05, + "loss": 0.2584467887878418, + "step": 167310 + }, + { + "epoch": 0.7183397302147463, + "grad_norm": 0.009823929518461227, + "learning_rate": 2.828790217569397e-05, + "loss": 0.21770334243774414, + "step": 167320 + }, + { + "epoch": 0.7183826623047663, + "grad_norm": 1.0369503498077393, + "learning_rate": 2.8283590455576346e-05, + "loss": 0.20086488723754883, + "step": 167330 + }, + { + "epoch": 0.7184255943947864, + "grad_norm": 0.02220025099813938, + "learning_rate": 2.8279278735458727e-05, + "loss": 0.006128740310668945, + "step": 167340 + }, + { + "epoch": 0.7184685264848063, + "grad_norm": 0.035444118082523346, + "learning_rate": 2.8274967015341104e-05, + "loss": 0.09826570153236389, + "step": 167350 + }, + { + "epoch": 0.7185114585748263, + "grad_norm": 0.006482461001724005, + "learning_rate": 2.827065529522348e-05, + "loss": 0.14989885091781616, + "step": 167360 + }, + { + "epoch": 0.7185543906648464, + "grad_norm": 7.3291401863098145, + "learning_rate": 2.8266343575105852e-05, + "loss": 0.30905053615570066, + "step": 167370 + }, + { + "epoch": 0.7185973227548663, + "grad_norm": 3.8639028072357178, + "learning_rate": 2.826203185498823e-05, + "loss": 0.13532700538635253, + "step": 167380 + }, + { + "epoch": 0.7186402548448864, + "grad_norm": 0.2876453101634979, + "learning_rate": 2.8257720134870607e-05, + "loss": 0.1640407085418701, + "step": 167390 + }, + { + "epoch": 0.7186831869349064, + "grad_norm": 1.3986937999725342, + "learning_rate": 2.8253408414752984e-05, + "loss": 0.3686722755432129, + "step": 167400 + }, + { + "epoch": 0.7187261190249263, + "grad_norm": 0.004579016473144293, + "learning_rate": 2.8249096694635358e-05, + "loss": 0.2243633270263672, + "step": 167410 + }, + { + "epoch": 0.7187690511149464, + "grad_norm": 1.4218645095825195, + "learning_rate": 2.8244784974517735e-05, + "loss": 0.07126256227493286, + "step": 167420 + }, + { + "epoch": 0.7188119832049664, + "grad_norm": 0.06639666855335236, + "learning_rate": 2.8240473254400113e-05, + "loss": 0.2880155086517334, + "step": 167430 + }, + { + "epoch": 0.7188549152949864, + "grad_norm": 1.2099025249481201, + "learning_rate": 2.823616153428249e-05, + "loss": 0.22116010189056395, + "step": 167440 + }, + { + "epoch": 0.7188978473850064, + "grad_norm": 0.001034542452543974, + "learning_rate": 2.8231849814164864e-05, + "loss": 0.18301440477371217, + "step": 167450 + }, + { + "epoch": 0.7189407794750264, + "grad_norm": 0.07826003432273865, + "learning_rate": 2.822753809404724e-05, + "loss": 0.19326547384262086, + "step": 167460 + }, + { + "epoch": 0.7189837115650464, + "grad_norm": 1.5844489336013794, + "learning_rate": 2.822322637392962e-05, + "loss": 0.3147656202316284, + "step": 167470 + }, + { + "epoch": 0.7190266436550664, + "grad_norm": 0.7816833257675171, + "learning_rate": 2.8218914653811996e-05, + "loss": 0.17998864650726318, + "step": 167480 + }, + { + "epoch": 0.7190695757450865, + "grad_norm": 0.0279255211353302, + "learning_rate": 2.8214602933694367e-05, + "loss": 0.11451849937438965, + "step": 167490 + }, + { + "epoch": 0.7191125078351064, + "grad_norm": 0.011592227965593338, + "learning_rate": 2.8210291213576744e-05, + "loss": 0.02958979904651642, + "step": 167500 + }, + { + "epoch": 0.7191554399251264, + "grad_norm": 0.1158149316906929, + "learning_rate": 2.820597949345912e-05, + "loss": 0.18526514768600463, + "step": 167510 + }, + { + "epoch": 0.7191983720151465, + "grad_norm": 0.02158307656645775, + "learning_rate": 2.82016677733415e-05, + "loss": 0.3040745735168457, + "step": 167520 + }, + { + "epoch": 0.7192413041051664, + "grad_norm": 0.6742632985115051, + "learning_rate": 2.8197356053223873e-05, + "loss": 0.15425585508346557, + "step": 167530 + }, + { + "epoch": 0.7192842361951864, + "grad_norm": 1.8113670349121094, + "learning_rate": 2.819304433310625e-05, + "loss": 0.21069693565368652, + "step": 167540 + }, + { + "epoch": 0.7193271682852065, + "grad_norm": 0.0016236762749031186, + "learning_rate": 2.8188732612988627e-05, + "loss": 0.1839777112007141, + "step": 167550 + }, + { + "epoch": 0.7193701003752264, + "grad_norm": 0.3076116144657135, + "learning_rate": 2.8184420892871005e-05, + "loss": 0.16991562843322755, + "step": 167560 + }, + { + "epoch": 0.7194130324652465, + "grad_norm": 0.15978385508060455, + "learning_rate": 2.818010917275338e-05, + "loss": 0.09455273747444153, + "step": 167570 + }, + { + "epoch": 0.7194559645552665, + "grad_norm": 0.010344245471060276, + "learning_rate": 2.8175797452635756e-05, + "loss": 0.09267536401748658, + "step": 167580 + }, + { + "epoch": 0.7194988966452864, + "grad_norm": 0.026899434626102448, + "learning_rate": 2.8171485732518133e-05, + "loss": 0.1246342420578003, + "step": 167590 + }, + { + "epoch": 0.7195418287353065, + "grad_norm": 0.18134640157222748, + "learning_rate": 2.816717401240051e-05, + "loss": 0.11649966239929199, + "step": 167600 + }, + { + "epoch": 0.7195847608253265, + "grad_norm": 0.008635690435767174, + "learning_rate": 2.816286229228288e-05, + "loss": 0.24684834480285645, + "step": 167610 + }, + { + "epoch": 0.7196276929153465, + "grad_norm": 0.0058225602842867374, + "learning_rate": 2.815855057216526e-05, + "loss": 0.21973233222961425, + "step": 167620 + }, + { + "epoch": 0.7196706250053665, + "grad_norm": 1.9177488088607788, + "learning_rate": 2.8154238852047636e-05, + "loss": 0.1735082745552063, + "step": 167630 + }, + { + "epoch": 0.7197135570953865, + "grad_norm": 1.5728529691696167, + "learning_rate": 2.8149927131930017e-05, + "loss": 0.2044273853302002, + "step": 167640 + }, + { + "epoch": 0.7197564891854066, + "grad_norm": 0.003188293194398284, + "learning_rate": 2.8145615411812387e-05, + "loss": 0.24602503776550294, + "step": 167650 + }, + { + "epoch": 0.7197994212754265, + "grad_norm": 7.650982856750488, + "learning_rate": 2.8141303691694765e-05, + "loss": 0.28363747596740724, + "step": 167660 + }, + { + "epoch": 0.7198423533654466, + "grad_norm": 0.03091641142964363, + "learning_rate": 2.8136991971577142e-05, + "loss": 0.10700229406356812, + "step": 167670 + }, + { + "epoch": 0.7198852854554666, + "grad_norm": 4.954349994659424, + "learning_rate": 2.813268025145952e-05, + "loss": 0.2537800073623657, + "step": 167680 + }, + { + "epoch": 0.7199282175454865, + "grad_norm": 0.0050700693391263485, + "learning_rate": 2.8128368531341897e-05, + "loss": 0.21821444034576415, + "step": 167690 + }, + { + "epoch": 0.7199711496355066, + "grad_norm": 0.0030803687404841185, + "learning_rate": 2.812405681122427e-05, + "loss": 0.024721534550189973, + "step": 167700 + }, + { + "epoch": 0.7200140817255266, + "grad_norm": 0.05516568943858147, + "learning_rate": 2.8119745091106648e-05, + "loss": 0.18041404485702514, + "step": 167710 + }, + { + "epoch": 0.7200570138155465, + "grad_norm": 0.0014705831417813897, + "learning_rate": 2.8115433370989025e-05, + "loss": 0.12923437356948853, + "step": 167720 + }, + { + "epoch": 0.7200999459055666, + "grad_norm": 1.7812938690185547, + "learning_rate": 2.8111121650871403e-05, + "loss": 0.02560472786426544, + "step": 167730 + }, + { + "epoch": 0.7201428779955866, + "grad_norm": 1.239588975906372, + "learning_rate": 2.8106809930753773e-05, + "loss": 0.20239953994750975, + "step": 167740 + }, + { + "epoch": 0.7201858100856066, + "grad_norm": 0.6212531328201294, + "learning_rate": 2.8102498210636154e-05, + "loss": 0.19338698387145997, + "step": 167750 + }, + { + "epoch": 0.7202287421756266, + "grad_norm": 0.016148079186677933, + "learning_rate": 2.809818649051853e-05, + "loss": 0.16366405487060548, + "step": 167760 + }, + { + "epoch": 0.7202716742656466, + "grad_norm": 0.0054362318478524685, + "learning_rate": 2.809387477040091e-05, + "loss": 0.22141509056091307, + "step": 167770 + }, + { + "epoch": 0.7203146063556666, + "grad_norm": 0.10470175743103027, + "learning_rate": 2.808956305028328e-05, + "loss": 0.2782343864440918, + "step": 167780 + }, + { + "epoch": 0.7203575384456866, + "grad_norm": 0.18910618126392365, + "learning_rate": 2.8085251330165656e-05, + "loss": 0.2893279790878296, + "step": 167790 + }, + { + "epoch": 0.7204004705357067, + "grad_norm": 0.022973231971263885, + "learning_rate": 2.8080939610048034e-05, + "loss": 0.2531591892242432, + "step": 167800 + }, + { + "epoch": 0.7204434026257266, + "grad_norm": 0.02012925036251545, + "learning_rate": 2.807662788993041e-05, + "loss": 0.07524177432060242, + "step": 167810 + }, + { + "epoch": 0.7204863347157466, + "grad_norm": 0.0014940955443307757, + "learning_rate": 2.8072316169812785e-05, + "loss": 0.1927587866783142, + "step": 167820 + }, + { + "epoch": 0.7205292668057667, + "grad_norm": 0.6223175525665283, + "learning_rate": 2.8068004449695162e-05, + "loss": 0.18107985258102416, + "step": 167830 + }, + { + "epoch": 0.7205721988957866, + "grad_norm": 1.9258840084075928, + "learning_rate": 2.806369272957754e-05, + "loss": 0.1704465627670288, + "step": 167840 + }, + { + "epoch": 0.7206151309858067, + "grad_norm": 0.1283116340637207, + "learning_rate": 2.8059381009459917e-05, + "loss": 0.18721762895584107, + "step": 167850 + }, + { + "epoch": 0.7206580630758267, + "grad_norm": 1.301703929901123, + "learning_rate": 2.805506928934229e-05, + "loss": 0.18680089712142944, + "step": 167860 + }, + { + "epoch": 0.7207009951658466, + "grad_norm": 0.03944355994462967, + "learning_rate": 2.805075756922467e-05, + "loss": 0.055796694755554196, + "step": 167870 + }, + { + "epoch": 0.7207439272558667, + "grad_norm": 0.010357534512877464, + "learning_rate": 2.8046445849107046e-05, + "loss": 0.23156373500823973, + "step": 167880 + }, + { + "epoch": 0.7207868593458867, + "grad_norm": 0.21075792610645294, + "learning_rate": 2.8042134128989423e-05, + "loss": 0.306375527381897, + "step": 167890 + }, + { + "epoch": 0.7208297914359066, + "grad_norm": 0.03510294854640961, + "learning_rate": 2.8037822408871794e-05, + "loss": 0.17589036226272584, + "step": 167900 + }, + { + "epoch": 0.7208727235259267, + "grad_norm": 0.1116693839430809, + "learning_rate": 2.803351068875417e-05, + "loss": 0.12817736864089965, + "step": 167910 + }, + { + "epoch": 0.7209156556159467, + "grad_norm": 0.006631443277001381, + "learning_rate": 2.802919896863655e-05, + "loss": 0.2521315097808838, + "step": 167920 + }, + { + "epoch": 0.7209585877059667, + "grad_norm": 0.04760783165693283, + "learning_rate": 2.8024887248518926e-05, + "loss": 0.26681339740753174, + "step": 167930 + }, + { + "epoch": 0.7210015197959867, + "grad_norm": 0.0051206364296376705, + "learning_rate": 2.80205755284013e-05, + "loss": 0.12145293951034546, + "step": 167940 + }, + { + "epoch": 0.7210444518860067, + "grad_norm": 2.330493211746216, + "learning_rate": 2.8016263808283677e-05, + "loss": 0.2616943359375, + "step": 167950 + }, + { + "epoch": 0.7210873839760267, + "grad_norm": 0.767716109752655, + "learning_rate": 2.8011952088166054e-05, + "loss": 0.09361116886138916, + "step": 167960 + }, + { + "epoch": 0.7211303160660467, + "grad_norm": 0.017026184126734734, + "learning_rate": 2.8007640368048432e-05, + "loss": 0.10996824502944946, + "step": 167970 + }, + { + "epoch": 0.7211732481560668, + "grad_norm": 0.0036786592099815607, + "learning_rate": 2.8003328647930806e-05, + "loss": 0.07994485497474671, + "step": 167980 + }, + { + "epoch": 0.7212161802460867, + "grad_norm": 0.14449842274188995, + "learning_rate": 2.7999016927813183e-05, + "loss": 0.1397989273071289, + "step": 167990 + }, + { + "epoch": 0.7212591123361067, + "grad_norm": 1.243582844734192, + "learning_rate": 2.799470520769556e-05, + "loss": 0.18534576892852783, + "step": 168000 + }, + { + "epoch": 0.7212591123361067, + "eval_loss": 0.37931978702545166, + "eval_runtime": 27.1262, + "eval_samples_per_second": 3.686, + "eval_steps_per_second": 3.686, + "step": 168000 + }, + { + "epoch": 0.7213020444261268, + "grad_norm": 0.15428930521011353, + "learning_rate": 2.7990393487577938e-05, + "loss": 0.2006314516067505, + "step": 168010 + }, + { + "epoch": 0.7213449765161467, + "grad_norm": 3.320246458053589, + "learning_rate": 2.7986081767460308e-05, + "loss": 0.37806756496429444, + "step": 168020 + }, + { + "epoch": 0.7213879086061668, + "grad_norm": 1.1948304176330566, + "learning_rate": 2.7981770047342686e-05, + "loss": 0.3856146812438965, + "step": 168030 + }, + { + "epoch": 0.7214308406961868, + "grad_norm": 5.518876552581787, + "learning_rate": 2.7977458327225063e-05, + "loss": 0.2722789764404297, + "step": 168040 + }, + { + "epoch": 0.7214737727862067, + "grad_norm": 1.6739928722381592, + "learning_rate": 2.7973146607107444e-05, + "loss": 0.27594332695007323, + "step": 168050 + }, + { + "epoch": 0.7215167048762268, + "grad_norm": 0.8862299919128418, + "learning_rate": 2.796883488698982e-05, + "loss": 0.3354404211044312, + "step": 168060 + }, + { + "epoch": 0.7215596369662468, + "grad_norm": 0.0056771812960505486, + "learning_rate": 2.796452316687219e-05, + "loss": 0.28955352306365967, + "step": 168070 + }, + { + "epoch": 0.7216025690562669, + "grad_norm": 0.013778852298855782, + "learning_rate": 2.796021144675457e-05, + "loss": 0.03039870858192444, + "step": 168080 + }, + { + "epoch": 0.7216455011462868, + "grad_norm": 0.27223819494247437, + "learning_rate": 2.7955899726636946e-05, + "loss": 0.16090576648712157, + "step": 168090 + }, + { + "epoch": 0.7216884332363068, + "grad_norm": 1.8554707765579224, + "learning_rate": 2.7951588006519324e-05, + "loss": 0.19248855113983154, + "step": 168100 + }, + { + "epoch": 0.7217313653263269, + "grad_norm": 0.012462825514376163, + "learning_rate": 2.7947276286401698e-05, + "loss": 0.339345121383667, + "step": 168110 + }, + { + "epoch": 0.7217742974163468, + "grad_norm": 0.0005453620688058436, + "learning_rate": 2.7942964566284075e-05, + "loss": 0.13194665908813477, + "step": 168120 + }, + { + "epoch": 0.7218172295063668, + "grad_norm": 0.0018816015217453241, + "learning_rate": 2.7938652846166452e-05, + "loss": 0.31123464107513427, + "step": 168130 + }, + { + "epoch": 0.7218601615963869, + "grad_norm": 0.0021208133548498154, + "learning_rate": 2.793434112604883e-05, + "loss": 0.06131689548492432, + "step": 168140 + }, + { + "epoch": 0.7219030936864068, + "grad_norm": 1.9875645637512207, + "learning_rate": 2.79300294059312e-05, + "loss": 0.2216268301010132, + "step": 168150 + }, + { + "epoch": 0.7219460257764269, + "grad_norm": 0.008463248610496521, + "learning_rate": 2.792571768581358e-05, + "loss": 0.2832399845123291, + "step": 168160 + }, + { + "epoch": 0.7219889578664469, + "grad_norm": 1.7538836002349854, + "learning_rate": 2.7921405965695958e-05, + "loss": 0.1032507061958313, + "step": 168170 + }, + { + "epoch": 0.7220318899564668, + "grad_norm": 0.9188027381896973, + "learning_rate": 2.7917094245578336e-05, + "loss": 0.06151009202003479, + "step": 168180 + }, + { + "epoch": 0.7220748220464869, + "grad_norm": 0.5642251968383789, + "learning_rate": 2.7912782525460706e-05, + "loss": 0.2438356399536133, + "step": 168190 + }, + { + "epoch": 0.7221177541365069, + "grad_norm": 0.001289999345317483, + "learning_rate": 2.7908470805343083e-05, + "loss": 0.2134316682815552, + "step": 168200 + }, + { + "epoch": 0.7221606862265268, + "grad_norm": 0.05979342386126518, + "learning_rate": 2.790415908522546e-05, + "loss": 0.13589794635772706, + "step": 168210 + }, + { + "epoch": 0.7222036183165469, + "grad_norm": 0.3308796286582947, + "learning_rate": 2.7899847365107838e-05, + "loss": 0.3463158130645752, + "step": 168220 + }, + { + "epoch": 0.7222465504065669, + "grad_norm": 0.014125037007033825, + "learning_rate": 2.7895535644990212e-05, + "loss": 0.17682739496231079, + "step": 168230 + }, + { + "epoch": 0.7222894824965869, + "grad_norm": 0.014602779410779476, + "learning_rate": 2.789122392487259e-05, + "loss": 0.26034021377563477, + "step": 168240 + }, + { + "epoch": 0.7223324145866069, + "grad_norm": 2.157536745071411, + "learning_rate": 2.7886912204754967e-05, + "loss": 0.36604933738708495, + "step": 168250 + }, + { + "epoch": 0.722375346676627, + "grad_norm": 5.271147727966309, + "learning_rate": 2.7882600484637344e-05, + "loss": 0.3733457088470459, + "step": 168260 + }, + { + "epoch": 0.7224182787666469, + "grad_norm": 0.17696958780288696, + "learning_rate": 2.7878288764519718e-05, + "loss": 0.24389424324035644, + "step": 168270 + }, + { + "epoch": 0.7224612108566669, + "grad_norm": 0.0023903343826532364, + "learning_rate": 2.7873977044402095e-05, + "loss": 0.11699960231781006, + "step": 168280 + }, + { + "epoch": 0.722504142946687, + "grad_norm": 0.6614067554473877, + "learning_rate": 2.7869665324284473e-05, + "loss": 0.17375963926315308, + "step": 168290 + }, + { + "epoch": 0.7225470750367069, + "grad_norm": 0.29951897263526917, + "learning_rate": 2.786535360416685e-05, + "loss": 0.15570751428604127, + "step": 168300 + }, + { + "epoch": 0.7225900071267269, + "grad_norm": 0.049194544553756714, + "learning_rate": 2.786104188404922e-05, + "loss": 0.11768434047698975, + "step": 168310 + }, + { + "epoch": 0.722632939216747, + "grad_norm": 11.708648681640625, + "learning_rate": 2.7856730163931598e-05, + "loss": 0.40833373069763185, + "step": 168320 + }, + { + "epoch": 0.7226758713067669, + "grad_norm": 0.00927521288394928, + "learning_rate": 2.7852418443813975e-05, + "loss": 0.14625450372695922, + "step": 168330 + }, + { + "epoch": 0.722718803396787, + "grad_norm": 3.148890495300293, + "learning_rate": 2.7848106723696356e-05, + "loss": 0.22452225685119628, + "step": 168340 + }, + { + "epoch": 0.722761735486807, + "grad_norm": 2.645533561706543, + "learning_rate": 2.7843795003578727e-05, + "loss": 0.37561984062194825, + "step": 168350 + }, + { + "epoch": 0.7228046675768269, + "grad_norm": 1.514840841293335, + "learning_rate": 2.7839483283461104e-05, + "loss": 0.15910897254943848, + "step": 168360 + }, + { + "epoch": 0.722847599666847, + "grad_norm": 1.4466434717178345, + "learning_rate": 2.783517156334348e-05, + "loss": 0.07210831046104431, + "step": 168370 + }, + { + "epoch": 0.722890531756867, + "grad_norm": 0.9692143201828003, + "learning_rate": 2.783085984322586e-05, + "loss": 0.22329955101013182, + "step": 168380 + }, + { + "epoch": 0.722933463846887, + "grad_norm": 1.6919922828674316, + "learning_rate": 2.7826548123108233e-05, + "loss": 0.19658915996551513, + "step": 168390 + }, + { + "epoch": 0.722976395936907, + "grad_norm": 0.04949316009879112, + "learning_rate": 2.782223640299061e-05, + "loss": 0.10937418937683105, + "step": 168400 + }, + { + "epoch": 0.723019328026927, + "grad_norm": 0.004701630212366581, + "learning_rate": 2.7817924682872987e-05, + "loss": 0.22459728717803956, + "step": 168410 + }, + { + "epoch": 0.723062260116947, + "grad_norm": 0.007539353799074888, + "learning_rate": 2.7813612962755365e-05, + "loss": 0.09266999959945679, + "step": 168420 + }, + { + "epoch": 0.723105192206967, + "grad_norm": 0.03429533913731575, + "learning_rate": 2.7809301242637742e-05, + "loss": 0.1797150492668152, + "step": 168430 + }, + { + "epoch": 0.723148124296987, + "grad_norm": 0.04105391725897789, + "learning_rate": 2.7804989522520113e-05, + "loss": 0.17547688484191895, + "step": 168440 + }, + { + "epoch": 0.723191056387007, + "grad_norm": 0.04575356841087341, + "learning_rate": 2.7800677802402493e-05, + "loss": 0.11070233583450317, + "step": 168450 + }, + { + "epoch": 0.723233988477027, + "grad_norm": 0.0016563141252845526, + "learning_rate": 2.779636608228487e-05, + "loss": 0.28673884868621824, + "step": 168460 + }, + { + "epoch": 0.7232769205670471, + "grad_norm": 0.6947919130325317, + "learning_rate": 2.7792054362167248e-05, + "loss": 0.2968265533447266, + "step": 168470 + }, + { + "epoch": 0.723319852657067, + "grad_norm": 0.003793654264882207, + "learning_rate": 2.778774264204962e-05, + "loss": 0.10515412092208862, + "step": 168480 + }, + { + "epoch": 0.723362784747087, + "grad_norm": 1.045601487159729, + "learning_rate": 2.7783430921931996e-05, + "loss": 0.334007716178894, + "step": 168490 + }, + { + "epoch": 0.7234057168371071, + "grad_norm": 2.9117648601531982, + "learning_rate": 2.7779119201814373e-05, + "loss": 0.18874597549438477, + "step": 168500 + }, + { + "epoch": 0.7234486489271271, + "grad_norm": 0.006848334800451994, + "learning_rate": 2.777480748169675e-05, + "loss": 0.1698448419570923, + "step": 168510 + }, + { + "epoch": 0.7234915810171471, + "grad_norm": 0.806219756603241, + "learning_rate": 2.7770495761579125e-05, + "loss": 0.5198267936706543, + "step": 168520 + }, + { + "epoch": 0.7235345131071671, + "grad_norm": 1.7642408609390259, + "learning_rate": 2.7766184041461502e-05, + "loss": 0.2058842658996582, + "step": 168530 + }, + { + "epoch": 0.7235774451971871, + "grad_norm": 4.31995964050293, + "learning_rate": 2.776187232134388e-05, + "loss": 0.17449574470520018, + "step": 168540 + }, + { + "epoch": 0.7236203772872071, + "grad_norm": 0.0006515654386021197, + "learning_rate": 2.7757560601226257e-05, + "loss": 0.20330989360809326, + "step": 168550 + }, + { + "epoch": 0.7236633093772271, + "grad_norm": 0.3082123398780823, + "learning_rate": 2.7753248881108627e-05, + "loss": 0.11522177457809449, + "step": 168560 + }, + { + "epoch": 0.7237062414672472, + "grad_norm": 0.05763091892004013, + "learning_rate": 2.7748937160991008e-05, + "loss": 0.04711337983608246, + "step": 168570 + }, + { + "epoch": 0.7237491735572671, + "grad_norm": 15.214527130126953, + "learning_rate": 2.7744625440873385e-05, + "loss": 0.2867955207824707, + "step": 168580 + }, + { + "epoch": 0.7237921056472871, + "grad_norm": 3.8333139419555664, + "learning_rate": 2.7740313720755763e-05, + "loss": 0.09657506942749024, + "step": 168590 + }, + { + "epoch": 0.7238350377373072, + "grad_norm": 0.0093782814219594, + "learning_rate": 2.7736002000638133e-05, + "loss": 0.20292196273803711, + "step": 168600 + }, + { + "epoch": 0.7238779698273271, + "grad_norm": 0.004507375881075859, + "learning_rate": 2.773169028052051e-05, + "loss": 0.0971630871295929, + "step": 168610 + }, + { + "epoch": 0.7239209019173471, + "grad_norm": 0.013182217255234718, + "learning_rate": 2.7727378560402888e-05, + "loss": 0.14002325534820556, + "step": 168620 + }, + { + "epoch": 0.7239638340073672, + "grad_norm": 0.002185863209888339, + "learning_rate": 2.7723066840285265e-05, + "loss": 0.17800129652023317, + "step": 168630 + }, + { + "epoch": 0.7240067660973871, + "grad_norm": 1.0492135286331177, + "learning_rate": 2.771875512016764e-05, + "loss": 0.04826590418815613, + "step": 168640 + }, + { + "epoch": 0.7240496981874072, + "grad_norm": 0.014060789719223976, + "learning_rate": 2.7714443400050016e-05, + "loss": 0.3352368354797363, + "step": 168650 + }, + { + "epoch": 0.7240926302774272, + "grad_norm": 0.03786401078104973, + "learning_rate": 2.7710131679932394e-05, + "loss": 0.23978736400604247, + "step": 168660 + }, + { + "epoch": 0.7241355623674471, + "grad_norm": 0.0007775720441713929, + "learning_rate": 2.770581995981477e-05, + "loss": 0.24734921455383302, + "step": 168670 + }, + { + "epoch": 0.7241784944574672, + "grad_norm": 1.8972983360290527, + "learning_rate": 2.7701508239697145e-05, + "loss": 0.19279592037200927, + "step": 168680 + }, + { + "epoch": 0.7242214265474872, + "grad_norm": 1.2243245840072632, + "learning_rate": 2.7697196519579522e-05, + "loss": 0.1520993232727051, + "step": 168690 + }, + { + "epoch": 0.7242643586375072, + "grad_norm": 1.4366346597671509, + "learning_rate": 2.76928847994619e-05, + "loss": 0.101070237159729, + "step": 168700 + }, + { + "epoch": 0.7243072907275272, + "grad_norm": 0.02268645167350769, + "learning_rate": 2.7688573079344277e-05, + "loss": 0.21039226055145263, + "step": 168710 + }, + { + "epoch": 0.7243502228175472, + "grad_norm": 0.3205955922603607, + "learning_rate": 2.7684261359226648e-05, + "loss": 0.12725365161895752, + "step": 168720 + }, + { + "epoch": 0.7243931549075672, + "grad_norm": 1.147531270980835, + "learning_rate": 2.7679949639109025e-05, + "loss": 0.17458294630050658, + "step": 168730 + }, + { + "epoch": 0.7244360869975872, + "grad_norm": 0.016879552975296974, + "learning_rate": 2.7675637918991402e-05, + "loss": 0.34085586071014407, + "step": 168740 + }, + { + "epoch": 0.7244790190876073, + "grad_norm": 0.00016512209549546242, + "learning_rate": 2.7671326198873783e-05, + "loss": 0.28415277004241946, + "step": 168750 + }, + { + "epoch": 0.7245219511776272, + "grad_norm": 5.477837085723877, + "learning_rate": 2.7667014478756154e-05, + "loss": 0.20578546524047853, + "step": 168760 + }, + { + "epoch": 0.7245648832676472, + "grad_norm": 0.050836920738220215, + "learning_rate": 2.766270275863853e-05, + "loss": 0.36799397468566897, + "step": 168770 + }, + { + "epoch": 0.7246078153576673, + "grad_norm": 6.697229862213135, + "learning_rate": 2.765839103852091e-05, + "loss": 0.054876917600631715, + "step": 168780 + }, + { + "epoch": 0.7246507474476872, + "grad_norm": 1.9743859767913818, + "learning_rate": 2.7654079318403286e-05, + "loss": 0.26255998611450193, + "step": 168790 + }, + { + "epoch": 0.7246936795377072, + "grad_norm": 0.1395675390958786, + "learning_rate": 2.7649767598285663e-05, + "loss": 0.08039167523384094, + "step": 168800 + }, + { + "epoch": 0.7247366116277273, + "grad_norm": 8.4291353225708, + "learning_rate": 2.7645455878168037e-05, + "loss": 0.462004280090332, + "step": 168810 + }, + { + "epoch": 0.7247795437177472, + "grad_norm": 0.12003923207521439, + "learning_rate": 2.7641144158050414e-05, + "loss": 0.22433068752288818, + "step": 168820 + }, + { + "epoch": 0.7248224758077673, + "grad_norm": 0.9225029945373535, + "learning_rate": 2.7636832437932792e-05, + "loss": 0.25149383544921877, + "step": 168830 + }, + { + "epoch": 0.7248654078977873, + "grad_norm": 1.2647552490234375, + "learning_rate": 2.763252071781517e-05, + "loss": 0.2392057180404663, + "step": 168840 + }, + { + "epoch": 0.7249083399878072, + "grad_norm": 0.7912980914115906, + "learning_rate": 2.762820899769754e-05, + "loss": 0.13407490253448487, + "step": 168850 + }, + { + "epoch": 0.7249512720778273, + "grad_norm": 0.029732348397374153, + "learning_rate": 2.762389727757992e-05, + "loss": 0.059022271633148195, + "step": 168860 + }, + { + "epoch": 0.7249942041678473, + "grad_norm": 0.8471653461456299, + "learning_rate": 2.7619585557462298e-05, + "loss": 0.3402934312820435, + "step": 168870 + }, + { + "epoch": 0.7250371362578673, + "grad_norm": 11.936932563781738, + "learning_rate": 2.7615273837344675e-05, + "loss": 0.2877587080001831, + "step": 168880 + }, + { + "epoch": 0.7250800683478873, + "grad_norm": 1.663932204246521, + "learning_rate": 2.7610962117227046e-05, + "loss": 0.28721544742584226, + "step": 168890 + }, + { + "epoch": 0.7251230004379073, + "grad_norm": 0.3331746459007263, + "learning_rate": 2.7606650397109423e-05, + "loss": 0.11548913717269897, + "step": 168900 + }, + { + "epoch": 0.7251659325279273, + "grad_norm": 1.681654930114746, + "learning_rate": 2.76023386769918e-05, + "loss": 0.3489760160446167, + "step": 168910 + }, + { + "epoch": 0.7252088646179473, + "grad_norm": 0.09427236020565033, + "learning_rate": 2.7598026956874178e-05, + "loss": 0.0763422667980194, + "step": 168920 + }, + { + "epoch": 0.7252517967079674, + "grad_norm": 0.07877668738365173, + "learning_rate": 2.759371523675655e-05, + "loss": 0.1756036639213562, + "step": 168930 + }, + { + "epoch": 0.7252947287979874, + "grad_norm": 1.987454891204834, + "learning_rate": 2.758940351663893e-05, + "loss": 0.19289370775222778, + "step": 168940 + }, + { + "epoch": 0.7253376608880073, + "grad_norm": 1.1299264430999756, + "learning_rate": 2.7585091796521306e-05, + "loss": 0.3390709638595581, + "step": 168950 + }, + { + "epoch": 0.7253805929780274, + "grad_norm": 0.0014903040137141943, + "learning_rate": 2.7580780076403684e-05, + "loss": 0.2046818256378174, + "step": 168960 + }, + { + "epoch": 0.7254235250680474, + "grad_norm": 1.1031399965286255, + "learning_rate": 2.7576468356286058e-05, + "loss": 0.16519639492034913, + "step": 168970 + }, + { + "epoch": 0.7254664571580673, + "grad_norm": 0.0073827896267175674, + "learning_rate": 2.7572156636168435e-05, + "loss": 0.2113725423812866, + "step": 168980 + }, + { + "epoch": 0.7255093892480874, + "grad_norm": 0.00510813295841217, + "learning_rate": 2.7567844916050812e-05, + "loss": 0.09802131652832032, + "step": 168990 + }, + { + "epoch": 0.7255523213381074, + "grad_norm": 0.2502835690975189, + "learning_rate": 2.756353319593319e-05, + "loss": 0.18962769508361815, + "step": 169000 + }, + { + "epoch": 0.7255523213381074, + "eval_loss": 0.38295066356658936, + "eval_runtime": 27.3886, + "eval_samples_per_second": 3.651, + "eval_steps_per_second": 3.651, + "step": 169000 + }, + { + "epoch": 0.7255952534281274, + "grad_norm": 0.0011066279839724302, + "learning_rate": 2.755922147581556e-05, + "loss": 0.2311826229095459, + "step": 169010 + }, + { + "epoch": 0.7256381855181474, + "grad_norm": 0.00853653158992529, + "learning_rate": 2.7554909755697938e-05, + "loss": 0.18713712692260742, + "step": 169020 + }, + { + "epoch": 0.7256811176081674, + "grad_norm": 7.358502388000488, + "learning_rate": 2.7550598035580315e-05, + "loss": 0.273174524307251, + "step": 169030 + }, + { + "epoch": 0.7257240496981874, + "grad_norm": 0.004957462195307016, + "learning_rate": 2.7546286315462692e-05, + "loss": 0.17104941606521606, + "step": 169040 + }, + { + "epoch": 0.7257669817882074, + "grad_norm": 1.1058156490325928, + "learning_rate": 2.7541974595345066e-05, + "loss": 0.3058955669403076, + "step": 169050 + }, + { + "epoch": 0.7258099138782275, + "grad_norm": 0.015500033274292946, + "learning_rate": 2.7537662875227443e-05, + "loss": 0.21356968879699706, + "step": 169060 + }, + { + "epoch": 0.7258528459682474, + "grad_norm": 0.02399790659546852, + "learning_rate": 2.753335115510982e-05, + "loss": 0.059341037273406984, + "step": 169070 + }, + { + "epoch": 0.7258957780582674, + "grad_norm": 1.957128643989563, + "learning_rate": 2.7529039434992198e-05, + "loss": 0.381409740447998, + "step": 169080 + }, + { + "epoch": 0.7259387101482875, + "grad_norm": 2.0425257682800293, + "learning_rate": 2.7524727714874572e-05, + "loss": 0.17824835777282716, + "step": 169090 + }, + { + "epoch": 0.7259816422383074, + "grad_norm": 0.0014805634273216128, + "learning_rate": 2.752041599475695e-05, + "loss": 0.39816100597381593, + "step": 169100 + }, + { + "epoch": 0.7260245743283275, + "grad_norm": 8.61057186126709, + "learning_rate": 2.7516104274639327e-05, + "loss": 0.3082611322402954, + "step": 169110 + }, + { + "epoch": 0.7260675064183475, + "grad_norm": 0.08923623710870743, + "learning_rate": 2.7511792554521704e-05, + "loss": 0.14777944087982178, + "step": 169120 + }, + { + "epoch": 0.7261104385083674, + "grad_norm": 0.01090413797646761, + "learning_rate": 2.7507480834404075e-05, + "loss": 0.23257055282592773, + "step": 169130 + }, + { + "epoch": 0.7261533705983875, + "grad_norm": 0.10899604111909866, + "learning_rate": 2.7503169114286452e-05, + "loss": 0.18908019065856935, + "step": 169140 + }, + { + "epoch": 0.7261963026884075, + "grad_norm": 3.4646310806274414, + "learning_rate": 2.749885739416883e-05, + "loss": 0.3117159128189087, + "step": 169150 + }, + { + "epoch": 0.7262392347784274, + "grad_norm": 1.4176348447799683, + "learning_rate": 2.749454567405121e-05, + "loss": 0.06437577605247498, + "step": 169160 + }, + { + "epoch": 0.7262821668684475, + "grad_norm": 3.021955966949463, + "learning_rate": 2.7490233953933587e-05, + "loss": 0.1498560667037964, + "step": 169170 + }, + { + "epoch": 0.7263250989584675, + "grad_norm": 0.019517341628670692, + "learning_rate": 2.7485922233815958e-05, + "loss": 0.34508914947509767, + "step": 169180 + }, + { + "epoch": 0.7263680310484875, + "grad_norm": 0.004844113253057003, + "learning_rate": 2.7481610513698335e-05, + "loss": 0.19807019233703613, + "step": 169190 + }, + { + "epoch": 0.7264109631385075, + "grad_norm": 0.0019148435676470399, + "learning_rate": 2.7477298793580713e-05, + "loss": 0.36115279197692873, + "step": 169200 + }, + { + "epoch": 0.7264538952285275, + "grad_norm": 0.19224651157855988, + "learning_rate": 2.747298707346309e-05, + "loss": 0.39707260131835936, + "step": 169210 + }, + { + "epoch": 0.7264968273185475, + "grad_norm": 0.22394929826259613, + "learning_rate": 2.7468675353345464e-05, + "loss": 0.10976753234863282, + "step": 169220 + }, + { + "epoch": 0.7265397594085675, + "grad_norm": 0.0022097777109593153, + "learning_rate": 2.746436363322784e-05, + "loss": 0.31788818836212157, + "step": 169230 + }, + { + "epoch": 0.7265826914985876, + "grad_norm": 3.0895588397979736, + "learning_rate": 2.746005191311022e-05, + "loss": 0.18544777631759643, + "step": 169240 + }, + { + "epoch": 0.7266256235886075, + "grad_norm": 1.5325919389724731, + "learning_rate": 2.7455740192992596e-05, + "loss": 0.1992364525794983, + "step": 169250 + }, + { + "epoch": 0.7266685556786275, + "grad_norm": 0.045792438089847565, + "learning_rate": 2.7451428472874967e-05, + "loss": 0.3305244445800781, + "step": 169260 + }, + { + "epoch": 0.7267114877686476, + "grad_norm": 0.9892081618309021, + "learning_rate": 2.7447116752757347e-05, + "loss": 0.15464599132537843, + "step": 169270 + }, + { + "epoch": 0.7267544198586675, + "grad_norm": 0.009701947681605816, + "learning_rate": 2.7442805032639725e-05, + "loss": 0.1055402398109436, + "step": 169280 + }, + { + "epoch": 0.7267973519486876, + "grad_norm": 2.0974602699279785, + "learning_rate": 2.7438493312522102e-05, + "loss": 0.12994418144226075, + "step": 169290 + }, + { + "epoch": 0.7268402840387076, + "grad_norm": 0.018503086641430855, + "learning_rate": 2.7434181592404473e-05, + "loss": 0.34631929397583006, + "step": 169300 + }, + { + "epoch": 0.7268832161287275, + "grad_norm": 0.5620545148849487, + "learning_rate": 2.742986987228685e-05, + "loss": 0.29466919898986815, + "step": 169310 + }, + { + "epoch": 0.7269261482187476, + "grad_norm": 1.7169890403747559, + "learning_rate": 2.7425558152169227e-05, + "loss": 0.25790133476257326, + "step": 169320 + }, + { + "epoch": 0.7269690803087676, + "grad_norm": 0.01841481775045395, + "learning_rate": 2.7421246432051605e-05, + "loss": 0.3726177215576172, + "step": 169330 + }, + { + "epoch": 0.7270120123987875, + "grad_norm": 7.7407331466674805, + "learning_rate": 2.741693471193398e-05, + "loss": 0.2069004774093628, + "step": 169340 + }, + { + "epoch": 0.7270549444888076, + "grad_norm": 0.041998837143182755, + "learning_rate": 2.7412622991816356e-05, + "loss": 0.18190546035766603, + "step": 169350 + }, + { + "epoch": 0.7270978765788276, + "grad_norm": 0.0035823362413793802, + "learning_rate": 2.7408311271698733e-05, + "loss": 0.27233123779296875, + "step": 169360 + }, + { + "epoch": 0.7271408086688477, + "grad_norm": 0.00309250270947814, + "learning_rate": 2.740399955158111e-05, + "loss": 0.19596065282821656, + "step": 169370 + }, + { + "epoch": 0.7271837407588676, + "grad_norm": 0.059527233242988586, + "learning_rate": 2.7399687831463485e-05, + "loss": 0.22426025867462157, + "step": 169380 + }, + { + "epoch": 0.7272266728488876, + "grad_norm": 2.7782046794891357, + "learning_rate": 2.7395376111345862e-05, + "loss": 0.24270076751708985, + "step": 169390 + }, + { + "epoch": 0.7272696049389077, + "grad_norm": 2.5395593643188477, + "learning_rate": 2.739106439122824e-05, + "loss": 0.22292840480804443, + "step": 169400 + }, + { + "epoch": 0.7273125370289276, + "grad_norm": 0.021359330043196678, + "learning_rate": 2.7386752671110617e-05, + "loss": 0.1687622547149658, + "step": 169410 + }, + { + "epoch": 0.7273554691189477, + "grad_norm": 2.1432902812957764, + "learning_rate": 2.7382440950992987e-05, + "loss": 0.22134754657745362, + "step": 169420 + }, + { + "epoch": 0.7273984012089677, + "grad_norm": 0.15447072684764862, + "learning_rate": 2.7378129230875365e-05, + "loss": 0.1350387692451477, + "step": 169430 + }, + { + "epoch": 0.7274413332989876, + "grad_norm": 1.2907360792160034, + "learning_rate": 2.7373817510757742e-05, + "loss": 0.3004169940948486, + "step": 169440 + }, + { + "epoch": 0.7274842653890077, + "grad_norm": 0.00447465293109417, + "learning_rate": 2.736950579064012e-05, + "loss": 0.055487924814224245, + "step": 169450 + }, + { + "epoch": 0.7275271974790277, + "grad_norm": 0.01885574497282505, + "learning_rate": 2.7365194070522493e-05, + "loss": 0.17447357177734374, + "step": 169460 + }, + { + "epoch": 0.7275701295690477, + "grad_norm": 1.8977580070495605, + "learning_rate": 2.736088235040487e-05, + "loss": 0.18592686653137208, + "step": 169470 + }, + { + "epoch": 0.7276130616590677, + "grad_norm": 0.006628870032727718, + "learning_rate": 2.7356570630287248e-05, + "loss": 0.2765743494033813, + "step": 169480 + }, + { + "epoch": 0.7276559937490877, + "grad_norm": 0.015173074789345264, + "learning_rate": 2.7352258910169625e-05, + "loss": 0.12751307487487792, + "step": 169490 + }, + { + "epoch": 0.7276989258391077, + "grad_norm": 0.012416012585163116, + "learning_rate": 2.7347947190052e-05, + "loss": 0.1876566767692566, + "step": 169500 + }, + { + "epoch": 0.7277418579291277, + "grad_norm": 1.2336422204971313, + "learning_rate": 2.7343635469934376e-05, + "loss": 0.3533480167388916, + "step": 169510 + }, + { + "epoch": 0.7277847900191478, + "grad_norm": 0.02001671865582466, + "learning_rate": 2.7339323749816754e-05, + "loss": 0.2230234146118164, + "step": 169520 + }, + { + "epoch": 0.7278277221091677, + "grad_norm": 0.015594366006553173, + "learning_rate": 2.733501202969913e-05, + "loss": 0.06503039002418518, + "step": 169530 + }, + { + "epoch": 0.7278706541991877, + "grad_norm": 2.3339343070983887, + "learning_rate": 2.733070030958151e-05, + "loss": 0.19949530363082885, + "step": 169540 + }, + { + "epoch": 0.7279135862892078, + "grad_norm": 0.90096515417099, + "learning_rate": 2.732638858946388e-05, + "loss": 0.0720227301120758, + "step": 169550 + }, + { + "epoch": 0.7279565183792277, + "grad_norm": 0.0013229718897491693, + "learning_rate": 2.7322076869346256e-05, + "loss": 0.2935648441314697, + "step": 169560 + }, + { + "epoch": 0.7279994504692477, + "grad_norm": 3.1632344722747803, + "learning_rate": 2.7317765149228637e-05, + "loss": 0.2547586917877197, + "step": 169570 + }, + { + "epoch": 0.7280423825592678, + "grad_norm": 0.9867843389511108, + "learning_rate": 2.7313453429111015e-05, + "loss": 0.10192888975143433, + "step": 169580 + }, + { + "epoch": 0.7280853146492877, + "grad_norm": 3.412651538848877, + "learning_rate": 2.7309141708993385e-05, + "loss": 0.14853811264038086, + "step": 169590 + }, + { + "epoch": 0.7281282467393078, + "grad_norm": 0.13131414353847504, + "learning_rate": 2.7304829988875762e-05, + "loss": 0.16117234230041505, + "step": 169600 + }, + { + "epoch": 0.7281711788293278, + "grad_norm": 0.20439204573631287, + "learning_rate": 2.730051826875814e-05, + "loss": 0.30737872123718263, + "step": 169610 + }, + { + "epoch": 0.7282141109193477, + "grad_norm": 0.0034942487254738808, + "learning_rate": 2.7296206548640517e-05, + "loss": 0.1933616042137146, + "step": 169620 + }, + { + "epoch": 0.7282570430093678, + "grad_norm": 0.039384450763463974, + "learning_rate": 2.729189482852289e-05, + "loss": 0.3517719507217407, + "step": 169630 + }, + { + "epoch": 0.7282999750993878, + "grad_norm": 1.3371518850326538, + "learning_rate": 2.728758310840527e-05, + "loss": 0.2886610984802246, + "step": 169640 + }, + { + "epoch": 0.7283429071894078, + "grad_norm": 0.1878964751958847, + "learning_rate": 2.7283271388287646e-05, + "loss": 0.18123259544372558, + "step": 169650 + }, + { + "epoch": 0.7283858392794278, + "grad_norm": 1.6017755270004272, + "learning_rate": 2.7278959668170023e-05, + "loss": 0.2711747884750366, + "step": 169660 + }, + { + "epoch": 0.7284287713694478, + "grad_norm": 1.121159315109253, + "learning_rate": 2.7274647948052394e-05, + "loss": 0.141208279132843, + "step": 169670 + }, + { + "epoch": 0.7284717034594678, + "grad_norm": 1.4669911861419678, + "learning_rate": 2.7270336227934774e-05, + "loss": 0.34694585800170896, + "step": 169680 + }, + { + "epoch": 0.7285146355494878, + "grad_norm": 0.0026642445009201765, + "learning_rate": 2.7266024507817152e-05, + "loss": 0.30149426460266116, + "step": 169690 + }, + { + "epoch": 0.7285575676395079, + "grad_norm": 0.01654047705233097, + "learning_rate": 2.726171278769953e-05, + "loss": 0.07679366469383239, + "step": 169700 + }, + { + "epoch": 0.7286004997295278, + "grad_norm": 3.939174175262451, + "learning_rate": 2.72574010675819e-05, + "loss": 0.1391007423400879, + "step": 169710 + }, + { + "epoch": 0.7286434318195478, + "grad_norm": 1.1632599830627441, + "learning_rate": 2.7253089347464277e-05, + "loss": 0.3620645046234131, + "step": 169720 + }, + { + "epoch": 0.7286863639095679, + "grad_norm": 0.03564436361193657, + "learning_rate": 2.7248777627346654e-05, + "loss": 0.1843175172805786, + "step": 169730 + }, + { + "epoch": 0.7287292959995878, + "grad_norm": 2.9193012714385986, + "learning_rate": 2.724446590722903e-05, + "loss": 0.28504557609558107, + "step": 169740 + }, + { + "epoch": 0.7287722280896078, + "grad_norm": 6.900730133056641, + "learning_rate": 2.7240154187111406e-05, + "loss": 0.29606239795684813, + "step": 169750 + }, + { + "epoch": 0.7288151601796279, + "grad_norm": 0.05998038873076439, + "learning_rate": 2.7235842466993783e-05, + "loss": 0.34577767848968505, + "step": 169760 + }, + { + "epoch": 0.7288580922696478, + "grad_norm": 5.61971378326416, + "learning_rate": 2.723153074687616e-05, + "loss": 0.25440452098846433, + "step": 169770 + }, + { + "epoch": 0.7289010243596679, + "grad_norm": 0.690499484539032, + "learning_rate": 2.7227219026758538e-05, + "loss": 0.11958892345428467, + "step": 169780 + }, + { + "epoch": 0.7289439564496879, + "grad_norm": 3.1090917587280273, + "learning_rate": 2.722290730664091e-05, + "loss": 0.14645473957061766, + "step": 169790 + }, + { + "epoch": 0.728986888539708, + "grad_norm": 0.10904362052679062, + "learning_rate": 2.721859558652329e-05, + "loss": 0.0882343590259552, + "step": 169800 + }, + { + "epoch": 0.7290298206297279, + "grad_norm": 6.914793491363525, + "learning_rate": 2.7214283866405666e-05, + "loss": 0.38003613948822024, + "step": 169810 + }, + { + "epoch": 0.7290727527197479, + "grad_norm": 0.009238614700734615, + "learning_rate": 2.7209972146288044e-05, + "loss": 0.21988511085510254, + "step": 169820 + }, + { + "epoch": 0.729115684809768, + "grad_norm": 3.774786949157715, + "learning_rate": 2.7205660426170414e-05, + "loss": 0.1955350399017334, + "step": 169830 + }, + { + "epoch": 0.7291586168997879, + "grad_norm": 3.135042667388916, + "learning_rate": 2.720134870605279e-05, + "loss": 0.17978460788726808, + "step": 169840 + }, + { + "epoch": 0.7292015489898079, + "grad_norm": 0.557732343673706, + "learning_rate": 2.719703698593517e-05, + "loss": 0.09407221078872681, + "step": 169850 + }, + { + "epoch": 0.729244481079828, + "grad_norm": 0.01777493953704834, + "learning_rate": 2.719272526581755e-05, + "loss": 0.15389621257781982, + "step": 169860 + }, + { + "epoch": 0.7292874131698479, + "grad_norm": 0.0050742123275995255, + "learning_rate": 2.7188413545699927e-05, + "loss": 0.28021841049194335, + "step": 169870 + }, + { + "epoch": 0.729330345259868, + "grad_norm": 0.051354601979255676, + "learning_rate": 2.7184101825582298e-05, + "loss": 0.13149229288101197, + "step": 169880 + }, + { + "epoch": 0.729373277349888, + "grad_norm": 17.105356216430664, + "learning_rate": 2.7179790105464675e-05, + "loss": 0.16971458196640016, + "step": 169890 + }, + { + "epoch": 0.7294162094399079, + "grad_norm": 0.1747300624847412, + "learning_rate": 2.7175478385347052e-05, + "loss": 0.21624934673309326, + "step": 169900 + }, + { + "epoch": 0.729459141529928, + "grad_norm": 0.04679642617702484, + "learning_rate": 2.717116666522943e-05, + "loss": 0.09935680627822877, + "step": 169910 + }, + { + "epoch": 0.729502073619948, + "grad_norm": 0.0205056332051754, + "learning_rate": 2.7166854945111804e-05, + "loss": 0.15621496438980104, + "step": 169920 + }, + { + "epoch": 0.7295450057099679, + "grad_norm": 0.07464340329170227, + "learning_rate": 2.716254322499418e-05, + "loss": 0.27371673583984374, + "step": 169930 + }, + { + "epoch": 0.729587937799988, + "grad_norm": 5.811140537261963, + "learning_rate": 2.7158231504876558e-05, + "loss": 0.22537665367126464, + "step": 169940 + }, + { + "epoch": 0.729630869890008, + "grad_norm": 1.5015976428985596, + "learning_rate": 2.7153919784758936e-05, + "loss": 0.26116361618041994, + "step": 169950 + }, + { + "epoch": 0.729673801980028, + "grad_norm": 1.3956966400146484, + "learning_rate": 2.7149608064641306e-05, + "loss": 0.3448702573776245, + "step": 169960 + }, + { + "epoch": 0.729716734070048, + "grad_norm": 0.0037657865323126316, + "learning_rate": 2.7145296344523687e-05, + "loss": 0.23841521739959717, + "step": 169970 + }, + { + "epoch": 0.729759666160068, + "grad_norm": 0.27306094765663147, + "learning_rate": 2.7140984624406064e-05, + "loss": 0.22840194702148436, + "step": 169980 + }, + { + "epoch": 0.729802598250088, + "grad_norm": 0.36141231656074524, + "learning_rate": 2.713667290428844e-05, + "loss": 0.29284093379974363, + "step": 169990 + }, + { + "epoch": 0.729845530340108, + "grad_norm": 0.011245034635066986, + "learning_rate": 2.7132361184170812e-05, + "loss": 0.18197616338729858, + "step": 170000 + }, + { + "epoch": 0.729845530340108, + "eval_loss": 0.3767232596874237, + "eval_runtime": 27.4074, + "eval_samples_per_second": 3.649, + "eval_steps_per_second": 3.649, + "step": 170000 + }, + { + "epoch": 0.7298884624301281, + "grad_norm": 0.0031025889329612255, + "learning_rate": 2.712804946405319e-05, + "loss": 0.23458943367004395, + "step": 170010 + }, + { + "epoch": 0.729931394520148, + "grad_norm": 0.17620614171028137, + "learning_rate": 2.7123737743935567e-05, + "loss": 0.29802589416503905, + "step": 170020 + }, + { + "epoch": 0.729974326610168, + "grad_norm": 0.017262430861592293, + "learning_rate": 2.7119426023817944e-05, + "loss": 0.03175511360168457, + "step": 170030 + }, + { + "epoch": 0.7300172587001881, + "grad_norm": 0.025158502161502838, + "learning_rate": 2.7115114303700318e-05, + "loss": 0.26365683078765867, + "step": 170040 + }, + { + "epoch": 0.730060190790208, + "grad_norm": 2.2967610359191895, + "learning_rate": 2.7110802583582695e-05, + "loss": 0.32564468383789064, + "step": 170050 + }, + { + "epoch": 0.730103122880228, + "grad_norm": 1.5477705001831055, + "learning_rate": 2.7106490863465073e-05, + "loss": 0.4748509407043457, + "step": 170060 + }, + { + "epoch": 0.7301460549702481, + "grad_norm": 0.09828320890665054, + "learning_rate": 2.710217914334745e-05, + "loss": 0.136482310295105, + "step": 170070 + }, + { + "epoch": 0.730188987060268, + "grad_norm": 0.8780837655067444, + "learning_rate": 2.7097867423229824e-05, + "loss": 0.2198232650756836, + "step": 170080 + }, + { + "epoch": 0.7302319191502881, + "grad_norm": 1.8543720245361328, + "learning_rate": 2.70935557031122e-05, + "loss": 0.18494052886962892, + "step": 170090 + }, + { + "epoch": 0.7302748512403081, + "grad_norm": 0.2783847451210022, + "learning_rate": 2.708924398299458e-05, + "loss": 0.053096276521682736, + "step": 170100 + }, + { + "epoch": 0.730317783330328, + "grad_norm": 0.0045045046135783195, + "learning_rate": 2.7084932262876956e-05, + "loss": 0.3136617660522461, + "step": 170110 + }, + { + "epoch": 0.7303607154203481, + "grad_norm": 0.3513777256011963, + "learning_rate": 2.7080620542759327e-05, + "loss": 0.12250692844390869, + "step": 170120 + }, + { + "epoch": 0.7304036475103681, + "grad_norm": 2.300067663192749, + "learning_rate": 2.7076308822641704e-05, + "loss": 0.07853416204452515, + "step": 170130 + }, + { + "epoch": 0.7304465796003881, + "grad_norm": 0.018564164638519287, + "learning_rate": 2.707199710252408e-05, + "loss": 0.12981711626052855, + "step": 170140 + }, + { + "epoch": 0.7304895116904081, + "grad_norm": 0.6602355241775513, + "learning_rate": 2.706768538240646e-05, + "loss": 0.4602662086486816, + "step": 170150 + }, + { + "epoch": 0.7305324437804281, + "grad_norm": 0.04296904429793358, + "learning_rate": 2.7063373662288833e-05, + "loss": 0.024672232568264008, + "step": 170160 + }, + { + "epoch": 0.7305753758704481, + "grad_norm": 0.08935589343309402, + "learning_rate": 2.705906194217121e-05, + "loss": 0.05151203870773315, + "step": 170170 + }, + { + "epoch": 0.7306183079604681, + "grad_norm": 1.2046279907226562, + "learning_rate": 2.7054750222053587e-05, + "loss": 0.11995961666107177, + "step": 170180 + }, + { + "epoch": 0.7306612400504882, + "grad_norm": 0.007651221007108688, + "learning_rate": 2.7050438501935965e-05, + "loss": 0.17691102027893066, + "step": 170190 + }, + { + "epoch": 0.7307041721405081, + "grad_norm": 0.016610870137810707, + "learning_rate": 2.704612678181834e-05, + "loss": 0.23532278537750245, + "step": 170200 + }, + { + "epoch": 0.7307471042305281, + "grad_norm": 0.010207589715719223, + "learning_rate": 2.7041815061700716e-05, + "loss": 0.19673173427581786, + "step": 170210 + }, + { + "epoch": 0.7307900363205482, + "grad_norm": 2.705111503601074, + "learning_rate": 2.7037503341583093e-05, + "loss": 0.15226796865463257, + "step": 170220 + }, + { + "epoch": 0.7308329684105682, + "grad_norm": 0.0040432969108223915, + "learning_rate": 2.703319162146547e-05, + "loss": 0.5022855758666992, + "step": 170230 + }, + { + "epoch": 0.7308759005005881, + "grad_norm": 0.009334494359791279, + "learning_rate": 2.7028879901347848e-05, + "loss": 0.08118436336517335, + "step": 170240 + }, + { + "epoch": 0.7309188325906082, + "grad_norm": 1.2461107969284058, + "learning_rate": 2.702456818123022e-05, + "loss": 0.3561758041381836, + "step": 170250 + }, + { + "epoch": 0.7309617646806282, + "grad_norm": 0.1575508862733841, + "learning_rate": 2.7020256461112596e-05, + "loss": 0.3139577627182007, + "step": 170260 + }, + { + "epoch": 0.7310046967706482, + "grad_norm": 0.013691013678908348, + "learning_rate": 2.7015944740994977e-05, + "loss": 0.15196446180343628, + "step": 170270 + }, + { + "epoch": 0.7310476288606682, + "grad_norm": 0.03952275961637497, + "learning_rate": 2.7011633020877354e-05, + "loss": 0.2643216371536255, + "step": 170280 + }, + { + "epoch": 0.7310905609506883, + "grad_norm": 0.08539864420890808, + "learning_rate": 2.7007321300759725e-05, + "loss": 0.2871419906616211, + "step": 170290 + }, + { + "epoch": 0.7311334930407082, + "grad_norm": 0.0039503430016338825, + "learning_rate": 2.7003009580642102e-05, + "loss": 0.19403316974639892, + "step": 170300 + }, + { + "epoch": 0.7311764251307282, + "grad_norm": 4.306076526641846, + "learning_rate": 2.699869786052448e-05, + "loss": 0.43187770843505857, + "step": 170310 + }, + { + "epoch": 0.7312193572207483, + "grad_norm": 0.0012534123379737139, + "learning_rate": 2.6994386140406857e-05, + "loss": 0.2351520299911499, + "step": 170320 + }, + { + "epoch": 0.7312622893107682, + "grad_norm": 1.3385637998580933, + "learning_rate": 2.699007442028923e-05, + "loss": 0.1445598840713501, + "step": 170330 + }, + { + "epoch": 0.7313052214007882, + "grad_norm": 0.6701360940933228, + "learning_rate": 2.6985762700171608e-05, + "loss": 0.12373219728469849, + "step": 170340 + }, + { + "epoch": 0.7313481534908083, + "grad_norm": 0.5813913345336914, + "learning_rate": 2.6981450980053985e-05, + "loss": 0.12824459075927735, + "step": 170350 + }, + { + "epoch": 0.7313910855808282, + "grad_norm": 0.10391009598970413, + "learning_rate": 2.6977139259936363e-05, + "loss": 0.3668433904647827, + "step": 170360 + }, + { + "epoch": 0.7314340176708483, + "grad_norm": 0.47627198696136475, + "learning_rate": 2.6972827539818733e-05, + "loss": 0.1359683632850647, + "step": 170370 + }, + { + "epoch": 0.7314769497608683, + "grad_norm": 0.030919019132852554, + "learning_rate": 2.6968515819701114e-05, + "loss": 0.2007523775100708, + "step": 170380 + }, + { + "epoch": 0.7315198818508882, + "grad_norm": 1.0183385610580444, + "learning_rate": 2.696420409958349e-05, + "loss": 0.1809418797492981, + "step": 170390 + }, + { + "epoch": 0.7315628139409083, + "grad_norm": 0.40025752782821655, + "learning_rate": 2.695989237946587e-05, + "loss": 0.1921942114830017, + "step": 170400 + }, + { + "epoch": 0.7316057460309283, + "grad_norm": 1.7565090656280518, + "learning_rate": 2.695558065934824e-05, + "loss": 0.1617509365081787, + "step": 170410 + }, + { + "epoch": 0.7316486781209482, + "grad_norm": 0.007016188930720091, + "learning_rate": 2.6951268939230616e-05, + "loss": 0.02748640179634094, + "step": 170420 + }, + { + "epoch": 0.7316916102109683, + "grad_norm": 2.187925100326538, + "learning_rate": 2.6946957219112994e-05, + "loss": 0.30386340618133545, + "step": 170430 + }, + { + "epoch": 0.7317345423009883, + "grad_norm": 2.4722328186035156, + "learning_rate": 2.694264549899537e-05, + "loss": 0.2640492916107178, + "step": 170440 + }, + { + "epoch": 0.7317774743910083, + "grad_norm": 0.014905155636370182, + "learning_rate": 2.6938333778877745e-05, + "loss": 0.24466726779937745, + "step": 170450 + }, + { + "epoch": 0.7318204064810283, + "grad_norm": 5.93389892578125, + "learning_rate": 2.6934022058760122e-05, + "loss": 0.4935251235961914, + "step": 170460 + }, + { + "epoch": 0.7318633385710483, + "grad_norm": 1.4191564321517944, + "learning_rate": 2.69297103386425e-05, + "loss": 0.3071990728378296, + "step": 170470 + }, + { + "epoch": 0.7319062706610683, + "grad_norm": 0.01276042778044939, + "learning_rate": 2.6925398618524877e-05, + "loss": 0.15078266859054565, + "step": 170480 + }, + { + "epoch": 0.7319492027510883, + "grad_norm": 0.004253904335200787, + "learning_rate": 2.692108689840725e-05, + "loss": 0.1008331298828125, + "step": 170490 + }, + { + "epoch": 0.7319921348411084, + "grad_norm": 0.9740898609161377, + "learning_rate": 2.691677517828963e-05, + "loss": 0.3161968231201172, + "step": 170500 + }, + { + "epoch": 0.7320350669311283, + "grad_norm": 31.598270416259766, + "learning_rate": 2.6912463458172006e-05, + "loss": 0.20869870185852052, + "step": 170510 + }, + { + "epoch": 0.7320779990211483, + "grad_norm": 0.015376714058220387, + "learning_rate": 2.6908151738054383e-05, + "loss": 0.09759688377380371, + "step": 170520 + }, + { + "epoch": 0.7321209311111684, + "grad_norm": 0.013746547512710094, + "learning_rate": 2.6903840017936754e-05, + "loss": 0.05905347466468811, + "step": 170530 + }, + { + "epoch": 0.7321638632011883, + "grad_norm": 1.2723731994628906, + "learning_rate": 2.689952829781913e-05, + "loss": 0.159501314163208, + "step": 170540 + }, + { + "epoch": 0.7322067952912084, + "grad_norm": 3.711704730987549, + "learning_rate": 2.689521657770151e-05, + "loss": 0.30540282726287843, + "step": 170550 + }, + { + "epoch": 0.7322497273812284, + "grad_norm": 1.8933461904525757, + "learning_rate": 2.6890904857583886e-05, + "loss": 0.17268972396850585, + "step": 170560 + }, + { + "epoch": 0.7322926594712483, + "grad_norm": 0.024054287001490593, + "learning_rate": 2.688659313746626e-05, + "loss": 0.2180922269821167, + "step": 170570 + }, + { + "epoch": 0.7323355915612684, + "grad_norm": 0.0017070991452783346, + "learning_rate": 2.6882281417348637e-05, + "loss": 0.3071908473968506, + "step": 170580 + }, + { + "epoch": 0.7323785236512884, + "grad_norm": 0.015459275804460049, + "learning_rate": 2.6877969697231014e-05, + "loss": 0.10010931491851807, + "step": 170590 + }, + { + "epoch": 0.7324214557413083, + "grad_norm": 0.25705915689468384, + "learning_rate": 2.687365797711339e-05, + "loss": 0.20313713550567628, + "step": 170600 + }, + { + "epoch": 0.7324643878313284, + "grad_norm": 1.2373805046081543, + "learning_rate": 2.686934625699577e-05, + "loss": 0.12128767967224122, + "step": 170610 + }, + { + "epoch": 0.7325073199213484, + "grad_norm": 0.06745709478855133, + "learning_rate": 2.6865034536878143e-05, + "loss": 0.10555492639541626, + "step": 170620 + }, + { + "epoch": 0.7325502520113684, + "grad_norm": 0.22075150907039642, + "learning_rate": 2.686072281676052e-05, + "loss": 0.21621780395507811, + "step": 170630 + }, + { + "epoch": 0.7325931841013884, + "grad_norm": 0.013154013082385063, + "learning_rate": 2.6856411096642898e-05, + "loss": 0.25960729122161863, + "step": 170640 + }, + { + "epoch": 0.7326361161914084, + "grad_norm": 0.006061031948775053, + "learning_rate": 2.6852099376525275e-05, + "loss": 0.10240201950073242, + "step": 170650 + }, + { + "epoch": 0.7326790482814285, + "grad_norm": 0.2058708518743515, + "learning_rate": 2.6847787656407646e-05, + "loss": 0.16383315324783326, + "step": 170660 + }, + { + "epoch": 0.7327219803714484, + "grad_norm": 0.01506069302558899, + "learning_rate": 2.6843475936290023e-05, + "loss": 0.15489013195037843, + "step": 170670 + }, + { + "epoch": 0.7327649124614685, + "grad_norm": 0.002717470284551382, + "learning_rate": 2.6839164216172404e-05, + "loss": 0.1274445176124573, + "step": 170680 + }, + { + "epoch": 0.7328078445514885, + "grad_norm": 3.37857985496521, + "learning_rate": 2.683485249605478e-05, + "loss": 0.3458231210708618, + "step": 170690 + }, + { + "epoch": 0.7328507766415084, + "grad_norm": 0.07261750847101212, + "learning_rate": 2.683054077593715e-05, + "loss": 0.05073615312576294, + "step": 170700 + }, + { + "epoch": 0.7328937087315285, + "grad_norm": 0.009463918395340443, + "learning_rate": 2.682622905581953e-05, + "loss": 0.25365068912506106, + "step": 170710 + }, + { + "epoch": 0.7329366408215485, + "grad_norm": 1.966654896736145, + "learning_rate": 2.6821917335701906e-05, + "loss": 0.1966190218925476, + "step": 170720 + }, + { + "epoch": 0.7329795729115685, + "grad_norm": 0.0020303381606936455, + "learning_rate": 2.6817605615584284e-05, + "loss": 0.28710646629333497, + "step": 170730 + }, + { + "epoch": 0.7330225050015885, + "grad_norm": 0.01680716685950756, + "learning_rate": 2.6813293895466658e-05, + "loss": 0.30534236431121825, + "step": 170740 + }, + { + "epoch": 0.7330654370916085, + "grad_norm": 0.021328696981072426, + "learning_rate": 2.6808982175349035e-05, + "loss": 0.08837099671363831, + "step": 170750 + }, + { + "epoch": 0.7331083691816285, + "grad_norm": 0.027719993144273758, + "learning_rate": 2.6804670455231412e-05, + "loss": 0.22146189212799072, + "step": 170760 + }, + { + "epoch": 0.7331513012716485, + "grad_norm": 57.73662185668945, + "learning_rate": 2.680035873511379e-05, + "loss": 0.2809849977493286, + "step": 170770 + }, + { + "epoch": 0.7331942333616686, + "grad_norm": 0.003101927461102605, + "learning_rate": 2.679604701499616e-05, + "loss": 0.16726034879684448, + "step": 170780 + }, + { + "epoch": 0.7332371654516885, + "grad_norm": 0.00422689039260149, + "learning_rate": 2.679173529487854e-05, + "loss": 0.40579957962036134, + "step": 170790 + }, + { + "epoch": 0.7332800975417085, + "grad_norm": 0.05328085273504257, + "learning_rate": 2.6787423574760918e-05, + "loss": 0.19401125907897948, + "step": 170800 + }, + { + "epoch": 0.7333230296317286, + "grad_norm": 3.178964614868164, + "learning_rate": 2.6783111854643296e-05, + "loss": 0.33956422805786135, + "step": 170810 + }, + { + "epoch": 0.7333659617217485, + "grad_norm": 0.07042407989501953, + "learning_rate": 2.6778800134525666e-05, + "loss": 0.18467453718185425, + "step": 170820 + }, + { + "epoch": 0.7334088938117685, + "grad_norm": 2.205892324447632, + "learning_rate": 2.6774488414408043e-05, + "loss": 0.14611732959747314, + "step": 170830 + }, + { + "epoch": 0.7334518259017886, + "grad_norm": 0.9493621587753296, + "learning_rate": 2.677017669429042e-05, + "loss": 0.26489179134368895, + "step": 170840 + }, + { + "epoch": 0.7334947579918085, + "grad_norm": 0.11686643213033676, + "learning_rate": 2.6765864974172798e-05, + "loss": 0.03403322398662567, + "step": 170850 + }, + { + "epoch": 0.7335376900818286, + "grad_norm": 0.1313803642988205, + "learning_rate": 2.6761553254055172e-05, + "loss": 0.14360102415084838, + "step": 170860 + }, + { + "epoch": 0.7335806221718486, + "grad_norm": 0.22880996763706207, + "learning_rate": 2.675724153393755e-05, + "loss": 0.08690401315689086, + "step": 170870 + }, + { + "epoch": 0.7336235542618685, + "grad_norm": 0.1637190282344818, + "learning_rate": 2.6752929813819927e-05, + "loss": 0.15708295106887818, + "step": 170880 + }, + { + "epoch": 0.7336664863518886, + "grad_norm": 0.02968779020011425, + "learning_rate": 2.6748618093702304e-05, + "loss": 0.19256193637847902, + "step": 170890 + }, + { + "epoch": 0.7337094184419086, + "grad_norm": 1.3331794738769531, + "learning_rate": 2.6744306373584678e-05, + "loss": 0.15403072834014891, + "step": 170900 + }, + { + "epoch": 0.7337523505319286, + "grad_norm": 22.366769790649414, + "learning_rate": 2.6739994653467055e-05, + "loss": 0.08723581433296204, + "step": 170910 + }, + { + "epoch": 0.7337952826219486, + "grad_norm": 2.587179660797119, + "learning_rate": 2.6735682933349433e-05, + "loss": 0.16030250787734984, + "step": 170920 + }, + { + "epoch": 0.7338382147119686, + "grad_norm": 0.016467789188027382, + "learning_rate": 2.673137121323181e-05, + "loss": 0.20831329822540284, + "step": 170930 + }, + { + "epoch": 0.7338811468019886, + "grad_norm": 1.1086673736572266, + "learning_rate": 2.672705949311418e-05, + "loss": 0.462873649597168, + "step": 170940 + }, + { + "epoch": 0.7339240788920086, + "grad_norm": 0.0017751099076122046, + "learning_rate": 2.6722747772996558e-05, + "loss": 0.13645445108413695, + "step": 170950 + }, + { + "epoch": 0.7339670109820287, + "grad_norm": 0.8315486907958984, + "learning_rate": 2.6718436052878935e-05, + "loss": 0.45193896293640134, + "step": 170960 + }, + { + "epoch": 0.7340099430720486, + "grad_norm": 0.0014380532084032893, + "learning_rate": 2.6714124332761316e-05, + "loss": 0.3336472988128662, + "step": 170970 + }, + { + "epoch": 0.7340528751620686, + "grad_norm": 0.3532212972640991, + "learning_rate": 2.6709812612643693e-05, + "loss": 0.1193724513053894, + "step": 170980 + }, + { + "epoch": 0.7340958072520887, + "grad_norm": 0.023362183943390846, + "learning_rate": 2.6705500892526064e-05, + "loss": 0.30927510261535646, + "step": 170990 + }, + { + "epoch": 0.7341387393421086, + "grad_norm": 0.012097061611711979, + "learning_rate": 2.670118917240844e-05, + "loss": 0.1073559045791626, + "step": 171000 + }, + { + "epoch": 0.7341387393421086, + "eval_loss": 0.39417701959609985, + "eval_runtime": 27.5821, + "eval_samples_per_second": 3.626, + "eval_steps_per_second": 3.626, + "step": 171000 + }, + { + "epoch": 0.7341816714321286, + "grad_norm": 0.09236214309930801, + "learning_rate": 2.669687745229082e-05, + "loss": 0.31250853538513185, + "step": 171010 + }, + { + "epoch": 0.7342246035221487, + "grad_norm": 0.05382629483938217, + "learning_rate": 2.6692565732173196e-05, + "loss": 0.048319220542907715, + "step": 171020 + }, + { + "epoch": 0.7342675356121686, + "grad_norm": 0.6148742437362671, + "learning_rate": 2.668825401205557e-05, + "loss": 0.0924647569656372, + "step": 171030 + }, + { + "epoch": 0.7343104677021887, + "grad_norm": 1.4304864406585693, + "learning_rate": 2.6683942291937947e-05, + "loss": 0.20467443466186525, + "step": 171040 + }, + { + "epoch": 0.7343533997922087, + "grad_norm": 1.1778323650360107, + "learning_rate": 2.6679630571820325e-05, + "loss": 0.5647805690765381, + "step": 171050 + }, + { + "epoch": 0.7343963318822286, + "grad_norm": 0.8232007026672363, + "learning_rate": 2.6675318851702702e-05, + "loss": 0.09155967235565185, + "step": 171060 + }, + { + "epoch": 0.7344392639722487, + "grad_norm": 0.0669042244553566, + "learning_rate": 2.6671007131585073e-05, + "loss": 0.1859180212020874, + "step": 171070 + }, + { + "epoch": 0.7344821960622687, + "grad_norm": 0.005053384695202112, + "learning_rate": 2.6666695411467453e-05, + "loss": 0.12264673709869385, + "step": 171080 + }, + { + "epoch": 0.7345251281522888, + "grad_norm": 0.9383001327514648, + "learning_rate": 2.666238369134983e-05, + "loss": 0.3935052156448364, + "step": 171090 + }, + { + "epoch": 0.7345680602423087, + "grad_norm": 0.0015224060043692589, + "learning_rate": 2.6658071971232208e-05, + "loss": 0.21541233062744142, + "step": 171100 + }, + { + "epoch": 0.7346109923323287, + "grad_norm": 0.3329278826713562, + "learning_rate": 2.665376025111458e-05, + "loss": 0.3047566652297974, + "step": 171110 + }, + { + "epoch": 0.7346539244223488, + "grad_norm": 0.7561929225921631, + "learning_rate": 2.6649448530996956e-05, + "loss": 0.30191969871520996, + "step": 171120 + }, + { + "epoch": 0.7346968565123687, + "grad_norm": 0.03231775388121605, + "learning_rate": 2.6645136810879333e-05, + "loss": 0.10164375305175781, + "step": 171130 + }, + { + "epoch": 0.7347397886023888, + "grad_norm": 0.06257763504981995, + "learning_rate": 2.664082509076171e-05, + "loss": 0.18539004325866698, + "step": 171140 + }, + { + "epoch": 0.7347827206924088, + "grad_norm": 1.2411458492279053, + "learning_rate": 2.6636513370644085e-05, + "loss": 0.2938519477844238, + "step": 171150 + }, + { + "epoch": 0.7348256527824287, + "grad_norm": 0.11264359205961227, + "learning_rate": 2.6632201650526462e-05, + "loss": 0.12792333364486694, + "step": 171160 + }, + { + "epoch": 0.7348685848724488, + "grad_norm": 0.006919489707797766, + "learning_rate": 2.662788993040884e-05, + "loss": 0.4435697555541992, + "step": 171170 + }, + { + "epoch": 0.7349115169624688, + "grad_norm": 0.014986931346356869, + "learning_rate": 2.6623578210291217e-05, + "loss": 0.0587616503238678, + "step": 171180 + }, + { + "epoch": 0.7349544490524887, + "grad_norm": 0.00439242459833622, + "learning_rate": 2.661926649017359e-05, + "loss": 0.12553696632385253, + "step": 171190 + }, + { + "epoch": 0.7349973811425088, + "grad_norm": 0.006219713948667049, + "learning_rate": 2.6614954770055968e-05, + "loss": 0.5123360633850098, + "step": 171200 + }, + { + "epoch": 0.7350403132325288, + "grad_norm": 0.004228357691317797, + "learning_rate": 2.6610643049938345e-05, + "loss": 0.05139904618263245, + "step": 171210 + }, + { + "epoch": 0.7350832453225488, + "grad_norm": 2.0546998977661133, + "learning_rate": 2.6606331329820723e-05, + "loss": 0.10472931861877441, + "step": 171220 + }, + { + "epoch": 0.7351261774125688, + "grad_norm": 0.42057308554649353, + "learning_rate": 2.6602019609703093e-05, + "loss": 0.1874741792678833, + "step": 171230 + }, + { + "epoch": 0.7351691095025888, + "grad_norm": 2.450517177581787, + "learning_rate": 2.659770788958547e-05, + "loss": 0.2614566802978516, + "step": 171240 + }, + { + "epoch": 0.7352120415926088, + "grad_norm": 0.06705211102962494, + "learning_rate": 2.6593396169467848e-05, + "loss": 0.288163161277771, + "step": 171250 + }, + { + "epoch": 0.7352549736826288, + "grad_norm": 0.570608377456665, + "learning_rate": 2.6589084449350225e-05, + "loss": 0.1233750820159912, + "step": 171260 + }, + { + "epoch": 0.7352979057726489, + "grad_norm": 2.294008493423462, + "learning_rate": 2.65847727292326e-05, + "loss": 0.3664525508880615, + "step": 171270 + }, + { + "epoch": 0.7353408378626688, + "grad_norm": 0.11099325120449066, + "learning_rate": 2.6580461009114976e-05, + "loss": 0.34699676036834715, + "step": 171280 + }, + { + "epoch": 0.7353837699526888, + "grad_norm": 0.1224088966846466, + "learning_rate": 2.6576149288997354e-05, + "loss": 0.07544822692871093, + "step": 171290 + }, + { + "epoch": 0.7354267020427089, + "grad_norm": 0.09361089766025543, + "learning_rate": 2.657183756887973e-05, + "loss": 0.44263458251953125, + "step": 171300 + }, + { + "epoch": 0.7354696341327288, + "grad_norm": 4.612122535705566, + "learning_rate": 2.6567525848762105e-05, + "loss": 0.26591827869415285, + "step": 171310 + }, + { + "epoch": 0.7355125662227489, + "grad_norm": 0.023752661421895027, + "learning_rate": 2.6563214128644482e-05, + "loss": 0.1700994849205017, + "step": 171320 + }, + { + "epoch": 0.7355554983127689, + "grad_norm": 0.03229743242263794, + "learning_rate": 2.655890240852686e-05, + "loss": 0.3263807535171509, + "step": 171330 + }, + { + "epoch": 0.7355984304027888, + "grad_norm": 0.003039776347577572, + "learning_rate": 2.6554590688409237e-05, + "loss": 0.2008406400680542, + "step": 171340 + }, + { + "epoch": 0.7356413624928089, + "grad_norm": 0.18037524819374084, + "learning_rate": 2.6550278968291614e-05, + "loss": 0.212782883644104, + "step": 171350 + }, + { + "epoch": 0.7356842945828289, + "grad_norm": 0.010732796974480152, + "learning_rate": 2.6545967248173985e-05, + "loss": 0.21220669746398926, + "step": 171360 + }, + { + "epoch": 0.7357272266728488, + "grad_norm": 0.00419106800109148, + "learning_rate": 2.6541655528056362e-05, + "loss": 0.15579986572265625, + "step": 171370 + }, + { + "epoch": 0.7357701587628689, + "grad_norm": 1.4872262477874756, + "learning_rate": 2.6537343807938743e-05, + "loss": 0.359185528755188, + "step": 171380 + }, + { + "epoch": 0.7358130908528889, + "grad_norm": 0.031842347234487534, + "learning_rate": 2.653303208782112e-05, + "loss": 0.14681037664413452, + "step": 171390 + }, + { + "epoch": 0.7358560229429089, + "grad_norm": 0.03504836559295654, + "learning_rate": 2.652872036770349e-05, + "loss": 0.18587688207626343, + "step": 171400 + }, + { + "epoch": 0.7358989550329289, + "grad_norm": 1.9147745370864868, + "learning_rate": 2.652440864758587e-05, + "loss": 0.15949387550354005, + "step": 171410 + }, + { + "epoch": 0.735941887122949, + "grad_norm": 0.004897118546068668, + "learning_rate": 2.6520096927468246e-05, + "loss": 0.2251277208328247, + "step": 171420 + }, + { + "epoch": 0.7359848192129689, + "grad_norm": 0.6692392826080322, + "learning_rate": 2.6515785207350623e-05, + "loss": 0.19807039499282836, + "step": 171430 + }, + { + "epoch": 0.7360277513029889, + "grad_norm": 0.005262289196252823, + "learning_rate": 2.6511473487232997e-05, + "loss": 0.23101203441619872, + "step": 171440 + }, + { + "epoch": 0.736070683393009, + "grad_norm": 1.8423153162002563, + "learning_rate": 2.6507161767115374e-05, + "loss": 0.38815784454345703, + "step": 171450 + }, + { + "epoch": 0.7361136154830289, + "grad_norm": 0.03533385694026947, + "learning_rate": 2.650285004699775e-05, + "loss": 0.1562308430671692, + "step": 171460 + }, + { + "epoch": 0.7361565475730489, + "grad_norm": 0.010025018826127052, + "learning_rate": 2.649853832688013e-05, + "loss": 0.10853803157806396, + "step": 171470 + }, + { + "epoch": 0.736199479663069, + "grad_norm": 0.0056663258001208305, + "learning_rate": 2.64942266067625e-05, + "loss": 0.18904753923416137, + "step": 171480 + }, + { + "epoch": 0.7362424117530889, + "grad_norm": 0.09215538948774338, + "learning_rate": 2.648991488664488e-05, + "loss": 0.2665423393249512, + "step": 171490 + }, + { + "epoch": 0.736285343843109, + "grad_norm": 1.4893046617507935, + "learning_rate": 2.6485603166527258e-05, + "loss": 0.17237144708633423, + "step": 171500 + }, + { + "epoch": 0.736328275933129, + "grad_norm": 0.1289806365966797, + "learning_rate": 2.6481291446409635e-05, + "loss": 0.10626275539398193, + "step": 171510 + }, + { + "epoch": 0.736371208023149, + "grad_norm": 0.02575540356338024, + "learning_rate": 2.6476979726292006e-05, + "loss": 0.1492064118385315, + "step": 171520 + }, + { + "epoch": 0.736414140113169, + "grad_norm": 2.737734079360962, + "learning_rate": 2.6472668006174383e-05, + "loss": 0.3282522439956665, + "step": 171530 + }, + { + "epoch": 0.736457072203189, + "grad_norm": 0.1628538817167282, + "learning_rate": 2.646835628605676e-05, + "loss": 0.011172150075435639, + "step": 171540 + }, + { + "epoch": 0.736500004293209, + "grad_norm": 0.026045171543955803, + "learning_rate": 2.6464044565939138e-05, + "loss": 0.2815445899963379, + "step": 171550 + }, + { + "epoch": 0.736542936383229, + "grad_norm": 5.577436923980713, + "learning_rate": 2.645973284582151e-05, + "loss": 0.16396323442459107, + "step": 171560 + }, + { + "epoch": 0.736585868473249, + "grad_norm": 1.4694331884384155, + "learning_rate": 2.645542112570389e-05, + "loss": 0.10221610069274903, + "step": 171570 + }, + { + "epoch": 0.7366288005632691, + "grad_norm": 2.34641432762146, + "learning_rate": 2.6451109405586266e-05, + "loss": 0.18182029724121093, + "step": 171580 + }, + { + "epoch": 0.736671732653289, + "grad_norm": 0.022412387654185295, + "learning_rate": 2.6446797685468644e-05, + "loss": 0.26555168628692627, + "step": 171590 + }, + { + "epoch": 0.736714664743309, + "grad_norm": 0.12042602151632309, + "learning_rate": 2.6442485965351018e-05, + "loss": 0.32265405654907225, + "step": 171600 + }, + { + "epoch": 0.7367575968333291, + "grad_norm": 0.16809596121311188, + "learning_rate": 2.6438174245233395e-05, + "loss": 0.1988927125930786, + "step": 171610 + }, + { + "epoch": 0.736800528923349, + "grad_norm": 0.004430527798831463, + "learning_rate": 2.6433862525115772e-05, + "loss": 0.32211244106292725, + "step": 171620 + }, + { + "epoch": 0.7368434610133691, + "grad_norm": 0.04840118810534477, + "learning_rate": 2.642955080499815e-05, + "loss": 0.10191557407379151, + "step": 171630 + }, + { + "epoch": 0.7368863931033891, + "grad_norm": 0.8072931170463562, + "learning_rate": 2.642523908488052e-05, + "loss": 0.24994516372680664, + "step": 171640 + }, + { + "epoch": 0.736929325193409, + "grad_norm": 0.7125005722045898, + "learning_rate": 2.6420927364762897e-05, + "loss": 0.04099064767360687, + "step": 171650 + }, + { + "epoch": 0.7369722572834291, + "grad_norm": 0.011734679341316223, + "learning_rate": 2.6416615644645275e-05, + "loss": 0.15249698162078856, + "step": 171660 + }, + { + "epoch": 0.7370151893734491, + "grad_norm": 2.276210069656372, + "learning_rate": 2.6412303924527652e-05, + "loss": 0.25595483779907224, + "step": 171670 + }, + { + "epoch": 0.737058121463469, + "grad_norm": 0.00881089735776186, + "learning_rate": 2.6407992204410026e-05, + "loss": 0.4060102939605713, + "step": 171680 + }, + { + "epoch": 0.7371010535534891, + "grad_norm": 4.0960774421691895, + "learning_rate": 2.6403680484292403e-05, + "loss": 0.28471989631652833, + "step": 171690 + }, + { + "epoch": 0.7371439856435091, + "grad_norm": 1.0635778903961182, + "learning_rate": 2.639936876417478e-05, + "loss": 0.30349667072296144, + "step": 171700 + }, + { + "epoch": 0.7371869177335291, + "grad_norm": 2.140429973602295, + "learning_rate": 2.6395057044057158e-05, + "loss": 0.22982544898986818, + "step": 171710 + }, + { + "epoch": 0.7372298498235491, + "grad_norm": 3.2402894496917725, + "learning_rate": 2.6390745323939536e-05, + "loss": 0.32609105110168457, + "step": 171720 + }, + { + "epoch": 0.7372727819135692, + "grad_norm": 1.8764134645462036, + "learning_rate": 2.638643360382191e-05, + "loss": 0.28361532688140867, + "step": 171730 + }, + { + "epoch": 0.7373157140035891, + "grad_norm": 0.014295602217316628, + "learning_rate": 2.6382121883704287e-05, + "loss": 0.1151078224182129, + "step": 171740 + }, + { + "epoch": 0.7373586460936091, + "grad_norm": 0.0026767903473228216, + "learning_rate": 2.6377810163586664e-05, + "loss": 0.2339120626449585, + "step": 171750 + }, + { + "epoch": 0.7374015781836292, + "grad_norm": 0.02163386344909668, + "learning_rate": 2.637349844346904e-05, + "loss": 0.19625715017318726, + "step": 171760 + }, + { + "epoch": 0.7374445102736491, + "grad_norm": 0.07742445170879364, + "learning_rate": 2.6369186723351412e-05, + "loss": 0.05881250500679016, + "step": 171770 + }, + { + "epoch": 0.7374874423636691, + "grad_norm": 1.3166648149490356, + "learning_rate": 2.636487500323379e-05, + "loss": 0.28442997932434083, + "step": 171780 + }, + { + "epoch": 0.7375303744536892, + "grad_norm": 0.334449827671051, + "learning_rate": 2.636056328311617e-05, + "loss": 0.19606301784515381, + "step": 171790 + }, + { + "epoch": 0.7375733065437091, + "grad_norm": 0.011855477467179298, + "learning_rate": 2.6356251562998547e-05, + "loss": 0.13084664344787597, + "step": 171800 + }, + { + "epoch": 0.7376162386337292, + "grad_norm": 10.077703475952148, + "learning_rate": 2.6351939842880918e-05, + "loss": 0.3036245584487915, + "step": 171810 + }, + { + "epoch": 0.7376591707237492, + "grad_norm": 1.4328504800796509, + "learning_rate": 2.6347628122763295e-05, + "loss": 0.23132739067077637, + "step": 171820 + }, + { + "epoch": 0.7377021028137691, + "grad_norm": 0.11290911585092545, + "learning_rate": 2.6343316402645673e-05, + "loss": 0.15605651140213012, + "step": 171830 + }, + { + "epoch": 0.7377450349037892, + "grad_norm": 0.06027461960911751, + "learning_rate": 2.633900468252805e-05, + "loss": 0.12230217456817627, + "step": 171840 + }, + { + "epoch": 0.7377879669938092, + "grad_norm": 2.58107328414917, + "learning_rate": 2.6334692962410424e-05, + "loss": 0.10988858938217164, + "step": 171850 + }, + { + "epoch": 0.7378308990838292, + "grad_norm": 1.151477575302124, + "learning_rate": 2.63303812422928e-05, + "loss": 0.3149649381637573, + "step": 171860 + }, + { + "epoch": 0.7378738311738492, + "grad_norm": 1.9065663814544678, + "learning_rate": 2.632606952217518e-05, + "loss": 0.3236576318740845, + "step": 171870 + }, + { + "epoch": 0.7379167632638692, + "grad_norm": 1.5041933059692383, + "learning_rate": 2.6321757802057556e-05, + "loss": 0.2537533283233643, + "step": 171880 + }, + { + "epoch": 0.7379596953538892, + "grad_norm": 0.03302591294050217, + "learning_rate": 2.6317446081939927e-05, + "loss": 0.05842955112457275, + "step": 171890 + }, + { + "epoch": 0.7380026274439092, + "grad_norm": 1.3000556230545044, + "learning_rate": 2.6313134361822307e-05, + "loss": 0.472438907623291, + "step": 171900 + }, + { + "epoch": 0.7380455595339293, + "grad_norm": 0.2630452513694763, + "learning_rate": 2.6308822641704685e-05, + "loss": 0.1062608003616333, + "step": 171910 + }, + { + "epoch": 0.7380884916239492, + "grad_norm": 2.648360013961792, + "learning_rate": 2.6304510921587062e-05, + "loss": 0.10505330562591553, + "step": 171920 + }, + { + "epoch": 0.7381314237139692, + "grad_norm": 0.03722335025668144, + "learning_rate": 2.6300199201469433e-05, + "loss": 0.14387372732162476, + "step": 171930 + }, + { + "epoch": 0.7381743558039893, + "grad_norm": 0.010214372538030148, + "learning_rate": 2.629588748135181e-05, + "loss": 0.052946603298187254, + "step": 171940 + }, + { + "epoch": 0.7382172878940093, + "grad_norm": 0.016190167516469955, + "learning_rate": 2.6291575761234187e-05, + "loss": 0.11722316741943359, + "step": 171950 + }, + { + "epoch": 0.7382602199840292, + "grad_norm": 0.11385954916477203, + "learning_rate": 2.6287264041116565e-05, + "loss": 0.4546807289123535, + "step": 171960 + }, + { + "epoch": 0.7383031520740493, + "grad_norm": 0.030435390770435333, + "learning_rate": 2.628295232099894e-05, + "loss": 0.23353421688079834, + "step": 171970 + }, + { + "epoch": 0.7383460841640693, + "grad_norm": 4.89837121963501, + "learning_rate": 2.6278640600881316e-05, + "loss": 0.15024209022521973, + "step": 171980 + }, + { + "epoch": 0.7383890162540893, + "grad_norm": 0.031725313514471054, + "learning_rate": 2.6274328880763693e-05, + "loss": 0.3565608024597168, + "step": 171990 + }, + { + "epoch": 0.7384319483441093, + "grad_norm": 0.5686355829238892, + "learning_rate": 2.627001716064607e-05, + "loss": 0.2020179271697998, + "step": 172000 + }, + { + "epoch": 0.7384319483441093, + "eval_loss": 0.3913794457912445, + "eval_runtime": 27.4732, + "eval_samples_per_second": 3.64, + "eval_steps_per_second": 3.64, + "step": 172000 + }, + { + "epoch": 0.7384748804341293, + "grad_norm": 1.6541748046875, + "learning_rate": 2.6265705440528445e-05, + "loss": 0.05306849479675293, + "step": 172010 + }, + { + "epoch": 0.7385178125241493, + "grad_norm": 5.37670373916626, + "learning_rate": 2.6261393720410822e-05, + "loss": 0.21729233264923095, + "step": 172020 + }, + { + "epoch": 0.7385607446141693, + "grad_norm": 1.403529167175293, + "learning_rate": 2.62570820002932e-05, + "loss": 0.12173585891723633, + "step": 172030 + }, + { + "epoch": 0.7386036767041894, + "grad_norm": 0.32816994190216064, + "learning_rate": 2.6252770280175577e-05, + "loss": 0.3592313289642334, + "step": 172040 + }, + { + "epoch": 0.7386466087942093, + "grad_norm": 0.02111462503671646, + "learning_rate": 2.6248458560057947e-05, + "loss": 0.36513347625732423, + "step": 172050 + }, + { + "epoch": 0.7386895408842293, + "grad_norm": 7.12018346786499, + "learning_rate": 2.6244146839940325e-05, + "loss": 0.2642231464385986, + "step": 172060 + }, + { + "epoch": 0.7387324729742494, + "grad_norm": 0.02392994984984398, + "learning_rate": 2.6239835119822702e-05, + "loss": 0.2387470483779907, + "step": 172070 + }, + { + "epoch": 0.7387754050642693, + "grad_norm": 0.04821072146296501, + "learning_rate": 2.6235523399705083e-05, + "loss": 0.24384143352508544, + "step": 172080 + }, + { + "epoch": 0.7388183371542894, + "grad_norm": 0.27216729521751404, + "learning_rate": 2.623121167958746e-05, + "loss": 0.19975589513778685, + "step": 172090 + }, + { + "epoch": 0.7388612692443094, + "grad_norm": 0.005782154388725758, + "learning_rate": 2.622689995946983e-05, + "loss": 0.21162703037261962, + "step": 172100 + }, + { + "epoch": 0.7389042013343293, + "grad_norm": 0.018965255469083786, + "learning_rate": 2.6222588239352208e-05, + "loss": 0.21590452194213866, + "step": 172110 + }, + { + "epoch": 0.7389471334243494, + "grad_norm": 0.0020858191419392824, + "learning_rate": 2.6218276519234585e-05, + "loss": 0.25055460929870604, + "step": 172120 + }, + { + "epoch": 0.7389900655143694, + "grad_norm": 1.3369940519332886, + "learning_rate": 2.6213964799116963e-05, + "loss": 0.14633073806762695, + "step": 172130 + }, + { + "epoch": 0.7390329976043893, + "grad_norm": 0.527765691280365, + "learning_rate": 2.6209653078999336e-05, + "loss": 0.2503513813018799, + "step": 172140 + }, + { + "epoch": 0.7390759296944094, + "grad_norm": 1.8898754119873047, + "learning_rate": 2.6205341358881714e-05, + "loss": 0.23500373363494872, + "step": 172150 + }, + { + "epoch": 0.7391188617844294, + "grad_norm": 0.10808947682380676, + "learning_rate": 2.620102963876409e-05, + "loss": 0.23570013046264648, + "step": 172160 + }, + { + "epoch": 0.7391617938744494, + "grad_norm": 3.1415834426879883, + "learning_rate": 2.619671791864647e-05, + "loss": 0.13939026594161988, + "step": 172170 + }, + { + "epoch": 0.7392047259644694, + "grad_norm": 4.802440166473389, + "learning_rate": 2.619240619852884e-05, + "loss": 0.150770902633667, + "step": 172180 + }, + { + "epoch": 0.7392476580544894, + "grad_norm": 0.0116340983659029, + "learning_rate": 2.618809447841122e-05, + "loss": 0.4175867557525635, + "step": 172190 + }, + { + "epoch": 0.7392905901445094, + "grad_norm": 1.3842447996139526, + "learning_rate": 2.6183782758293597e-05, + "loss": 0.05002044439315796, + "step": 172200 + }, + { + "epoch": 0.7393335222345294, + "grad_norm": 1.6711241006851196, + "learning_rate": 2.6179471038175974e-05, + "loss": 0.23692212104797364, + "step": 172210 + }, + { + "epoch": 0.7393764543245495, + "grad_norm": 1.484615683555603, + "learning_rate": 2.6175159318058345e-05, + "loss": 0.22095875740051268, + "step": 172220 + }, + { + "epoch": 0.7394193864145694, + "grad_norm": 4.88292121887207, + "learning_rate": 2.6170847597940722e-05, + "loss": 0.21521258354187012, + "step": 172230 + }, + { + "epoch": 0.7394623185045894, + "grad_norm": 0.0060545094311237335, + "learning_rate": 2.61665358778231e-05, + "loss": 0.2710393190383911, + "step": 172240 + }, + { + "epoch": 0.7395052505946095, + "grad_norm": 0.011293796822428703, + "learning_rate": 2.6162224157705477e-05, + "loss": 0.08580965995788574, + "step": 172250 + }, + { + "epoch": 0.7395481826846294, + "grad_norm": 5.334449768066406, + "learning_rate": 2.615791243758785e-05, + "loss": 0.3057300329208374, + "step": 172260 + }, + { + "epoch": 0.7395911147746494, + "grad_norm": 20.33785629272461, + "learning_rate": 2.615360071747023e-05, + "loss": 0.23020739555358888, + "step": 172270 + }, + { + "epoch": 0.7396340468646695, + "grad_norm": 0.0005580539582297206, + "learning_rate": 2.6149288997352606e-05, + "loss": 0.03917438983917236, + "step": 172280 + }, + { + "epoch": 0.7396769789546894, + "grad_norm": 2.250082015991211, + "learning_rate": 2.6144977277234983e-05, + "loss": 0.3027928352355957, + "step": 172290 + }, + { + "epoch": 0.7397199110447095, + "grad_norm": 0.008189544081687927, + "learning_rate": 2.6140665557117354e-05, + "loss": 0.14146639108657838, + "step": 172300 + }, + { + "epoch": 0.7397628431347295, + "grad_norm": 15.595314979553223, + "learning_rate": 2.6136353836999734e-05, + "loss": 0.3388784885406494, + "step": 172310 + }, + { + "epoch": 0.7398057752247494, + "grad_norm": 0.15983085334300995, + "learning_rate": 2.6132042116882112e-05, + "loss": 0.1603078603744507, + "step": 172320 + }, + { + "epoch": 0.7398487073147695, + "grad_norm": 0.9329927563667297, + "learning_rate": 2.612773039676449e-05, + "loss": 0.13090095520019532, + "step": 172330 + }, + { + "epoch": 0.7398916394047895, + "grad_norm": 0.0009728687582537532, + "learning_rate": 2.612341867664686e-05, + "loss": 0.1858871340751648, + "step": 172340 + }, + { + "epoch": 0.7399345714948095, + "grad_norm": 1.6413109302520752, + "learning_rate": 2.6119106956529237e-05, + "loss": 0.29552493095397947, + "step": 172350 + }, + { + "epoch": 0.7399775035848295, + "grad_norm": 1.6135348081588745, + "learning_rate": 2.6114795236411614e-05, + "loss": 0.1183046579360962, + "step": 172360 + }, + { + "epoch": 0.7400204356748495, + "grad_norm": 0.00031716900411993265, + "learning_rate": 2.611048351629399e-05, + "loss": 0.1578521728515625, + "step": 172370 + }, + { + "epoch": 0.7400633677648696, + "grad_norm": 0.02061157487332821, + "learning_rate": 2.6106171796176366e-05, + "loss": 0.001564457081258297, + "step": 172380 + }, + { + "epoch": 0.7401062998548895, + "grad_norm": 2.053553342819214, + "learning_rate": 2.6101860076058743e-05, + "loss": 0.25558698177337646, + "step": 172390 + }, + { + "epoch": 0.7401492319449096, + "grad_norm": 0.22899110615253448, + "learning_rate": 2.609754835594112e-05, + "loss": 0.2048067569732666, + "step": 172400 + }, + { + "epoch": 0.7401921640349296, + "grad_norm": 3.2389774322509766, + "learning_rate": 2.6093236635823498e-05, + "loss": 0.16423354148864747, + "step": 172410 + }, + { + "epoch": 0.7402350961249495, + "grad_norm": 7.045860290527344, + "learning_rate": 2.6088924915705875e-05, + "loss": 0.3069408893585205, + "step": 172420 + }, + { + "epoch": 0.7402780282149696, + "grad_norm": 0.00262209540233016, + "learning_rate": 2.608461319558825e-05, + "loss": 0.09878330826759338, + "step": 172430 + }, + { + "epoch": 0.7403209603049896, + "grad_norm": 0.008975930511951447, + "learning_rate": 2.6080301475470626e-05, + "loss": 0.16282857656478883, + "step": 172440 + }, + { + "epoch": 0.7403638923950095, + "grad_norm": 0.006928627844899893, + "learning_rate": 2.6075989755353004e-05, + "loss": 0.060286080837249754, + "step": 172450 + }, + { + "epoch": 0.7404068244850296, + "grad_norm": 0.003378543769940734, + "learning_rate": 2.607167803523538e-05, + "loss": 0.07941926717758178, + "step": 172460 + }, + { + "epoch": 0.7404497565750496, + "grad_norm": 4.774534225463867, + "learning_rate": 2.606736631511775e-05, + "loss": 0.29351153373718264, + "step": 172470 + }, + { + "epoch": 0.7404926886650696, + "grad_norm": 0.4744148254394531, + "learning_rate": 2.606305459500013e-05, + "loss": 0.09133726954460145, + "step": 172480 + }, + { + "epoch": 0.7405356207550896, + "grad_norm": 6.601369857788086, + "learning_rate": 2.605874287488251e-05, + "loss": 0.190312922000885, + "step": 172490 + }, + { + "epoch": 0.7405785528451096, + "grad_norm": 14.058046340942383, + "learning_rate": 2.6054431154764887e-05, + "loss": 0.3220758199691772, + "step": 172500 + }, + { + "epoch": 0.7406214849351296, + "grad_norm": 0.09356804192066193, + "learning_rate": 2.6050119434647258e-05, + "loss": 0.3615183591842651, + "step": 172510 + }, + { + "epoch": 0.7406644170251496, + "grad_norm": 0.052794020622968674, + "learning_rate": 2.6045807714529635e-05, + "loss": 0.10844634771347046, + "step": 172520 + }, + { + "epoch": 0.7407073491151697, + "grad_norm": 0.1476939171552658, + "learning_rate": 2.6041495994412012e-05, + "loss": 0.19965894222259523, + "step": 172530 + }, + { + "epoch": 0.7407502812051896, + "grad_norm": 2.120439052581787, + "learning_rate": 2.603718427429439e-05, + "loss": 0.19153574705123902, + "step": 172540 + }, + { + "epoch": 0.7407932132952096, + "grad_norm": 0.0483546257019043, + "learning_rate": 2.6032872554176763e-05, + "loss": 0.28668229579925536, + "step": 172550 + }, + { + "epoch": 0.7408361453852297, + "grad_norm": 0.01480165682733059, + "learning_rate": 2.602856083405914e-05, + "loss": 0.2542074918746948, + "step": 172560 + }, + { + "epoch": 0.7408790774752496, + "grad_norm": 0.2001974880695343, + "learning_rate": 2.6024249113941518e-05, + "loss": 0.0713961124420166, + "step": 172570 + }, + { + "epoch": 0.7409220095652697, + "grad_norm": 1.2386269569396973, + "learning_rate": 2.6019937393823896e-05, + "loss": 0.18896110057830812, + "step": 172580 + }, + { + "epoch": 0.7409649416552897, + "grad_norm": 0.0029969087336212397, + "learning_rate": 2.6015625673706266e-05, + "loss": 0.10486527681350707, + "step": 172590 + }, + { + "epoch": 0.7410078737453096, + "grad_norm": 0.015220149420201778, + "learning_rate": 2.6011313953588647e-05, + "loss": 0.3754743576049805, + "step": 172600 + }, + { + "epoch": 0.7410508058353297, + "grad_norm": 4.762171268463135, + "learning_rate": 2.6007002233471024e-05, + "loss": 0.26454839706420896, + "step": 172610 + }, + { + "epoch": 0.7410937379253497, + "grad_norm": 1.5890958309173584, + "learning_rate": 2.60026905133534e-05, + "loss": 0.16944000720977784, + "step": 172620 + }, + { + "epoch": 0.7411366700153696, + "grad_norm": 0.0030514898244291544, + "learning_rate": 2.5998378793235772e-05, + "loss": 0.2463892698287964, + "step": 172630 + }, + { + "epoch": 0.7411796021053897, + "grad_norm": 0.13802503049373627, + "learning_rate": 2.599406707311815e-05, + "loss": 0.32329416275024414, + "step": 172640 + }, + { + "epoch": 0.7412225341954097, + "grad_norm": 0.013265220448374748, + "learning_rate": 2.5989755353000527e-05, + "loss": 0.28892490863800047, + "step": 172650 + }, + { + "epoch": 0.7412654662854297, + "grad_norm": 0.48437264561653137, + "learning_rate": 2.5985443632882904e-05, + "loss": 0.2645559549331665, + "step": 172660 + }, + { + "epoch": 0.7413083983754497, + "grad_norm": 0.04876817762851715, + "learning_rate": 2.5981131912765278e-05, + "loss": 0.24079082012176514, + "step": 172670 + }, + { + "epoch": 0.7413513304654697, + "grad_norm": 0.001597659313119948, + "learning_rate": 2.5976820192647655e-05, + "loss": 0.22210347652435303, + "step": 172680 + }, + { + "epoch": 0.7413942625554897, + "grad_norm": 0.1181657686829567, + "learning_rate": 2.5972508472530033e-05, + "loss": 0.14718619585037232, + "step": 172690 + }, + { + "epoch": 0.7414371946455097, + "grad_norm": 1.3400815725326538, + "learning_rate": 2.596819675241241e-05, + "loss": 0.16132137775421143, + "step": 172700 + }, + { + "epoch": 0.7414801267355298, + "grad_norm": 0.009908415377140045, + "learning_rate": 2.5963885032294784e-05, + "loss": 0.08111634254455566, + "step": 172710 + }, + { + "epoch": 0.7415230588255497, + "grad_norm": 0.3089774250984192, + "learning_rate": 2.595957331217716e-05, + "loss": 0.14888973236083985, + "step": 172720 + }, + { + "epoch": 0.7415659909155697, + "grad_norm": 0.2667434513568878, + "learning_rate": 2.595526159205954e-05, + "loss": 0.06582852005958557, + "step": 172730 + }, + { + "epoch": 0.7416089230055898, + "grad_norm": 5.848819732666016, + "learning_rate": 2.5950949871941916e-05, + "loss": 0.08641493320465088, + "step": 172740 + }, + { + "epoch": 0.7416518550956097, + "grad_norm": 0.0031941940542310476, + "learning_rate": 2.5946638151824287e-05, + "loss": 0.09354096055030822, + "step": 172750 + }, + { + "epoch": 0.7416947871856298, + "grad_norm": 8.418753623962402, + "learning_rate": 2.5942326431706664e-05, + "loss": 0.15573036670684814, + "step": 172760 + }, + { + "epoch": 0.7417377192756498, + "grad_norm": 0.03718193247914314, + "learning_rate": 2.593801471158904e-05, + "loss": 0.017639188468456267, + "step": 172770 + }, + { + "epoch": 0.7417806513656697, + "grad_norm": 0.3647063672542572, + "learning_rate": 2.593370299147142e-05, + "loss": 0.30183801651000974, + "step": 172780 + }, + { + "epoch": 0.7418235834556898, + "grad_norm": 0.01387582067400217, + "learning_rate": 2.59293912713538e-05, + "loss": 0.3928518533706665, + "step": 172790 + }, + { + "epoch": 0.7418665155457098, + "grad_norm": 1.8043218851089478, + "learning_rate": 2.592507955123617e-05, + "loss": 0.13272377252578735, + "step": 172800 + }, + { + "epoch": 0.7419094476357299, + "grad_norm": 0.012643926776945591, + "learning_rate": 2.5920767831118547e-05, + "loss": 0.2803585290908813, + "step": 172810 + }, + { + "epoch": 0.7419523797257498, + "grad_norm": 0.015432733111083508, + "learning_rate": 2.5916456111000925e-05, + "loss": 0.10544899702072144, + "step": 172820 + }, + { + "epoch": 0.7419953118157698, + "grad_norm": 0.8467946648597717, + "learning_rate": 2.5912144390883302e-05, + "loss": 0.37466087341308596, + "step": 172830 + }, + { + "epoch": 0.7420382439057899, + "grad_norm": 0.026423417031764984, + "learning_rate": 2.5907832670765676e-05, + "loss": 0.11121004819869995, + "step": 172840 + }, + { + "epoch": 0.7420811759958098, + "grad_norm": 2.762631416320801, + "learning_rate": 2.5903520950648053e-05, + "loss": 0.3465926170349121, + "step": 172850 + }, + { + "epoch": 0.7421241080858298, + "grad_norm": 0.00443359324708581, + "learning_rate": 2.589920923053043e-05, + "loss": 0.3730604648590088, + "step": 172860 + }, + { + "epoch": 0.7421670401758499, + "grad_norm": 0.0019174201879650354, + "learning_rate": 2.5894897510412808e-05, + "loss": 0.025777462124824523, + "step": 172870 + }, + { + "epoch": 0.7422099722658698, + "grad_norm": 0.011310220696032047, + "learning_rate": 2.589058579029518e-05, + "loss": 0.2822575092315674, + "step": 172880 + }, + { + "epoch": 0.7422529043558899, + "grad_norm": 0.02572781592607498, + "learning_rate": 2.5886274070177556e-05, + "loss": 0.1161012053489685, + "step": 172890 + }, + { + "epoch": 0.7422958364459099, + "grad_norm": 21.611812591552734, + "learning_rate": 2.5881962350059937e-05, + "loss": 0.3096113681793213, + "step": 172900 + }, + { + "epoch": 0.7423387685359298, + "grad_norm": 0.0002522652503103018, + "learning_rate": 2.5877650629942314e-05, + "loss": 0.3812889814376831, + "step": 172910 + }, + { + "epoch": 0.7423817006259499, + "grad_norm": 1.2188701629638672, + "learning_rate": 2.5873338909824685e-05, + "loss": 0.25791704654693604, + "step": 172920 + }, + { + "epoch": 0.7424246327159699, + "grad_norm": 0.6817674040794373, + "learning_rate": 2.5869027189707062e-05, + "loss": 0.24171240329742433, + "step": 172930 + }, + { + "epoch": 0.7424675648059899, + "grad_norm": 0.0008015789790078998, + "learning_rate": 2.586471546958944e-05, + "loss": 0.061844897270202634, + "step": 172940 + }, + { + "epoch": 0.7425104968960099, + "grad_norm": 4.590038776397705, + "learning_rate": 2.5860403749471817e-05, + "loss": 0.3130334377288818, + "step": 172950 + }, + { + "epoch": 0.7425534289860299, + "grad_norm": 0.005684803705662489, + "learning_rate": 2.585609202935419e-05, + "loss": 0.11198042631149292, + "step": 172960 + }, + { + "epoch": 0.7425963610760499, + "grad_norm": 0.6107711791992188, + "learning_rate": 2.5851780309236568e-05, + "loss": 0.31751389503479005, + "step": 172970 + }, + { + "epoch": 0.7426392931660699, + "grad_norm": 0.00203691772185266, + "learning_rate": 2.5847468589118945e-05, + "loss": 0.14549793004989625, + "step": 172980 + }, + { + "epoch": 0.74268222525609, + "grad_norm": 0.01611451804637909, + "learning_rate": 2.5843156869001323e-05, + "loss": 0.19413976669311522, + "step": 172990 + }, + { + "epoch": 0.7427251573461099, + "grad_norm": 0.01514244545251131, + "learning_rate": 2.5838845148883693e-05, + "loss": 0.06386711597442626, + "step": 173000 + }, + { + "epoch": 0.7427251573461099, + "eval_loss": 0.38922441005706787, + "eval_runtime": 27.6049, + "eval_samples_per_second": 3.623, + "eval_steps_per_second": 3.623, + "step": 173000 + }, + { + "epoch": 0.7427680894361299, + "grad_norm": 0.009602725505828857, + "learning_rate": 2.5834533428766074e-05, + "loss": 0.034127888083457944, + "step": 173010 + }, + { + "epoch": 0.74281102152615, + "grad_norm": 1.5638679265975952, + "learning_rate": 2.583022170864845e-05, + "loss": 0.14553526639938355, + "step": 173020 + }, + { + "epoch": 0.7428539536161699, + "grad_norm": 0.42116615176200867, + "learning_rate": 2.582590998853083e-05, + "loss": 0.3827761888504028, + "step": 173030 + }, + { + "epoch": 0.74289688570619, + "grad_norm": 0.009471829980611801, + "learning_rate": 2.58215982684132e-05, + "loss": 0.13673275709152222, + "step": 173040 + }, + { + "epoch": 0.74293981779621, + "grad_norm": 8.371386528015137, + "learning_rate": 2.5817286548295576e-05, + "loss": 0.36310031414031985, + "step": 173050 + }, + { + "epoch": 0.7429827498862299, + "grad_norm": 1.1578912734985352, + "learning_rate": 2.5812974828177954e-05, + "loss": 0.18568694591522217, + "step": 173060 + }, + { + "epoch": 0.74302568197625, + "grad_norm": 0.0743391215801239, + "learning_rate": 2.580866310806033e-05, + "loss": 0.16277066469192505, + "step": 173070 + }, + { + "epoch": 0.74306861406627, + "grad_norm": 0.011781970970332623, + "learning_rate": 2.5804351387942705e-05, + "loss": 0.24296224117279053, + "step": 173080 + }, + { + "epoch": 0.7431115461562899, + "grad_norm": 2.4936790466308594, + "learning_rate": 2.5800039667825082e-05, + "loss": 0.22644639015197754, + "step": 173090 + }, + { + "epoch": 0.74315447824631, + "grad_norm": 0.39981764554977417, + "learning_rate": 2.579572794770746e-05, + "loss": 0.28062996864318845, + "step": 173100 + }, + { + "epoch": 0.74319741033633, + "grad_norm": 5.6728644371032715, + "learning_rate": 2.5791416227589837e-05, + "loss": 0.3589806079864502, + "step": 173110 + }, + { + "epoch": 0.74324034242635, + "grad_norm": 0.006206681486219168, + "learning_rate": 2.578710450747221e-05, + "loss": 0.11007034778594971, + "step": 173120 + }, + { + "epoch": 0.74328327451637, + "grad_norm": 0.005777155049145222, + "learning_rate": 2.578279278735459e-05, + "loss": 0.04013732373714447, + "step": 173130 + }, + { + "epoch": 0.74332620660639, + "grad_norm": 2.492429256439209, + "learning_rate": 2.5778481067236966e-05, + "loss": 0.2559037208557129, + "step": 173140 + }, + { + "epoch": 0.74336913869641, + "grad_norm": 2.0569210052490234, + "learning_rate": 2.5774169347119343e-05, + "loss": 0.1842200756072998, + "step": 173150 + }, + { + "epoch": 0.74341207078643, + "grad_norm": 0.05245661735534668, + "learning_rate": 2.576985762700172e-05, + "loss": 0.11454921960830688, + "step": 173160 + }, + { + "epoch": 0.74345500287645, + "grad_norm": 0.0004969439469277859, + "learning_rate": 2.576554590688409e-05, + "loss": 0.36214404106140136, + "step": 173170 + }, + { + "epoch": 0.74349793496647, + "grad_norm": 0.011017000302672386, + "learning_rate": 2.576123418676647e-05, + "loss": 0.34534130096435545, + "step": 173180 + }, + { + "epoch": 0.74354086705649, + "grad_norm": 0.01024332270026207, + "learning_rate": 2.5756922466648846e-05, + "loss": 0.14188783168792723, + "step": 173190 + }, + { + "epoch": 0.7435837991465101, + "grad_norm": 0.053222961723804474, + "learning_rate": 2.5752610746531226e-05, + "loss": 0.15369497537612914, + "step": 173200 + }, + { + "epoch": 0.74362673123653, + "grad_norm": 1.4653114080429077, + "learning_rate": 2.5748299026413597e-05, + "loss": 0.2768231391906738, + "step": 173210 + }, + { + "epoch": 0.74366966332655, + "grad_norm": 1.0309170484542847, + "learning_rate": 2.5743987306295974e-05, + "loss": 0.3903130292892456, + "step": 173220 + }, + { + "epoch": 0.7437125954165701, + "grad_norm": 0.03295344114303589, + "learning_rate": 2.573967558617835e-05, + "loss": 0.09203203916549682, + "step": 173230 + }, + { + "epoch": 0.7437555275065901, + "grad_norm": 1.7051453590393066, + "learning_rate": 2.573536386606073e-05, + "loss": 0.04676951467990875, + "step": 173240 + }, + { + "epoch": 0.7437984595966101, + "grad_norm": 0.07142875343561172, + "learning_rate": 2.5731052145943103e-05, + "loss": 0.4174853801727295, + "step": 173250 + }, + { + "epoch": 0.7438413916866301, + "grad_norm": 0.9815771579742432, + "learning_rate": 2.572674042582548e-05, + "loss": 0.12617738246917726, + "step": 173260 + }, + { + "epoch": 0.7438843237766501, + "grad_norm": 0.4857109785079956, + "learning_rate": 2.5722428705707858e-05, + "loss": 0.11668617725372314, + "step": 173270 + }, + { + "epoch": 0.7439272558666701, + "grad_norm": 0.07516008615493774, + "learning_rate": 2.5718116985590235e-05, + "loss": 0.04436193406581879, + "step": 173280 + }, + { + "epoch": 0.7439701879566901, + "grad_norm": 0.07428845763206482, + "learning_rate": 2.5713805265472606e-05, + "loss": 0.03692366778850555, + "step": 173290 + }, + { + "epoch": 0.7440131200467102, + "grad_norm": 0.0036778750363737345, + "learning_rate": 2.5709493545354983e-05, + "loss": 0.07944020628929138, + "step": 173300 + }, + { + "epoch": 0.7440560521367301, + "grad_norm": 10.471997261047363, + "learning_rate": 2.5705181825237364e-05, + "loss": 0.2046130657196045, + "step": 173310 + }, + { + "epoch": 0.7440989842267501, + "grad_norm": 0.05514159053564072, + "learning_rate": 2.570087010511974e-05, + "loss": 0.08784713745117187, + "step": 173320 + }, + { + "epoch": 0.7441419163167702, + "grad_norm": 0.3847615420818329, + "learning_rate": 2.569655838500211e-05, + "loss": 0.163929283618927, + "step": 173330 + }, + { + "epoch": 0.7441848484067901, + "grad_norm": 0.041439611464738846, + "learning_rate": 2.569224666488449e-05, + "loss": 0.08686239123344422, + "step": 173340 + }, + { + "epoch": 0.7442277804968102, + "grad_norm": 1.8955515623092651, + "learning_rate": 2.5687934944766866e-05, + "loss": 0.14141509532928467, + "step": 173350 + }, + { + "epoch": 0.7442707125868302, + "grad_norm": 2.777130603790283, + "learning_rate": 2.5683623224649244e-05, + "loss": 0.11059495210647582, + "step": 173360 + }, + { + "epoch": 0.7443136446768501, + "grad_norm": 0.016122760251164436, + "learning_rate": 2.5679311504531618e-05, + "loss": 0.17072702646255494, + "step": 173370 + }, + { + "epoch": 0.7443565767668702, + "grad_norm": 1.575832724571228, + "learning_rate": 2.5674999784413995e-05, + "loss": 0.31911892890930177, + "step": 173380 + }, + { + "epoch": 0.7443995088568902, + "grad_norm": 1.009772777557373, + "learning_rate": 2.5670688064296372e-05, + "loss": 0.16791296005249023, + "step": 173390 + }, + { + "epoch": 0.7444424409469101, + "grad_norm": 2.09256911277771, + "learning_rate": 2.566637634417875e-05, + "loss": 0.1358073353767395, + "step": 173400 + }, + { + "epoch": 0.7444853730369302, + "grad_norm": 0.003986488562077284, + "learning_rate": 2.566206462406112e-05, + "loss": 0.30874583721160886, + "step": 173410 + }, + { + "epoch": 0.7445283051269502, + "grad_norm": 0.0005260541802272201, + "learning_rate": 2.56577529039435e-05, + "loss": 0.12634888887405396, + "step": 173420 + }, + { + "epoch": 0.7445712372169702, + "grad_norm": 0.09744524955749512, + "learning_rate": 2.5653441183825878e-05, + "loss": 0.1951184868812561, + "step": 173430 + }, + { + "epoch": 0.7446141693069902, + "grad_norm": 2.7815160751342773, + "learning_rate": 2.5649129463708256e-05, + "loss": 0.13784483671188355, + "step": 173440 + }, + { + "epoch": 0.7446571013970102, + "grad_norm": 1.0305331945419312, + "learning_rate": 2.5644817743590626e-05, + "loss": 0.30634360313415526, + "step": 173450 + }, + { + "epoch": 0.7447000334870302, + "grad_norm": 2.2558400630950928, + "learning_rate": 2.5640506023473003e-05, + "loss": 0.06751551032066345, + "step": 173460 + }, + { + "epoch": 0.7447429655770502, + "grad_norm": 1.7618029117584229, + "learning_rate": 2.563619430335538e-05, + "loss": 0.16229534149169922, + "step": 173470 + }, + { + "epoch": 0.7447858976670703, + "grad_norm": 0.021817678585648537, + "learning_rate": 2.5631882583237758e-05, + "loss": 0.22537617683410643, + "step": 173480 + }, + { + "epoch": 0.7448288297570902, + "grad_norm": 0.02080300636589527, + "learning_rate": 2.5627570863120132e-05, + "loss": 0.2169330358505249, + "step": 173490 + }, + { + "epoch": 0.7448717618471102, + "grad_norm": 0.438027560710907, + "learning_rate": 2.562325914300251e-05, + "loss": 0.20829992294311522, + "step": 173500 + }, + { + "epoch": 0.7449146939371303, + "grad_norm": 0.0014316404704004526, + "learning_rate": 2.5618947422884887e-05, + "loss": 0.045699626207351685, + "step": 173510 + }, + { + "epoch": 0.7449576260271502, + "grad_norm": 0.029168443754315376, + "learning_rate": 2.5614635702767264e-05, + "loss": 0.22984709739685058, + "step": 173520 + }, + { + "epoch": 0.7450005581171703, + "grad_norm": 0.024142567068338394, + "learning_rate": 2.561032398264964e-05, + "loss": 0.23821489810943602, + "step": 173530 + }, + { + "epoch": 0.7450434902071903, + "grad_norm": 0.07180153578519821, + "learning_rate": 2.5606012262532015e-05, + "loss": 0.24222948551177978, + "step": 173540 + }, + { + "epoch": 0.7450864222972102, + "grad_norm": 0.04247334226965904, + "learning_rate": 2.5601700542414393e-05, + "loss": 0.08847699761390686, + "step": 173550 + }, + { + "epoch": 0.7451293543872303, + "grad_norm": 7.379086971282959, + "learning_rate": 2.559738882229677e-05, + "loss": 0.07075945138931275, + "step": 173560 + }, + { + "epoch": 0.7451722864772503, + "grad_norm": 1.955690622329712, + "learning_rate": 2.5593077102179147e-05, + "loss": 0.2634904384613037, + "step": 173570 + }, + { + "epoch": 0.7452152185672702, + "grad_norm": 0.0009114326676353812, + "learning_rate": 2.5588765382061518e-05, + "loss": 0.2877689599990845, + "step": 173580 + }, + { + "epoch": 0.7452581506572903, + "grad_norm": 0.00039587041828781366, + "learning_rate": 2.5584453661943895e-05, + "loss": 0.20023708343505858, + "step": 173590 + }, + { + "epoch": 0.7453010827473103, + "grad_norm": 0.005947303492575884, + "learning_rate": 2.5580141941826276e-05, + "loss": 0.20177597999572755, + "step": 173600 + }, + { + "epoch": 0.7453440148373303, + "grad_norm": 0.9493522047996521, + "learning_rate": 2.5575830221708653e-05, + "loss": 0.041297358274459836, + "step": 173610 + }, + { + "epoch": 0.7453869469273503, + "grad_norm": 0.00046072210534475744, + "learning_rate": 2.5571518501591024e-05, + "loss": 0.1452507734298706, + "step": 173620 + }, + { + "epoch": 0.7454298790173703, + "grad_norm": 0.4799286127090454, + "learning_rate": 2.55672067814734e-05, + "loss": 0.35864949226379395, + "step": 173630 + }, + { + "epoch": 0.7454728111073903, + "grad_norm": 2.4819602966308594, + "learning_rate": 2.556289506135578e-05, + "loss": 0.3039137363433838, + "step": 173640 + }, + { + "epoch": 0.7455157431974103, + "grad_norm": 2.6950457096099854, + "learning_rate": 2.5558583341238156e-05, + "loss": 0.04391449689865112, + "step": 173650 + }, + { + "epoch": 0.7455586752874304, + "grad_norm": 0.13231931626796722, + "learning_rate": 2.555427162112053e-05, + "loss": 0.3039705276489258, + "step": 173660 + }, + { + "epoch": 0.7456016073774504, + "grad_norm": 0.13154283165931702, + "learning_rate": 2.5549959901002907e-05, + "loss": 0.05160494446754456, + "step": 173670 + }, + { + "epoch": 0.7456445394674703, + "grad_norm": 0.18037918210029602, + "learning_rate": 2.5545648180885285e-05, + "loss": 0.12189091444015503, + "step": 173680 + }, + { + "epoch": 0.7456874715574904, + "grad_norm": 1.9181801080703735, + "learning_rate": 2.5541336460767662e-05, + "loss": 0.1407124638557434, + "step": 173690 + }, + { + "epoch": 0.7457304036475104, + "grad_norm": 0.001891557709313929, + "learning_rate": 2.5537024740650033e-05, + "loss": 0.1649843454360962, + "step": 173700 + }, + { + "epoch": 0.7457733357375304, + "grad_norm": 0.009757429361343384, + "learning_rate": 2.5532713020532413e-05, + "loss": 0.3177161693572998, + "step": 173710 + }, + { + "epoch": 0.7458162678275504, + "grad_norm": 1.442531943321228, + "learning_rate": 2.552840130041479e-05, + "loss": 0.36181912422180174, + "step": 173720 + }, + { + "epoch": 0.7458591999175704, + "grad_norm": 0.07108304649591446, + "learning_rate": 2.5524089580297168e-05, + "loss": 0.13565880060195923, + "step": 173730 + }, + { + "epoch": 0.7459021320075904, + "grad_norm": 7.040263652801514, + "learning_rate": 2.551977786017954e-05, + "loss": 0.20055036544799804, + "step": 173740 + }, + { + "epoch": 0.7459450640976104, + "grad_norm": 4.002945423126221, + "learning_rate": 2.5515466140061916e-05, + "loss": 0.4659637451171875, + "step": 173750 + }, + { + "epoch": 0.7459879961876305, + "grad_norm": 0.011049356311559677, + "learning_rate": 2.5511154419944293e-05, + "loss": 0.2700127124786377, + "step": 173760 + }, + { + "epoch": 0.7460309282776504, + "grad_norm": 0.00305456412024796, + "learning_rate": 2.550684269982667e-05, + "loss": 0.21567902565002442, + "step": 173770 + }, + { + "epoch": 0.7460738603676704, + "grad_norm": 0.7426313161849976, + "learning_rate": 2.5502530979709045e-05, + "loss": 0.20117294788360596, + "step": 173780 + }, + { + "epoch": 0.7461167924576905, + "grad_norm": 0.27082696557044983, + "learning_rate": 2.5498219259591422e-05, + "loss": 0.10781660079956054, + "step": 173790 + }, + { + "epoch": 0.7461597245477104, + "grad_norm": 0.006183996796607971, + "learning_rate": 2.54939075394738e-05, + "loss": 0.10997037887573242, + "step": 173800 + }, + { + "epoch": 0.7462026566377304, + "grad_norm": 0.0367814339697361, + "learning_rate": 2.5489595819356177e-05, + "loss": 0.3016884088516235, + "step": 173810 + }, + { + "epoch": 0.7462455887277505, + "grad_norm": 4.757618427276611, + "learning_rate": 2.548528409923855e-05, + "loss": 0.4883930206298828, + "step": 173820 + }, + { + "epoch": 0.7462885208177704, + "grad_norm": 0.00017911156464833766, + "learning_rate": 2.5480972379120928e-05, + "loss": 0.28103320598602294, + "step": 173830 + }, + { + "epoch": 0.7463314529077905, + "grad_norm": 0.10610644519329071, + "learning_rate": 2.5476660659003305e-05, + "loss": 0.11108193397521973, + "step": 173840 + }, + { + "epoch": 0.7463743849978105, + "grad_norm": 0.007589709013700485, + "learning_rate": 2.5472348938885683e-05, + "loss": 0.47249712944030764, + "step": 173850 + }, + { + "epoch": 0.7464173170878304, + "grad_norm": 2.555662155151367, + "learning_rate": 2.5468037218768053e-05, + "loss": 0.13019719123840331, + "step": 173860 + }, + { + "epoch": 0.7464602491778505, + "grad_norm": 3.0848388671875, + "learning_rate": 2.546372549865043e-05, + "loss": 0.10937469005584717, + "step": 173870 + }, + { + "epoch": 0.7465031812678705, + "grad_norm": 0.0011001820676028728, + "learning_rate": 2.5459413778532808e-05, + "loss": 0.06224575638771057, + "step": 173880 + }, + { + "epoch": 0.7465461133578905, + "grad_norm": 1.4132946729660034, + "learning_rate": 2.5455102058415185e-05, + "loss": 0.3091392993927002, + "step": 173890 + }, + { + "epoch": 0.7465890454479105, + "grad_norm": 3.6197102069854736, + "learning_rate": 2.5450790338297566e-05, + "loss": 0.28432717323303225, + "step": 173900 + }, + { + "epoch": 0.7466319775379305, + "grad_norm": 0.5134070515632629, + "learning_rate": 2.5446478618179936e-05, + "loss": 0.21487205028533934, + "step": 173910 + }, + { + "epoch": 0.7466749096279505, + "grad_norm": 1.3445314168930054, + "learning_rate": 2.5442166898062314e-05, + "loss": 0.08604246973991395, + "step": 173920 + }, + { + "epoch": 0.7467178417179705, + "grad_norm": 0.9623371362686157, + "learning_rate": 2.543785517794469e-05, + "loss": 0.1378989338874817, + "step": 173930 + }, + { + "epoch": 0.7467607738079906, + "grad_norm": 0.07971750944852829, + "learning_rate": 2.543354345782707e-05, + "loss": 0.136910879611969, + "step": 173940 + }, + { + "epoch": 0.7468037058980105, + "grad_norm": 19.788955688476562, + "learning_rate": 2.5429231737709442e-05, + "loss": 0.07829828262329101, + "step": 173950 + }, + { + "epoch": 0.7468466379880305, + "grad_norm": 0.132577046751976, + "learning_rate": 2.542492001759182e-05, + "loss": 0.16890411376953124, + "step": 173960 + }, + { + "epoch": 0.7468895700780506, + "grad_norm": 6.434149265289307, + "learning_rate": 2.5420608297474197e-05, + "loss": 0.14480010271072388, + "step": 173970 + }, + { + "epoch": 0.7469325021680705, + "grad_norm": 0.12347644567489624, + "learning_rate": 2.5416296577356574e-05, + "loss": 0.22138521671295167, + "step": 173980 + }, + { + "epoch": 0.7469754342580905, + "grad_norm": 0.12491331994533539, + "learning_rate": 2.5411984857238945e-05, + "loss": 0.31520378589630127, + "step": 173990 + }, + { + "epoch": 0.7470183663481106, + "grad_norm": 5.576565742492676, + "learning_rate": 2.5407673137121322e-05, + "loss": 0.18219449520111083, + "step": 174000 + }, + { + "epoch": 0.7470183663481106, + "eval_loss": 0.38934630155563354, + "eval_runtime": 27.4236, + "eval_samples_per_second": 3.646, + "eval_steps_per_second": 3.646, + "step": 174000 + }, + { + "epoch": 0.7470612984381305, + "grad_norm": 1.1183409690856934, + "learning_rate": 2.5403361417003703e-05, + "loss": 0.33946449756622316, + "step": 174010 + }, + { + "epoch": 0.7471042305281506, + "grad_norm": 0.02250983752310276, + "learning_rate": 2.539904969688608e-05, + "loss": 0.08952296376228333, + "step": 174020 + }, + { + "epoch": 0.7471471626181706, + "grad_norm": 0.13014785945415497, + "learning_rate": 2.539473797676845e-05, + "loss": 0.18597830533981324, + "step": 174030 + }, + { + "epoch": 0.7471900947081905, + "grad_norm": 1.9278321266174316, + "learning_rate": 2.539042625665083e-05, + "loss": 0.18685194253921508, + "step": 174040 + }, + { + "epoch": 0.7472330267982106, + "grad_norm": 2.881211519241333, + "learning_rate": 2.5386114536533206e-05, + "loss": 0.18293185234069825, + "step": 174050 + }, + { + "epoch": 0.7472759588882306, + "grad_norm": 0.12321528792381287, + "learning_rate": 2.5381802816415583e-05, + "loss": 0.19650537967681886, + "step": 174060 + }, + { + "epoch": 0.7473188909782505, + "grad_norm": 0.0030896917451173067, + "learning_rate": 2.5377491096297957e-05, + "loss": 0.33722474575042727, + "step": 174070 + }, + { + "epoch": 0.7473618230682706, + "grad_norm": 2.7675693035125732, + "learning_rate": 2.5373179376180334e-05, + "loss": 0.2696052551269531, + "step": 174080 + }, + { + "epoch": 0.7474047551582906, + "grad_norm": 0.3029930591583252, + "learning_rate": 2.536886765606271e-05, + "loss": 0.16292864084243774, + "step": 174090 + }, + { + "epoch": 0.7474476872483107, + "grad_norm": 0.000494702544528991, + "learning_rate": 2.536455593594509e-05, + "loss": 0.04459400177001953, + "step": 174100 + }, + { + "epoch": 0.7474906193383306, + "grad_norm": 0.043872445821762085, + "learning_rate": 2.536024421582746e-05, + "loss": 0.009959495067596436, + "step": 174110 + }, + { + "epoch": 0.7475335514283507, + "grad_norm": 0.008128165267407894, + "learning_rate": 2.535593249570984e-05, + "loss": 0.07839224338531495, + "step": 174120 + }, + { + "epoch": 0.7475764835183707, + "grad_norm": 1.3136489391326904, + "learning_rate": 2.5351620775592218e-05, + "loss": 0.14459240436553955, + "step": 174130 + }, + { + "epoch": 0.7476194156083906, + "grad_norm": 3.1148681640625, + "learning_rate": 2.5347309055474595e-05, + "loss": 0.28304076194763184, + "step": 174140 + }, + { + "epoch": 0.7476623476984107, + "grad_norm": 0.47905516624450684, + "learning_rate": 2.5342997335356966e-05, + "loss": 0.1126200795173645, + "step": 174150 + }, + { + "epoch": 0.7477052797884307, + "grad_norm": 0.06672538816928864, + "learning_rate": 2.5338685615239343e-05, + "loss": 0.16702120304107665, + "step": 174160 + }, + { + "epoch": 0.7477482118784506, + "grad_norm": 0.004307939670979977, + "learning_rate": 2.533437389512172e-05, + "loss": 0.2730269908905029, + "step": 174170 + }, + { + "epoch": 0.7477911439684707, + "grad_norm": 3.2004806995391846, + "learning_rate": 2.5330062175004098e-05, + "loss": 0.06000160574913025, + "step": 174180 + }, + { + "epoch": 0.7478340760584907, + "grad_norm": 0.006153371185064316, + "learning_rate": 2.532575045488647e-05, + "loss": 0.04418157935142517, + "step": 174190 + }, + { + "epoch": 0.7478770081485107, + "grad_norm": 0.005760138388723135, + "learning_rate": 2.532143873476885e-05, + "loss": 0.25185327529907225, + "step": 174200 + }, + { + "epoch": 0.7479199402385307, + "grad_norm": 0.004189047031104565, + "learning_rate": 2.5317127014651226e-05, + "loss": 0.3935739755630493, + "step": 174210 + }, + { + "epoch": 0.7479628723285507, + "grad_norm": 1.2200220823287964, + "learning_rate": 2.5312815294533604e-05, + "loss": 0.17278871536254883, + "step": 174220 + }, + { + "epoch": 0.7480058044185707, + "grad_norm": 0.32186490297317505, + "learning_rate": 2.5308503574415978e-05, + "loss": 0.26594464778900145, + "step": 174230 + }, + { + "epoch": 0.7480487365085907, + "grad_norm": 2.935149908065796, + "learning_rate": 2.5304191854298355e-05, + "loss": 0.10904730558395385, + "step": 174240 + }, + { + "epoch": 0.7480916685986108, + "grad_norm": 3.964522361755371, + "learning_rate": 2.5299880134180732e-05, + "loss": 0.3033463001251221, + "step": 174250 + }, + { + "epoch": 0.7481346006886307, + "grad_norm": 0.11229217797517776, + "learning_rate": 2.529556841406311e-05, + "loss": 0.2783724308013916, + "step": 174260 + }, + { + "epoch": 0.7481775327786507, + "grad_norm": 1.392785668373108, + "learning_rate": 2.5291256693945487e-05, + "loss": 0.16704895496368408, + "step": 174270 + }, + { + "epoch": 0.7482204648686708, + "grad_norm": 1.1950591802597046, + "learning_rate": 2.5286944973827857e-05, + "loss": 0.37365069389343264, + "step": 174280 + }, + { + "epoch": 0.7482633969586907, + "grad_norm": 1.6007744073867798, + "learning_rate": 2.5282633253710235e-05, + "loss": 0.25082619190216066, + "step": 174290 + }, + { + "epoch": 0.7483063290487107, + "grad_norm": 0.006831104401499033, + "learning_rate": 2.5278321533592612e-05, + "loss": 0.27920453548431395, + "step": 174300 + }, + { + "epoch": 0.7483492611387308, + "grad_norm": 0.01827859692275524, + "learning_rate": 2.5274009813474993e-05, + "loss": 0.11764630079269409, + "step": 174310 + }, + { + "epoch": 0.7483921932287507, + "grad_norm": 1.2253323793411255, + "learning_rate": 2.5269698093357363e-05, + "loss": 0.17802076339721679, + "step": 174320 + }, + { + "epoch": 0.7484351253187708, + "grad_norm": 0.004427074920386076, + "learning_rate": 2.526538637323974e-05, + "loss": 0.1897179365158081, + "step": 174330 + }, + { + "epoch": 0.7484780574087908, + "grad_norm": 0.019519807770848274, + "learning_rate": 2.5261074653122118e-05, + "loss": 0.2350210428237915, + "step": 174340 + }, + { + "epoch": 0.7485209894988107, + "grad_norm": 46.768394470214844, + "learning_rate": 2.5256762933004495e-05, + "loss": 0.24312927722930908, + "step": 174350 + }, + { + "epoch": 0.7485639215888308, + "grad_norm": 0.025334032252430916, + "learning_rate": 2.525245121288687e-05, + "loss": 0.20443146228790282, + "step": 174360 + }, + { + "epoch": 0.7486068536788508, + "grad_norm": 0.11763063818216324, + "learning_rate": 2.5248139492769247e-05, + "loss": 0.19978156089782714, + "step": 174370 + }, + { + "epoch": 0.7486497857688708, + "grad_norm": 6.262375354766846, + "learning_rate": 2.5243827772651624e-05, + "loss": 0.039094260334968566, + "step": 174380 + }, + { + "epoch": 0.7486927178588908, + "grad_norm": 0.0023005977272987366, + "learning_rate": 2.5239516052534e-05, + "loss": 0.07948078513145447, + "step": 174390 + }, + { + "epoch": 0.7487356499489108, + "grad_norm": 0.006748616229742765, + "learning_rate": 2.5235204332416372e-05, + "loss": 0.029941585659980775, + "step": 174400 + }, + { + "epoch": 0.7487785820389308, + "grad_norm": 0.002810591831803322, + "learning_rate": 2.523089261229875e-05, + "loss": 0.4636706352233887, + "step": 174410 + }, + { + "epoch": 0.7488215141289508, + "grad_norm": 1.3158349990844727, + "learning_rate": 2.522658089218113e-05, + "loss": 0.558734655380249, + "step": 174420 + }, + { + "epoch": 0.7488644462189709, + "grad_norm": 0.004559780471026897, + "learning_rate": 2.5222269172063507e-05, + "loss": 0.14058899879455566, + "step": 174430 + }, + { + "epoch": 0.7489073783089908, + "grad_norm": 1.9623855352401733, + "learning_rate": 2.5217957451945878e-05, + "loss": 0.16024988889694214, + "step": 174440 + }, + { + "epoch": 0.7489503103990108, + "grad_norm": 0.013161673210561275, + "learning_rate": 2.5213645731828255e-05, + "loss": 0.2463479995727539, + "step": 174450 + }, + { + "epoch": 0.7489932424890309, + "grad_norm": 0.013577003963291645, + "learning_rate": 2.5209334011710633e-05, + "loss": 0.2299262523651123, + "step": 174460 + }, + { + "epoch": 0.7490361745790508, + "grad_norm": 0.0064038001000881195, + "learning_rate": 2.520502229159301e-05, + "loss": 0.16454423666000367, + "step": 174470 + }, + { + "epoch": 0.7490791066690708, + "grad_norm": 0.006632882170379162, + "learning_rate": 2.5200710571475384e-05, + "loss": 0.15173077583312988, + "step": 174480 + }, + { + "epoch": 0.7491220387590909, + "grad_norm": 1.0597169399261475, + "learning_rate": 2.519639885135776e-05, + "loss": 0.12685471773147583, + "step": 174490 + }, + { + "epoch": 0.7491649708491108, + "grad_norm": 0.0004002380883321166, + "learning_rate": 2.519208713124014e-05, + "loss": 0.16025757789611816, + "step": 174500 + }, + { + "epoch": 0.7492079029391309, + "grad_norm": 3.280585289001465, + "learning_rate": 2.5187775411122516e-05, + "loss": 0.09133055210113525, + "step": 174510 + }, + { + "epoch": 0.7492508350291509, + "grad_norm": 0.008724031038582325, + "learning_rate": 2.5183463691004887e-05, + "loss": 0.21232659816741944, + "step": 174520 + }, + { + "epoch": 0.749293767119171, + "grad_norm": 0.005878915078938007, + "learning_rate": 2.5179151970887267e-05, + "loss": 0.1962152123451233, + "step": 174530 + }, + { + "epoch": 0.7493366992091909, + "grad_norm": 0.04787326976656914, + "learning_rate": 2.5174840250769645e-05, + "loss": 0.19027726650238036, + "step": 174540 + }, + { + "epoch": 0.7493796312992109, + "grad_norm": 0.006681304890662432, + "learning_rate": 2.5170528530652022e-05, + "loss": 0.05747186541557312, + "step": 174550 + }, + { + "epoch": 0.749422563389231, + "grad_norm": 0.04217128828167915, + "learning_rate": 2.5166216810534393e-05, + "loss": 0.18018451929092408, + "step": 174560 + }, + { + "epoch": 0.7494654954792509, + "grad_norm": 7.066211700439453, + "learning_rate": 2.516190509041677e-05, + "loss": 0.18665244579315185, + "step": 174570 + }, + { + "epoch": 0.7495084275692709, + "grad_norm": 3.013502597808838, + "learning_rate": 2.5157593370299147e-05, + "loss": 0.23242578506469727, + "step": 174580 + }, + { + "epoch": 0.749551359659291, + "grad_norm": 1.7900302410125732, + "learning_rate": 2.5153281650181525e-05, + "loss": 0.18565467596054078, + "step": 174590 + }, + { + "epoch": 0.7495942917493109, + "grad_norm": 1.183828592300415, + "learning_rate": 2.5148969930063905e-05, + "loss": 0.25924339294433596, + "step": 174600 + }, + { + "epoch": 0.749637223839331, + "grad_norm": 3.1744091510772705, + "learning_rate": 2.5144658209946276e-05, + "loss": 0.19662506580352784, + "step": 174610 + }, + { + "epoch": 0.749680155929351, + "grad_norm": 1.2753418684005737, + "learning_rate": 2.5140346489828653e-05, + "loss": 0.3594705820083618, + "step": 174620 + }, + { + "epoch": 0.7497230880193709, + "grad_norm": 0.009313058108091354, + "learning_rate": 2.513603476971103e-05, + "loss": 0.10875067710876465, + "step": 174630 + }, + { + "epoch": 0.749766020109391, + "grad_norm": 1.209693193435669, + "learning_rate": 2.5131723049593408e-05, + "loss": 0.1888060212135315, + "step": 174640 + }, + { + "epoch": 0.749808952199411, + "grad_norm": 0.008648032322525978, + "learning_rate": 2.5127411329475782e-05, + "loss": 0.17343711853027344, + "step": 174650 + }, + { + "epoch": 0.749851884289431, + "grad_norm": 0.12723885476589203, + "learning_rate": 2.512309960935816e-05, + "loss": 0.1848118543624878, + "step": 174660 + }, + { + "epoch": 0.749894816379451, + "grad_norm": 0.01464887149631977, + "learning_rate": 2.5118787889240537e-05, + "loss": 0.050753462314605716, + "step": 174670 + }, + { + "epoch": 0.749937748469471, + "grad_norm": 0.044885214418172836, + "learning_rate": 2.5114476169122914e-05, + "loss": 0.1882512927055359, + "step": 174680 + }, + { + "epoch": 0.749980680559491, + "grad_norm": 1.5992013216018677, + "learning_rate": 2.5110164449005284e-05, + "loss": 0.14516894817352294, + "step": 174690 + }, + { + "epoch": 0.750023612649511, + "grad_norm": 1.8369817733764648, + "learning_rate": 2.5105852728887662e-05, + "loss": 0.38750979900360105, + "step": 174700 + }, + { + "epoch": 0.750066544739531, + "grad_norm": 0.6581123471260071, + "learning_rate": 2.5101541008770043e-05, + "loss": 0.28684771060943604, + "step": 174710 + }, + { + "epoch": 0.750109476829551, + "grad_norm": 40.683799743652344, + "learning_rate": 2.509722928865242e-05, + "loss": 0.1460190534591675, + "step": 174720 + }, + { + "epoch": 0.750152408919571, + "grad_norm": 0.03157550096511841, + "learning_rate": 2.509291756853479e-05, + "loss": 0.19834092855453492, + "step": 174730 + }, + { + "epoch": 0.7501953410095911, + "grad_norm": 0.0021849041804671288, + "learning_rate": 2.5088605848417168e-05, + "loss": 0.1647346019744873, + "step": 174740 + }, + { + "epoch": 0.750238273099611, + "grad_norm": 0.18896019458770752, + "learning_rate": 2.5084294128299545e-05, + "loss": 0.3278000593185425, + "step": 174750 + }, + { + "epoch": 0.750281205189631, + "grad_norm": 0.932102620601654, + "learning_rate": 2.5079982408181922e-05, + "loss": 0.0883977472782135, + "step": 174760 + }, + { + "epoch": 0.7503241372796511, + "grad_norm": 2.2175753116607666, + "learning_rate": 2.5075670688064296e-05, + "loss": 0.32731032371520996, + "step": 174770 + }, + { + "epoch": 0.750367069369671, + "grad_norm": 0.004549882840365171, + "learning_rate": 2.5071358967946674e-05, + "loss": 0.17782050371170044, + "step": 174780 + }, + { + "epoch": 0.7504100014596911, + "grad_norm": 2.5548672676086426, + "learning_rate": 2.506704724782905e-05, + "loss": 0.09278339147567749, + "step": 174790 + }, + { + "epoch": 0.7504529335497111, + "grad_norm": 0.035067737102508545, + "learning_rate": 2.506273552771143e-05, + "loss": 0.01957797259092331, + "step": 174800 + }, + { + "epoch": 0.750495865639731, + "grad_norm": 0.33087292313575745, + "learning_rate": 2.50584238075938e-05, + "loss": 0.4236994743347168, + "step": 174810 + }, + { + "epoch": 0.7505387977297511, + "grad_norm": 0.002858347026631236, + "learning_rate": 2.505411208747618e-05, + "loss": 0.24580299854278564, + "step": 174820 + }, + { + "epoch": 0.7505817298197711, + "grad_norm": 3.3408989906311035, + "learning_rate": 2.5049800367358557e-05, + "loss": 0.10669113397598266, + "step": 174830 + }, + { + "epoch": 0.750624661909791, + "grad_norm": 0.4025706350803375, + "learning_rate": 2.5045488647240934e-05, + "loss": 0.19302643537521363, + "step": 174840 + }, + { + "epoch": 0.7506675939998111, + "grad_norm": 0.33814287185668945, + "learning_rate": 2.5041176927123305e-05, + "loss": 0.1657320261001587, + "step": 174850 + }, + { + "epoch": 0.7507105260898311, + "grad_norm": 0.5623843669891357, + "learning_rate": 2.5036865207005682e-05, + "loss": 0.030144301056861878, + "step": 174860 + }, + { + "epoch": 0.7507534581798511, + "grad_norm": 0.4477728605270386, + "learning_rate": 2.503255348688806e-05, + "loss": 0.09932756423950195, + "step": 174870 + }, + { + "epoch": 0.7507963902698711, + "grad_norm": 0.5520215034484863, + "learning_rate": 2.5028241766770437e-05, + "loss": 0.21815659999847412, + "step": 174880 + }, + { + "epoch": 0.7508393223598911, + "grad_norm": 0.015766866505146027, + "learning_rate": 2.502393004665281e-05, + "loss": 0.17627745866775513, + "step": 174890 + }, + { + "epoch": 0.7508822544499111, + "grad_norm": 0.22014105319976807, + "learning_rate": 2.501961832653519e-05, + "loss": 0.048931142687797545, + "step": 174900 + }, + { + "epoch": 0.7509251865399311, + "grad_norm": 0.0031273365020751953, + "learning_rate": 2.5015306606417566e-05, + "loss": 0.2751063346862793, + "step": 174910 + }, + { + "epoch": 0.7509681186299512, + "grad_norm": 0.4453137516975403, + "learning_rate": 2.5010994886299943e-05, + "loss": 0.32244696617126467, + "step": 174920 + }, + { + "epoch": 0.7510110507199711, + "grad_norm": 2.4020941257476807, + "learning_rate": 2.5006683166182317e-05, + "loss": 0.15655696392059326, + "step": 174930 + }, + { + "epoch": 0.7510539828099911, + "grad_norm": 2.225407361984253, + "learning_rate": 2.5002371446064694e-05, + "loss": 0.32480764389038086, + "step": 174940 + }, + { + "epoch": 0.7510969149000112, + "grad_norm": 0.3002236783504486, + "learning_rate": 2.499805972594707e-05, + "loss": 0.14254547357559205, + "step": 174950 + }, + { + "epoch": 0.7511398469900312, + "grad_norm": 1.3607335090637207, + "learning_rate": 2.4993748005829446e-05, + "loss": 0.3548447132110596, + "step": 174960 + }, + { + "epoch": 0.7511827790800512, + "grad_norm": 10.988116264343262, + "learning_rate": 2.4989436285711823e-05, + "loss": 0.29602463245391847, + "step": 174970 + }, + { + "epoch": 0.7512257111700712, + "grad_norm": 1.3006998300552368, + "learning_rate": 2.49851245655942e-05, + "loss": 0.15326464176177979, + "step": 174980 + }, + { + "epoch": 0.7512686432600912, + "grad_norm": 0.04871809855103493, + "learning_rate": 2.4980812845476574e-05, + "loss": 0.08068002462387085, + "step": 174990 + }, + { + "epoch": 0.7513115753501112, + "grad_norm": 0.0009460552246309817, + "learning_rate": 2.497650112535895e-05, + "loss": 0.1050765872001648, + "step": 175000 + }, + { + "epoch": 0.7513115753501112, + "eval_loss": 0.39374786615371704, + "eval_runtime": 27.3897, + "eval_samples_per_second": 3.651, + "eval_steps_per_second": 3.651, + "step": 175000 + }, + { + "epoch": 0.7513545074401312, + "grad_norm": 0.04455732926726341, + "learning_rate": 2.497218940524133e-05, + "loss": 0.07660987973213196, + "step": 175010 + }, + { + "epoch": 0.7513974395301513, + "grad_norm": 0.011786301620304585, + "learning_rate": 2.4967877685123706e-05, + "loss": 0.24636490345001222, + "step": 175020 + }, + { + "epoch": 0.7514403716201712, + "grad_norm": 0.09234892576932907, + "learning_rate": 2.496356596500608e-05, + "loss": 0.35528485774993895, + "step": 175030 + }, + { + "epoch": 0.7514833037101912, + "grad_norm": 0.13859136402606964, + "learning_rate": 2.4959254244888458e-05, + "loss": 0.2566323518753052, + "step": 175040 + }, + { + "epoch": 0.7515262358002113, + "grad_norm": 0.0014120546402409673, + "learning_rate": 2.495494252477083e-05, + "loss": 0.06871371865272521, + "step": 175050 + }, + { + "epoch": 0.7515691678902312, + "grad_norm": 0.20913314819335938, + "learning_rate": 2.495063080465321e-05, + "loss": 0.3590980052947998, + "step": 175060 + }, + { + "epoch": 0.7516120999802512, + "grad_norm": 0.0026090750470757484, + "learning_rate": 2.4946319084535586e-05, + "loss": 0.10994521379470826, + "step": 175070 + }, + { + "epoch": 0.7516550320702713, + "grad_norm": 0.018767498433589935, + "learning_rate": 2.4942007364417964e-05, + "loss": 0.1653411030769348, + "step": 175080 + }, + { + "epoch": 0.7516979641602912, + "grad_norm": 0.011714949272572994, + "learning_rate": 2.4937695644300338e-05, + "loss": 0.17751481533050537, + "step": 175090 + }, + { + "epoch": 0.7517408962503113, + "grad_norm": 0.042523354291915894, + "learning_rate": 2.4933383924182715e-05, + "loss": 0.19038463830947877, + "step": 175100 + }, + { + "epoch": 0.7517838283403313, + "grad_norm": 1.2998957633972168, + "learning_rate": 2.492907220406509e-05, + "loss": 0.10467228889465333, + "step": 175110 + }, + { + "epoch": 0.7518267604303512, + "grad_norm": 1.3815051317214966, + "learning_rate": 2.492476048394747e-05, + "loss": 0.15653225183486938, + "step": 175120 + }, + { + "epoch": 0.7518696925203713, + "grad_norm": 9.51794719696045, + "learning_rate": 2.4920448763829844e-05, + "loss": 0.30117287635803225, + "step": 175130 + }, + { + "epoch": 0.7519126246103913, + "grad_norm": 0.8320513367652893, + "learning_rate": 2.491613704371222e-05, + "loss": 0.19667335748672485, + "step": 175140 + }, + { + "epoch": 0.7519555567004113, + "grad_norm": 1.9889274835586548, + "learning_rate": 2.4911825323594595e-05, + "loss": 0.20154480934143065, + "step": 175150 + }, + { + "epoch": 0.7519984887904313, + "grad_norm": 0.0013290152419358492, + "learning_rate": 2.4907513603476972e-05, + "loss": 0.22941241264343262, + "step": 175160 + }, + { + "epoch": 0.7520414208804513, + "grad_norm": 0.02920190989971161, + "learning_rate": 2.4903201883359346e-05, + "loss": 0.13015002012252808, + "step": 175170 + }, + { + "epoch": 0.7520843529704713, + "grad_norm": 9.561676979064941, + "learning_rate": 2.4898890163241727e-05, + "loss": 0.31256589889526365, + "step": 175180 + }, + { + "epoch": 0.7521272850604913, + "grad_norm": 1.7143646478652954, + "learning_rate": 2.48945784431241e-05, + "loss": 0.30118331909179685, + "step": 175190 + }, + { + "epoch": 0.7521702171505114, + "grad_norm": 0.371857225894928, + "learning_rate": 2.4890266723006478e-05, + "loss": 0.24887962341308595, + "step": 175200 + }, + { + "epoch": 0.7522131492405313, + "grad_norm": 1.154715657234192, + "learning_rate": 2.4885955002888852e-05, + "loss": 0.27854137420654296, + "step": 175210 + }, + { + "epoch": 0.7522560813305513, + "grad_norm": 3.212376832962036, + "learning_rate": 2.488164328277123e-05, + "loss": 0.2116297721862793, + "step": 175220 + }, + { + "epoch": 0.7522990134205714, + "grad_norm": 0.03922676295042038, + "learning_rate": 2.4877331562653607e-05, + "loss": 0.308078408241272, + "step": 175230 + }, + { + "epoch": 0.7523419455105913, + "grad_norm": 0.01890389621257782, + "learning_rate": 2.4873019842535984e-05, + "loss": 0.2405827522277832, + "step": 175240 + }, + { + "epoch": 0.7523848776006113, + "grad_norm": 6.3656487464904785, + "learning_rate": 2.4868708122418358e-05, + "loss": 0.36165671348571776, + "step": 175250 + }, + { + "epoch": 0.7524278096906314, + "grad_norm": 0.003142143599689007, + "learning_rate": 2.4864396402300735e-05, + "loss": 0.30589354038238525, + "step": 175260 + }, + { + "epoch": 0.7524707417806513, + "grad_norm": 0.10753688961267471, + "learning_rate": 2.486008468218311e-05, + "loss": 0.17673590183258056, + "step": 175270 + }, + { + "epoch": 0.7525136738706714, + "grad_norm": 2.2089762687683105, + "learning_rate": 2.4855772962065487e-05, + "loss": 0.1999528169631958, + "step": 175280 + }, + { + "epoch": 0.7525566059606914, + "grad_norm": 1.118449330329895, + "learning_rate": 2.4851461241947864e-05, + "loss": 0.15476460456848146, + "step": 175290 + }, + { + "epoch": 0.7525995380507113, + "grad_norm": 1.8658347129821777, + "learning_rate": 2.484714952183024e-05, + "loss": 0.26465458869934083, + "step": 175300 + }, + { + "epoch": 0.7526424701407314, + "grad_norm": 0.03414380922913551, + "learning_rate": 2.4842837801712615e-05, + "loss": 0.11501485109329224, + "step": 175310 + }, + { + "epoch": 0.7526854022307514, + "grad_norm": 0.054499465972185135, + "learning_rate": 2.4838526081594993e-05, + "loss": 0.13327181339263916, + "step": 175320 + }, + { + "epoch": 0.7527283343207714, + "grad_norm": 4.911616802215576, + "learning_rate": 2.483421436147737e-05, + "loss": 0.15141682624816893, + "step": 175330 + }, + { + "epoch": 0.7527712664107914, + "grad_norm": 1.755698561668396, + "learning_rate": 2.4829902641359744e-05, + "loss": 0.1645986557006836, + "step": 175340 + }, + { + "epoch": 0.7528141985008114, + "grad_norm": 1.3716384172439575, + "learning_rate": 2.482559092124212e-05, + "loss": 0.11155534982681274, + "step": 175350 + }, + { + "epoch": 0.7528571305908314, + "grad_norm": 0.03115398809313774, + "learning_rate": 2.48212792011245e-05, + "loss": 0.13327916860580444, + "step": 175360 + }, + { + "epoch": 0.7529000626808514, + "grad_norm": 1.6688318252563477, + "learning_rate": 2.4816967481006876e-05, + "loss": 0.15580270290374756, + "step": 175370 + }, + { + "epoch": 0.7529429947708715, + "grad_norm": 0.20583480596542358, + "learning_rate": 2.481265576088925e-05, + "loss": 0.02036563605070114, + "step": 175380 + }, + { + "epoch": 0.7529859268608915, + "grad_norm": 0.08086307346820831, + "learning_rate": 2.4808344040771627e-05, + "loss": 0.024845921993255617, + "step": 175390 + }, + { + "epoch": 0.7530288589509114, + "grad_norm": 0.007556057535111904, + "learning_rate": 2.4804032320654e-05, + "loss": 0.17657938003540039, + "step": 175400 + }, + { + "epoch": 0.7530717910409315, + "grad_norm": 0.0017575263045728207, + "learning_rate": 2.479972060053638e-05, + "loss": 0.186118745803833, + "step": 175410 + }, + { + "epoch": 0.7531147231309515, + "grad_norm": 1.4013792276382446, + "learning_rate": 2.4795408880418756e-05, + "loss": 0.1842397928237915, + "step": 175420 + }, + { + "epoch": 0.7531576552209714, + "grad_norm": 0.01723472774028778, + "learning_rate": 2.4791097160301133e-05, + "loss": 0.28156132698059083, + "step": 175430 + }, + { + "epoch": 0.7532005873109915, + "grad_norm": 2.1313681602478027, + "learning_rate": 2.4786785440183507e-05, + "loss": 0.21020984649658203, + "step": 175440 + }, + { + "epoch": 0.7532435194010115, + "grad_norm": 0.006951197050511837, + "learning_rate": 2.4782473720065885e-05, + "loss": 0.09738854169845582, + "step": 175450 + }, + { + "epoch": 0.7532864514910315, + "grad_norm": 6.1560444831848145, + "learning_rate": 2.477816199994826e-05, + "loss": 0.23765075206756592, + "step": 175460 + }, + { + "epoch": 0.7533293835810515, + "grad_norm": 0.4051508605480194, + "learning_rate": 2.477385027983064e-05, + "loss": 0.2988025665283203, + "step": 175470 + }, + { + "epoch": 0.7533723156710715, + "grad_norm": 0.34460052847862244, + "learning_rate": 2.4769538559713013e-05, + "loss": 0.1418423533439636, + "step": 175480 + }, + { + "epoch": 0.7534152477610915, + "grad_norm": 0.13387997448444366, + "learning_rate": 2.476522683959539e-05, + "loss": 0.10612796545028687, + "step": 175490 + }, + { + "epoch": 0.7534581798511115, + "grad_norm": 1.4258102178573608, + "learning_rate": 2.4760915119477765e-05, + "loss": 0.141604745388031, + "step": 175500 + }, + { + "epoch": 0.7535011119411316, + "grad_norm": 0.0010811506072059274, + "learning_rate": 2.4756603399360142e-05, + "loss": 0.2934016227722168, + "step": 175510 + }, + { + "epoch": 0.7535440440311515, + "grad_norm": 0.0018770555034279823, + "learning_rate": 2.4752291679242516e-05, + "loss": 0.017631618678569792, + "step": 175520 + }, + { + "epoch": 0.7535869761211715, + "grad_norm": 4.808864116668701, + "learning_rate": 2.4747979959124897e-05, + "loss": 0.3421001434326172, + "step": 175530 + }, + { + "epoch": 0.7536299082111916, + "grad_norm": 0.003242551814764738, + "learning_rate": 2.474366823900727e-05, + "loss": 0.24067187309265137, + "step": 175540 + }, + { + "epoch": 0.7536728403012115, + "grad_norm": 0.0072052436880767345, + "learning_rate": 2.4739356518889648e-05, + "loss": 0.08315362930297851, + "step": 175550 + }, + { + "epoch": 0.7537157723912316, + "grad_norm": 0.00040326855378225446, + "learning_rate": 2.4735044798772022e-05, + "loss": 0.1443575143814087, + "step": 175560 + }, + { + "epoch": 0.7537587044812516, + "grad_norm": 20.111753463745117, + "learning_rate": 2.47307330786544e-05, + "loss": 0.31186325550079347, + "step": 175570 + }, + { + "epoch": 0.7538016365712715, + "grad_norm": 0.008096352219581604, + "learning_rate": 2.4726421358536777e-05, + "loss": 0.08084225654602051, + "step": 175580 + }, + { + "epoch": 0.7538445686612916, + "grad_norm": 3.5794265270233154, + "learning_rate": 2.4722109638419154e-05, + "loss": 0.23514416217803955, + "step": 175590 + }, + { + "epoch": 0.7538875007513116, + "grad_norm": 0.01341279223561287, + "learning_rate": 2.4717797918301528e-05, + "loss": 0.17565954923629762, + "step": 175600 + }, + { + "epoch": 0.7539304328413315, + "grad_norm": 0.15792067348957062, + "learning_rate": 2.4713486198183905e-05, + "loss": 0.10981240272521972, + "step": 175610 + }, + { + "epoch": 0.7539733649313516, + "grad_norm": 0.02763104997575283, + "learning_rate": 2.470917447806628e-05, + "loss": 0.10967028141021729, + "step": 175620 + }, + { + "epoch": 0.7540162970213716, + "grad_norm": 3.09664249420166, + "learning_rate": 2.4704862757948656e-05, + "loss": 0.14928872585296632, + "step": 175630 + }, + { + "epoch": 0.7540592291113916, + "grad_norm": 3.242180347442627, + "learning_rate": 2.4700551037831034e-05, + "loss": 0.19187842607498168, + "step": 175640 + }, + { + "epoch": 0.7541021612014116, + "grad_norm": 0.0028528040274977684, + "learning_rate": 2.469623931771341e-05, + "loss": 0.005672257766127587, + "step": 175650 + }, + { + "epoch": 0.7541450932914316, + "grad_norm": 0.5106819272041321, + "learning_rate": 2.4691927597595785e-05, + "loss": 0.16544620990753173, + "step": 175660 + }, + { + "epoch": 0.7541880253814516, + "grad_norm": 1.5341259241104126, + "learning_rate": 2.4687615877478162e-05, + "loss": 0.2775939226150513, + "step": 175670 + }, + { + "epoch": 0.7542309574714716, + "grad_norm": 0.32357120513916016, + "learning_rate": 2.4683304157360536e-05, + "loss": 0.1690353274345398, + "step": 175680 + }, + { + "epoch": 0.7542738895614917, + "grad_norm": 0.04019639268517494, + "learning_rate": 2.4678992437242914e-05, + "loss": 0.21661920547485353, + "step": 175690 + }, + { + "epoch": 0.7543168216515116, + "grad_norm": 0.0546407625079155, + "learning_rate": 2.467468071712529e-05, + "loss": 0.18709558248519897, + "step": 175700 + }, + { + "epoch": 0.7543597537415316, + "grad_norm": 0.029099030420184135, + "learning_rate": 2.467036899700767e-05, + "loss": 0.17751796245574952, + "step": 175710 + }, + { + "epoch": 0.7544026858315517, + "grad_norm": 0.00495111383497715, + "learning_rate": 2.4666057276890046e-05, + "loss": 0.14952001571655274, + "step": 175720 + }, + { + "epoch": 0.7544456179215716, + "grad_norm": 2.223857879638672, + "learning_rate": 2.466174555677242e-05, + "loss": 0.34983224868774415, + "step": 175730 + }, + { + "epoch": 0.7544885500115917, + "grad_norm": 0.016578232869505882, + "learning_rate": 2.4657433836654797e-05, + "loss": 0.15543704032897948, + "step": 175740 + }, + { + "epoch": 0.7545314821016117, + "grad_norm": 0.9805734157562256, + "learning_rate": 2.465312211653717e-05, + "loss": 0.2730615854263306, + "step": 175750 + }, + { + "epoch": 0.7545744141916316, + "grad_norm": 0.12643659114837646, + "learning_rate": 2.464881039641955e-05, + "loss": 0.07886726260185242, + "step": 175760 + }, + { + "epoch": 0.7546173462816517, + "grad_norm": 2.201289176940918, + "learning_rate": 2.4644498676301926e-05, + "loss": 0.30348107814788816, + "step": 175770 + }, + { + "epoch": 0.7546602783716717, + "grad_norm": 1.6394163370132446, + "learning_rate": 2.4640186956184303e-05, + "loss": 0.31897358894348143, + "step": 175780 + }, + { + "epoch": 0.7547032104616916, + "grad_norm": 0.00793420895934105, + "learning_rate": 2.4635875236066677e-05, + "loss": 0.35818772315979003, + "step": 175790 + }, + { + "epoch": 0.7547461425517117, + "grad_norm": 0.0008388396818190813, + "learning_rate": 2.4631563515949054e-05, + "loss": 0.1866925001144409, + "step": 175800 + }, + { + "epoch": 0.7547890746417317, + "grad_norm": 0.001753473188728094, + "learning_rate": 2.4627251795831428e-05, + "loss": 0.368836498260498, + "step": 175810 + }, + { + "epoch": 0.7548320067317518, + "grad_norm": 0.00452902726829052, + "learning_rate": 2.462294007571381e-05, + "loss": 0.1541559338569641, + "step": 175820 + }, + { + "epoch": 0.7548749388217717, + "grad_norm": 0.17083846032619476, + "learning_rate": 2.4618628355596183e-05, + "loss": 0.12283452749252319, + "step": 175830 + }, + { + "epoch": 0.7549178709117917, + "grad_norm": 7.021027088165283, + "learning_rate": 2.461431663547856e-05, + "loss": 0.34672794342041013, + "step": 175840 + }, + { + "epoch": 0.7549608030018118, + "grad_norm": 0.008842471987009048, + "learning_rate": 2.4610004915360934e-05, + "loss": 0.4350734233856201, + "step": 175850 + }, + { + "epoch": 0.7550037350918317, + "grad_norm": 2.1589739322662354, + "learning_rate": 2.460569319524331e-05, + "loss": 0.1623774290084839, + "step": 175860 + }, + { + "epoch": 0.7550466671818518, + "grad_norm": 1.3766883611679077, + "learning_rate": 2.4601381475125686e-05, + "loss": 0.24213268756866455, + "step": 175870 + }, + { + "epoch": 0.7550895992718718, + "grad_norm": 1.2221431732177734, + "learning_rate": 2.4597069755008066e-05, + "loss": 0.19608230590820314, + "step": 175880 + }, + { + "epoch": 0.7551325313618917, + "grad_norm": 4.313803672790527, + "learning_rate": 2.459275803489044e-05, + "loss": 0.32244043350219725, + "step": 175890 + }, + { + "epoch": 0.7551754634519118, + "grad_norm": 0.08472935110330582, + "learning_rate": 2.4588446314772818e-05, + "loss": 0.07666860222816467, + "step": 175900 + }, + { + "epoch": 0.7552183955419318, + "grad_norm": 0.015484074130654335, + "learning_rate": 2.458413459465519e-05, + "loss": 0.17252891063690184, + "step": 175910 + }, + { + "epoch": 0.7552613276319518, + "grad_norm": 0.014127151109278202, + "learning_rate": 2.457982287453757e-05, + "loss": 0.1840136766433716, + "step": 175920 + }, + { + "epoch": 0.7553042597219718, + "grad_norm": 0.28970640897750854, + "learning_rate": 2.4575511154419943e-05, + "loss": 0.21439287662506104, + "step": 175930 + }, + { + "epoch": 0.7553471918119918, + "grad_norm": 0.0020846251863986254, + "learning_rate": 2.4571199434302324e-05, + "loss": 0.13129196166992188, + "step": 175940 + }, + { + "epoch": 0.7553901239020118, + "grad_norm": 0.0034930300898849964, + "learning_rate": 2.4566887714184698e-05, + "loss": 0.3986776828765869, + "step": 175950 + }, + { + "epoch": 0.7554330559920318, + "grad_norm": 0.006673635449260473, + "learning_rate": 2.4562575994067075e-05, + "loss": 0.21093640327453614, + "step": 175960 + }, + { + "epoch": 0.7554759880820519, + "grad_norm": 1.6050211191177368, + "learning_rate": 2.455826427394945e-05, + "loss": 0.15360102653503419, + "step": 175970 + }, + { + "epoch": 0.7555189201720718, + "grad_norm": 2.7723329067230225, + "learning_rate": 2.4553952553831826e-05, + "loss": 0.11786471605300904, + "step": 175980 + }, + { + "epoch": 0.7555618522620918, + "grad_norm": 0.004894792102277279, + "learning_rate": 2.4549640833714204e-05, + "loss": 0.11137468814849853, + "step": 175990 + }, + { + "epoch": 0.7556047843521119, + "grad_norm": 0.0016606005374342203, + "learning_rate": 2.454532911359658e-05, + "loss": 0.2943387031555176, + "step": 176000 + }, + { + "epoch": 0.7556047843521119, + "eval_loss": 0.3844696879386902, + "eval_runtime": 27.4381, + "eval_samples_per_second": 3.645, + "eval_steps_per_second": 3.645, + "step": 176000 + }, + { + "epoch": 0.7556477164421318, + "grad_norm": 3.118253469467163, + "learning_rate": 2.4541017393478955e-05, + "loss": 0.13530291318893434, + "step": 176010 + }, + { + "epoch": 0.7556906485321518, + "grad_norm": 0.21326979994773865, + "learning_rate": 2.4536705673361332e-05, + "loss": 0.449766731262207, + "step": 176020 + }, + { + "epoch": 0.7557335806221719, + "grad_norm": 0.07444790750741959, + "learning_rate": 2.4532393953243706e-05, + "loss": 0.0024720698595046995, + "step": 176030 + }, + { + "epoch": 0.7557765127121918, + "grad_norm": 1.2973755598068237, + "learning_rate": 2.4528082233126083e-05, + "loss": 0.26403958797454835, + "step": 176040 + }, + { + "epoch": 0.7558194448022119, + "grad_norm": 0.0019384416518732905, + "learning_rate": 2.452377051300846e-05, + "loss": 0.13067800998687745, + "step": 176050 + }, + { + "epoch": 0.7558623768922319, + "grad_norm": 0.9657537937164307, + "learning_rate": 2.4519458792890838e-05, + "loss": 0.20171825885772704, + "step": 176060 + }, + { + "epoch": 0.7559053089822518, + "grad_norm": 0.0036415038630366325, + "learning_rate": 2.4515147072773216e-05, + "loss": 0.2104336977005005, + "step": 176070 + }, + { + "epoch": 0.7559482410722719, + "grad_norm": 3.9165921211242676, + "learning_rate": 2.451083535265559e-05, + "loss": 0.4046797752380371, + "step": 176080 + }, + { + "epoch": 0.7559911731622919, + "grad_norm": 0.004895086400210857, + "learning_rate": 2.4506523632537967e-05, + "loss": 0.2396653413772583, + "step": 176090 + }, + { + "epoch": 0.7560341052523118, + "grad_norm": 7.71179723739624, + "learning_rate": 2.450221191242034e-05, + "loss": 0.17154238224029542, + "step": 176100 + }, + { + "epoch": 0.7560770373423319, + "grad_norm": 0.16281859576702118, + "learning_rate": 2.4497900192302718e-05, + "loss": 0.2616174936294556, + "step": 176110 + }, + { + "epoch": 0.7561199694323519, + "grad_norm": 3.693711757659912, + "learning_rate": 2.4493588472185095e-05, + "loss": 0.32230591773986816, + "step": 176120 + }, + { + "epoch": 0.7561629015223719, + "grad_norm": 0.003052955726161599, + "learning_rate": 2.4489276752067473e-05, + "loss": 0.24035139083862306, + "step": 176130 + }, + { + "epoch": 0.7562058336123919, + "grad_norm": 5.051456451416016, + "learning_rate": 2.4484965031949847e-05, + "loss": 0.368407154083252, + "step": 176140 + }, + { + "epoch": 0.756248765702412, + "grad_norm": 0.01023197453469038, + "learning_rate": 2.4480653311832224e-05, + "loss": 0.2336270809173584, + "step": 176150 + }, + { + "epoch": 0.7562916977924319, + "grad_norm": 0.053483642637729645, + "learning_rate": 2.4476341591714598e-05, + "loss": 0.07102344632148742, + "step": 176160 + }, + { + "epoch": 0.7563346298824519, + "grad_norm": 0.002425705548375845, + "learning_rate": 2.4472029871596975e-05, + "loss": 0.19212217330932618, + "step": 176170 + }, + { + "epoch": 0.756377561972472, + "grad_norm": 0.0017519342945888638, + "learning_rate": 2.4467718151479353e-05, + "loss": 0.4195821285247803, + "step": 176180 + }, + { + "epoch": 0.7564204940624919, + "grad_norm": 0.8289617896080017, + "learning_rate": 2.446340643136173e-05, + "loss": 0.2355353355407715, + "step": 176190 + }, + { + "epoch": 0.7564634261525119, + "grad_norm": 0.9240083694458008, + "learning_rate": 2.4459094711244104e-05, + "loss": 0.11165834665298462, + "step": 176200 + }, + { + "epoch": 0.756506358242532, + "grad_norm": 3.18680739402771, + "learning_rate": 2.445478299112648e-05, + "loss": 0.3855263710021973, + "step": 176210 + }, + { + "epoch": 0.7565492903325519, + "grad_norm": 0.004274186212569475, + "learning_rate": 2.4450471271008855e-05, + "loss": 0.07496775388717651, + "step": 176220 + }, + { + "epoch": 0.756592222422572, + "grad_norm": 0.28305789828300476, + "learning_rate": 2.4446159550891236e-05, + "loss": 0.11907908916473389, + "step": 176230 + }, + { + "epoch": 0.756635154512592, + "grad_norm": 0.0008765619131736457, + "learning_rate": 2.444184783077361e-05, + "loss": 0.014611579477787018, + "step": 176240 + }, + { + "epoch": 0.756678086602612, + "grad_norm": 0.030822429805994034, + "learning_rate": 2.4437536110655987e-05, + "loss": 0.17702211141586305, + "step": 176250 + }, + { + "epoch": 0.756721018692632, + "grad_norm": 0.00023859924112912267, + "learning_rate": 2.443322439053836e-05, + "loss": 0.21805362701416015, + "step": 176260 + }, + { + "epoch": 0.756763950782652, + "grad_norm": 0.06023327261209488, + "learning_rate": 2.442891267042074e-05, + "loss": 0.23725461959838867, + "step": 176270 + }, + { + "epoch": 0.7568068828726721, + "grad_norm": 0.06006970629096031, + "learning_rate": 2.4424600950303113e-05, + "loss": 0.07275346517562867, + "step": 176280 + }, + { + "epoch": 0.756849814962692, + "grad_norm": 2.060161590576172, + "learning_rate": 2.4420289230185493e-05, + "loss": 0.469630765914917, + "step": 176290 + }, + { + "epoch": 0.756892747052712, + "grad_norm": 0.01709127053618431, + "learning_rate": 2.4415977510067867e-05, + "loss": 0.29026083946228026, + "step": 176300 + }, + { + "epoch": 0.7569356791427321, + "grad_norm": 1.0979907512664795, + "learning_rate": 2.4411665789950245e-05, + "loss": 0.152878737449646, + "step": 176310 + }, + { + "epoch": 0.756978611232752, + "grad_norm": 0.0037508816458284855, + "learning_rate": 2.440735406983262e-05, + "loss": 0.14382373094558715, + "step": 176320 + }, + { + "epoch": 0.757021543322772, + "grad_norm": 3.9158923625946045, + "learning_rate": 2.4403042349714996e-05, + "loss": 0.27426795959472655, + "step": 176330 + }, + { + "epoch": 0.7570644754127921, + "grad_norm": 0.7643011808395386, + "learning_rate": 2.4398730629597373e-05, + "loss": 0.14274754524230956, + "step": 176340 + }, + { + "epoch": 0.757107407502812, + "grad_norm": 0.0008324419031850994, + "learning_rate": 2.439441890947975e-05, + "loss": 0.22440826892852783, + "step": 176350 + }, + { + "epoch": 0.7571503395928321, + "grad_norm": 1.3607858419418335, + "learning_rate": 2.4390107189362125e-05, + "loss": 0.1967037558555603, + "step": 176360 + }, + { + "epoch": 0.7571932716828521, + "grad_norm": 0.0033644582144916058, + "learning_rate": 2.4385795469244502e-05, + "loss": 0.12327347993850708, + "step": 176370 + }, + { + "epoch": 0.757236203772872, + "grad_norm": 0.0029008015990257263, + "learning_rate": 2.4381483749126876e-05, + "loss": 0.21013979911804198, + "step": 176380 + }, + { + "epoch": 0.7572791358628921, + "grad_norm": 0.00942598469555378, + "learning_rate": 2.4377172029009253e-05, + "loss": 0.0036503538489341737, + "step": 176390 + }, + { + "epoch": 0.7573220679529121, + "grad_norm": 1.111317753791809, + "learning_rate": 2.437286030889163e-05, + "loss": 0.19102848768234254, + "step": 176400 + }, + { + "epoch": 0.7573650000429321, + "grad_norm": 4.567811965942383, + "learning_rate": 2.4368548588774008e-05, + "loss": 0.16488993167877197, + "step": 176410 + }, + { + "epoch": 0.7574079321329521, + "grad_norm": 0.011961126700043678, + "learning_rate": 2.4364236868656385e-05, + "loss": 0.15731921195983886, + "step": 176420 + }, + { + "epoch": 0.7574508642229721, + "grad_norm": 0.000113897658593487, + "learning_rate": 2.435992514853876e-05, + "loss": 0.3114840030670166, + "step": 176430 + }, + { + "epoch": 0.7574937963129921, + "grad_norm": 1.7821091413497925, + "learning_rate": 2.4355613428421137e-05, + "loss": 0.4081563949584961, + "step": 176440 + }, + { + "epoch": 0.7575367284030121, + "grad_norm": 0.09143106639385223, + "learning_rate": 2.435130170830351e-05, + "loss": 0.18028711080551146, + "step": 176450 + }, + { + "epoch": 0.7575796604930322, + "grad_norm": 1.0728000402450562, + "learning_rate": 2.4346989988185888e-05, + "loss": 0.25679965019226075, + "step": 176460 + }, + { + "epoch": 0.7576225925830521, + "grad_norm": 1.9048360586166382, + "learning_rate": 2.4342678268068265e-05, + "loss": 0.1729556918144226, + "step": 176470 + }, + { + "epoch": 0.7576655246730721, + "grad_norm": 1.2008455991744995, + "learning_rate": 2.4338366547950643e-05, + "loss": 0.2813676118850708, + "step": 176480 + }, + { + "epoch": 0.7577084567630922, + "grad_norm": 0.7849253416061401, + "learning_rate": 2.4334054827833016e-05, + "loss": 0.24318392276763917, + "step": 176490 + }, + { + "epoch": 0.7577513888531121, + "grad_norm": 0.015166381374001503, + "learning_rate": 2.4329743107715394e-05, + "loss": 0.14935698509216308, + "step": 176500 + }, + { + "epoch": 0.7577943209431321, + "grad_norm": 0.3513484299182892, + "learning_rate": 2.4325431387597768e-05, + "loss": 0.09348582029342652, + "step": 176510 + }, + { + "epoch": 0.7578372530331522, + "grad_norm": 2.0167341232299805, + "learning_rate": 2.4321119667480145e-05, + "loss": 0.15381017923355103, + "step": 176520 + }, + { + "epoch": 0.7578801851231721, + "grad_norm": 2.2151646614074707, + "learning_rate": 2.4316807947362522e-05, + "loss": 0.25024800300598143, + "step": 176530 + }, + { + "epoch": 0.7579231172131922, + "grad_norm": 4.011649131774902, + "learning_rate": 2.43124962272449e-05, + "loss": 0.31911654472351075, + "step": 176540 + }, + { + "epoch": 0.7579660493032122, + "grad_norm": 0.0012101922184228897, + "learning_rate": 2.4308184507127274e-05, + "loss": 0.1742846131324768, + "step": 176550 + }, + { + "epoch": 0.7580089813932321, + "grad_norm": 0.0010554291075095534, + "learning_rate": 2.430387278700965e-05, + "loss": 0.16264034509658815, + "step": 176560 + }, + { + "epoch": 0.7580519134832522, + "grad_norm": 2.1255459785461426, + "learning_rate": 2.4299561066892025e-05, + "loss": 0.22510194778442383, + "step": 176570 + }, + { + "epoch": 0.7580948455732722, + "grad_norm": 5.232020378112793, + "learning_rate": 2.4295249346774406e-05, + "loss": 0.4652894973754883, + "step": 176580 + }, + { + "epoch": 0.7581377776632922, + "grad_norm": 0.01214161328971386, + "learning_rate": 2.429093762665678e-05, + "loss": 0.21678454875946046, + "step": 176590 + }, + { + "epoch": 0.7581807097533122, + "grad_norm": 0.051134634763002396, + "learning_rate": 2.4286625906539157e-05, + "loss": 0.17276641130447387, + "step": 176600 + }, + { + "epoch": 0.7582236418433322, + "grad_norm": 0.879493236541748, + "learning_rate": 2.428231418642153e-05, + "loss": 0.23750553131103516, + "step": 176610 + }, + { + "epoch": 0.7582665739333522, + "grad_norm": 0.2266494333744049, + "learning_rate": 2.427800246630391e-05, + "loss": 0.14086905717849732, + "step": 176620 + }, + { + "epoch": 0.7583095060233722, + "grad_norm": 5.996125221252441, + "learning_rate": 2.4273690746186282e-05, + "loss": 0.16965644359588622, + "step": 176630 + }, + { + "epoch": 0.7583524381133923, + "grad_norm": 0.20658458769321442, + "learning_rate": 2.4269379026068663e-05, + "loss": 0.25354115962982177, + "step": 176640 + }, + { + "epoch": 0.7583953702034122, + "grad_norm": 0.07418932020664215, + "learning_rate": 2.4265067305951037e-05, + "loss": 0.27120108604431153, + "step": 176650 + }, + { + "epoch": 0.7584383022934322, + "grad_norm": 0.009839876554906368, + "learning_rate": 2.4260755585833414e-05, + "loss": 0.3132223844528198, + "step": 176660 + }, + { + "epoch": 0.7584812343834523, + "grad_norm": 0.00416004192084074, + "learning_rate": 2.425644386571579e-05, + "loss": 0.35200812816619875, + "step": 176670 + }, + { + "epoch": 0.7585241664734723, + "grad_norm": 0.0022613676264882088, + "learning_rate": 2.4252132145598166e-05, + "loss": 0.13670860528945922, + "step": 176680 + }, + { + "epoch": 0.7585670985634922, + "grad_norm": 0.19223640859127045, + "learning_rate": 2.4247820425480543e-05, + "loss": 0.06737182736396789, + "step": 176690 + }, + { + "epoch": 0.7586100306535123, + "grad_norm": 1.3617298603057861, + "learning_rate": 2.424350870536292e-05, + "loss": 0.0552653968334198, + "step": 176700 + }, + { + "epoch": 0.7586529627435323, + "grad_norm": 0.01469328347593546, + "learning_rate": 2.4239196985245294e-05, + "loss": 0.16656371355056762, + "step": 176710 + }, + { + "epoch": 0.7586958948335523, + "grad_norm": 0.01902511715888977, + "learning_rate": 2.423488526512767e-05, + "loss": 0.16330991983413695, + "step": 176720 + }, + { + "epoch": 0.7587388269235723, + "grad_norm": 0.26943960785865784, + "learning_rate": 2.4230573545010046e-05, + "loss": 0.22292008399963378, + "step": 176730 + }, + { + "epoch": 0.7587817590135923, + "grad_norm": 0.25879213213920593, + "learning_rate": 2.4226261824892423e-05, + "loss": 0.17576502561569213, + "step": 176740 + }, + { + "epoch": 0.7588246911036123, + "grad_norm": 0.42930835485458374, + "learning_rate": 2.42219501047748e-05, + "loss": 0.15430418252944947, + "step": 176750 + }, + { + "epoch": 0.7588676231936323, + "grad_norm": 1.0486270189285278, + "learning_rate": 2.4217638384657178e-05, + "loss": 0.4111494064331055, + "step": 176760 + }, + { + "epoch": 0.7589105552836524, + "grad_norm": 1.5241907835006714, + "learning_rate": 2.421332666453955e-05, + "loss": 0.15125579833984376, + "step": 176770 + }, + { + "epoch": 0.7589534873736723, + "grad_norm": 3.3031556606292725, + "learning_rate": 2.420901494442193e-05, + "loss": 0.05916283130645752, + "step": 176780 + }, + { + "epoch": 0.7589964194636923, + "grad_norm": 0.0032096717040985823, + "learning_rate": 2.4204703224304306e-05, + "loss": 0.1436079502105713, + "step": 176790 + }, + { + "epoch": 0.7590393515537124, + "grad_norm": 0.14260707795619965, + "learning_rate": 2.420039150418668e-05, + "loss": 0.3289307117462158, + "step": 176800 + }, + { + "epoch": 0.7590822836437323, + "grad_norm": 4.995758533477783, + "learning_rate": 2.4196079784069058e-05, + "loss": 0.2021782875061035, + "step": 176810 + }, + { + "epoch": 0.7591252157337524, + "grad_norm": 15.415989875793457, + "learning_rate": 2.4191768063951435e-05, + "loss": 0.3475226163864136, + "step": 176820 + }, + { + "epoch": 0.7591681478237724, + "grad_norm": 2.6475751399993896, + "learning_rate": 2.4187456343833812e-05, + "loss": 0.22502753734588624, + "step": 176830 + }, + { + "epoch": 0.7592110799137923, + "grad_norm": 5.432012557983398, + "learning_rate": 2.4183144623716186e-05, + "loss": 0.15463144779205323, + "step": 176840 + }, + { + "epoch": 0.7592540120038124, + "grad_norm": 1.455808401107788, + "learning_rate": 2.4178832903598564e-05, + "loss": 0.3014279127120972, + "step": 176850 + }, + { + "epoch": 0.7592969440938324, + "grad_norm": 0.05897454172372818, + "learning_rate": 2.4174521183480938e-05, + "loss": 0.22337071895599364, + "step": 176860 + }, + { + "epoch": 0.7593398761838523, + "grad_norm": 1.421477198600769, + "learning_rate": 2.4170209463363315e-05, + "loss": 0.36787757873535154, + "step": 176870 + }, + { + "epoch": 0.7593828082738724, + "grad_norm": 25.1231746673584, + "learning_rate": 2.4165897743245692e-05, + "loss": 0.10162899494171143, + "step": 176880 + }, + { + "epoch": 0.7594257403638924, + "grad_norm": 0.4189836382865906, + "learning_rate": 2.416158602312807e-05, + "loss": 0.3461848258972168, + "step": 176890 + }, + { + "epoch": 0.7594686724539124, + "grad_norm": 3.7396621704101562, + "learning_rate": 2.4157274303010443e-05, + "loss": 0.12798227071762086, + "step": 176900 + }, + { + "epoch": 0.7595116045439324, + "grad_norm": 0.008923502638936043, + "learning_rate": 2.415296258289282e-05, + "loss": 0.15080041885375978, + "step": 176910 + }, + { + "epoch": 0.7595545366339524, + "grad_norm": 0.0778871700167656, + "learning_rate": 2.4148650862775195e-05, + "loss": 0.04988257884979248, + "step": 176920 + }, + { + "epoch": 0.7595974687239724, + "grad_norm": 0.15040385723114014, + "learning_rate": 2.4144339142657572e-05, + "loss": 0.3418047666549683, + "step": 176930 + }, + { + "epoch": 0.7596404008139924, + "grad_norm": 54.44385528564453, + "learning_rate": 2.414002742253995e-05, + "loss": 0.30226004123687744, + "step": 176940 + }, + { + "epoch": 0.7596833329040125, + "grad_norm": 3.170269250869751, + "learning_rate": 2.4135715702422327e-05, + "loss": 0.1979185461997986, + "step": 176950 + }, + { + "epoch": 0.7597262649940324, + "grad_norm": 1.6455987691879272, + "learning_rate": 2.41314039823047e-05, + "loss": 0.18162251710891725, + "step": 176960 + }, + { + "epoch": 0.7597691970840524, + "grad_norm": 1.4148166179656982, + "learning_rate": 2.4127092262187078e-05, + "loss": 0.16824384927749633, + "step": 176970 + }, + { + "epoch": 0.7598121291740725, + "grad_norm": 0.18569713830947876, + "learning_rate": 2.4122780542069452e-05, + "loss": 0.12295417785644532, + "step": 176980 + }, + { + "epoch": 0.7598550612640924, + "grad_norm": 2.242105007171631, + "learning_rate": 2.4118468821951833e-05, + "loss": 0.1427332878112793, + "step": 176990 + }, + { + "epoch": 0.7598979933541125, + "grad_norm": 0.004524548072367907, + "learning_rate": 2.4114157101834207e-05, + "loss": 0.3069367647171021, + "step": 177000 + }, + { + "epoch": 0.7598979933541125, + "eval_loss": 0.39005473256111145, + "eval_runtime": 27.4316, + "eval_samples_per_second": 3.645, + "eval_steps_per_second": 3.645, + "step": 177000 + }, + { + "epoch": 0.7599409254441325, + "grad_norm": 0.8188188672065735, + "learning_rate": 2.4109845381716584e-05, + "loss": 0.17687420845031737, + "step": 177010 + }, + { + "epoch": 0.7599838575341524, + "grad_norm": 0.44165608286857605, + "learning_rate": 2.4105533661598958e-05, + "loss": 0.06316535472869873, + "step": 177020 + }, + { + "epoch": 0.7600267896241725, + "grad_norm": 2.4210214614868164, + "learning_rate": 2.4101221941481335e-05, + "loss": 0.49973325729370116, + "step": 177030 + }, + { + "epoch": 0.7600697217141925, + "grad_norm": 0.012840594165027142, + "learning_rate": 2.409691022136371e-05, + "loss": 0.06451746821403503, + "step": 177040 + }, + { + "epoch": 0.7601126538042124, + "grad_norm": 0.003341269213706255, + "learning_rate": 2.409259850124609e-05, + "loss": 0.1332295298576355, + "step": 177050 + }, + { + "epoch": 0.7601555858942325, + "grad_norm": 1.3087090253829956, + "learning_rate": 2.4088286781128464e-05, + "loss": 0.11503291130065918, + "step": 177060 + }, + { + "epoch": 0.7601985179842525, + "grad_norm": 0.07258166372776031, + "learning_rate": 2.408397506101084e-05, + "loss": 0.18839584589004515, + "step": 177070 + }, + { + "epoch": 0.7602414500742725, + "grad_norm": 0.07161829620599747, + "learning_rate": 2.4079663340893215e-05, + "loss": 0.07642164826393127, + "step": 177080 + }, + { + "epoch": 0.7602843821642925, + "grad_norm": 0.0005039049428887665, + "learning_rate": 2.4075351620775593e-05, + "loss": 0.17287344932556153, + "step": 177090 + }, + { + "epoch": 0.7603273142543125, + "grad_norm": 0.0029817174654453993, + "learning_rate": 2.407103990065797e-05, + "loss": 0.1694784641265869, + "step": 177100 + }, + { + "epoch": 0.7603702463443326, + "grad_norm": 1.9690752029418945, + "learning_rate": 2.4066728180540347e-05, + "loss": 0.1936242938041687, + "step": 177110 + }, + { + "epoch": 0.7604131784343525, + "grad_norm": 0.013945703394711018, + "learning_rate": 2.406241646042272e-05, + "loss": 0.18476022481918336, + "step": 177120 + }, + { + "epoch": 0.7604561105243726, + "grad_norm": 0.012228801846504211, + "learning_rate": 2.40581047403051e-05, + "loss": 0.10598087310791016, + "step": 177130 + }, + { + "epoch": 0.7604990426143926, + "grad_norm": 0.026704225689172745, + "learning_rate": 2.4053793020187473e-05, + "loss": 0.11414153575897217, + "step": 177140 + }, + { + "epoch": 0.7605419747044125, + "grad_norm": 0.03746294975280762, + "learning_rate": 2.404948130006985e-05, + "loss": 0.07400650978088379, + "step": 177150 + }, + { + "epoch": 0.7605849067944326, + "grad_norm": 2.7711026668548584, + "learning_rate": 2.4045169579952227e-05, + "loss": 0.19555963277816774, + "step": 177160 + }, + { + "epoch": 0.7606278388844526, + "grad_norm": 3.9885945320129395, + "learning_rate": 2.4040857859834605e-05, + "loss": 0.20378420352935792, + "step": 177170 + }, + { + "epoch": 0.7606707709744726, + "grad_norm": 0.01776500605046749, + "learning_rate": 2.4036546139716982e-05, + "loss": 0.2603089094161987, + "step": 177180 + }, + { + "epoch": 0.7607137030644926, + "grad_norm": 0.15067414939403534, + "learning_rate": 2.4032234419599356e-05, + "loss": 0.2957641124725342, + "step": 177190 + }, + { + "epoch": 0.7607566351545126, + "grad_norm": 1.563685655593872, + "learning_rate": 2.4027922699481733e-05, + "loss": 0.1789721965789795, + "step": 177200 + }, + { + "epoch": 0.7607995672445326, + "grad_norm": 0.46982231736183167, + "learning_rate": 2.4023610979364107e-05, + "loss": 0.23951370716094972, + "step": 177210 + }, + { + "epoch": 0.7608424993345526, + "grad_norm": 7.683855056762695, + "learning_rate": 2.4019299259246485e-05, + "loss": 0.35542027950286864, + "step": 177220 + }, + { + "epoch": 0.7608854314245727, + "grad_norm": 0.038922760635614395, + "learning_rate": 2.4014987539128862e-05, + "loss": 0.05637596845626831, + "step": 177230 + }, + { + "epoch": 0.7609283635145926, + "grad_norm": 0.03396385908126831, + "learning_rate": 2.401067581901124e-05, + "loss": 0.2034245252609253, + "step": 177240 + }, + { + "epoch": 0.7609712956046126, + "grad_norm": 5.346138000488281, + "learning_rate": 2.4006364098893613e-05, + "loss": 0.10607466697692872, + "step": 177250 + }, + { + "epoch": 0.7610142276946327, + "grad_norm": 0.0009848641930148005, + "learning_rate": 2.400205237877599e-05, + "loss": 0.23548879623413085, + "step": 177260 + }, + { + "epoch": 0.7610571597846526, + "grad_norm": 0.8244641423225403, + "learning_rate": 2.3997740658658365e-05, + "loss": 0.13387619256973265, + "step": 177270 + }, + { + "epoch": 0.7611000918746726, + "grad_norm": 0.0011648483341559768, + "learning_rate": 2.3993428938540742e-05, + "loss": 0.09976664781570435, + "step": 177280 + }, + { + "epoch": 0.7611430239646927, + "grad_norm": 5.723977088928223, + "learning_rate": 2.398911721842312e-05, + "loss": 0.24518101215362548, + "step": 177290 + }, + { + "epoch": 0.7611859560547126, + "grad_norm": 0.1946738213300705, + "learning_rate": 2.3984805498305497e-05, + "loss": 0.2515752077102661, + "step": 177300 + }, + { + "epoch": 0.7612288881447327, + "grad_norm": 0.001265937928110361, + "learning_rate": 2.398049377818787e-05, + "loss": 0.16979835033416749, + "step": 177310 + }, + { + "epoch": 0.7612718202347527, + "grad_norm": 0.04173870384693146, + "learning_rate": 2.3976182058070248e-05, + "loss": 0.35801122188568113, + "step": 177320 + }, + { + "epoch": 0.7613147523247726, + "grad_norm": 1.011568546295166, + "learning_rate": 2.3971870337952622e-05, + "loss": 0.31412568092346194, + "step": 177330 + }, + { + "epoch": 0.7613576844147927, + "grad_norm": 0.6044009923934937, + "learning_rate": 2.3967558617835003e-05, + "loss": 0.2454387664794922, + "step": 177340 + }, + { + "epoch": 0.7614006165048127, + "grad_norm": 0.006332141347229481, + "learning_rate": 2.3963246897717376e-05, + "loss": 0.3005851984024048, + "step": 177350 + }, + { + "epoch": 0.7614435485948327, + "grad_norm": 0.3464721739292145, + "learning_rate": 2.3958935177599754e-05, + "loss": 0.1406756043434143, + "step": 177360 + }, + { + "epoch": 0.7614864806848527, + "grad_norm": 0.14981555938720703, + "learning_rate": 2.3954623457482128e-05, + "loss": 0.11905041933059693, + "step": 177370 + }, + { + "epoch": 0.7615294127748727, + "grad_norm": 1.5472999811172485, + "learning_rate": 2.3950311737364505e-05, + "loss": 0.1065073013305664, + "step": 177380 + }, + { + "epoch": 0.7615723448648927, + "grad_norm": 0.0009052807581610978, + "learning_rate": 2.394600001724688e-05, + "loss": 0.22083621025085448, + "step": 177390 + }, + { + "epoch": 0.7616152769549127, + "grad_norm": 0.0028880308382213116, + "learning_rate": 2.394168829712926e-05, + "loss": 0.22731993198394776, + "step": 177400 + }, + { + "epoch": 0.7616582090449328, + "grad_norm": 0.006786768790334463, + "learning_rate": 2.3937376577011634e-05, + "loss": 0.05157320499420166, + "step": 177410 + }, + { + "epoch": 0.7617011411349527, + "grad_norm": 1.7782171964645386, + "learning_rate": 2.393306485689401e-05, + "loss": 0.28482072353363036, + "step": 177420 + }, + { + "epoch": 0.7617440732249727, + "grad_norm": 0.010772820562124252, + "learning_rate": 2.3928753136776385e-05, + "loss": 0.15599217414855956, + "step": 177430 + }, + { + "epoch": 0.7617870053149928, + "grad_norm": 0.3834504187107086, + "learning_rate": 2.3924441416658762e-05, + "loss": 0.06821857094764709, + "step": 177440 + }, + { + "epoch": 0.7618299374050127, + "grad_norm": 10.938364028930664, + "learning_rate": 2.392012969654114e-05, + "loss": 0.11866967678070069, + "step": 177450 + }, + { + "epoch": 0.7618728694950327, + "grad_norm": 7.032254219055176, + "learning_rate": 2.3915817976423517e-05, + "loss": 0.20671021938323975, + "step": 177460 + }, + { + "epoch": 0.7619158015850528, + "grad_norm": 0.004301557317376137, + "learning_rate": 2.391150625630589e-05, + "loss": 0.2941230058670044, + "step": 177470 + }, + { + "epoch": 0.7619587336750727, + "grad_norm": 1.5329325199127197, + "learning_rate": 2.390719453618827e-05, + "loss": 0.2828094244003296, + "step": 177480 + }, + { + "epoch": 0.7620016657650928, + "grad_norm": 2.2881085872650146, + "learning_rate": 2.3902882816070642e-05, + "loss": 0.40318880081176756, + "step": 177490 + }, + { + "epoch": 0.7620445978551128, + "grad_norm": 5.213508129119873, + "learning_rate": 2.389857109595302e-05, + "loss": 0.14141581058502198, + "step": 177500 + }, + { + "epoch": 0.7620875299451327, + "grad_norm": 0.006306284107267857, + "learning_rate": 2.3894259375835397e-05, + "loss": 0.08049569725990295, + "step": 177510 + }, + { + "epoch": 0.7621304620351528, + "grad_norm": 0.22153890132904053, + "learning_rate": 2.3889947655717774e-05, + "loss": 0.12354075908660889, + "step": 177520 + }, + { + "epoch": 0.7621733941251728, + "grad_norm": 0.04059620201587677, + "learning_rate": 2.3885635935600152e-05, + "loss": 0.0005926693323999643, + "step": 177530 + }, + { + "epoch": 0.7622163262151929, + "grad_norm": 0.004459694027900696, + "learning_rate": 2.3881324215482526e-05, + "loss": 0.2787043809890747, + "step": 177540 + }, + { + "epoch": 0.7622592583052128, + "grad_norm": 0.11074524372816086, + "learning_rate": 2.3877012495364903e-05, + "loss": 0.25904710292816163, + "step": 177550 + }, + { + "epoch": 0.7623021903952328, + "grad_norm": 0.013455307111144066, + "learning_rate": 2.3872700775247277e-05, + "loss": 0.3018791675567627, + "step": 177560 + }, + { + "epoch": 0.7623451224852529, + "grad_norm": 1.0258920192718506, + "learning_rate": 2.3868389055129654e-05, + "loss": 0.06581991314888, + "step": 177570 + }, + { + "epoch": 0.7623880545752728, + "grad_norm": 1.115838646888733, + "learning_rate": 2.386407733501203e-05, + "loss": 0.2682212829589844, + "step": 177580 + }, + { + "epoch": 0.7624309866652929, + "grad_norm": 0.015212813392281532, + "learning_rate": 2.385976561489441e-05, + "loss": 0.2845586061477661, + "step": 177590 + }, + { + "epoch": 0.7624739187553129, + "grad_norm": 0.15196259319782257, + "learning_rate": 2.3855453894776783e-05, + "loss": 0.2876842498779297, + "step": 177600 + }, + { + "epoch": 0.7625168508453328, + "grad_norm": 0.07368028908967972, + "learning_rate": 2.385114217465916e-05, + "loss": 0.10164880752563477, + "step": 177610 + }, + { + "epoch": 0.7625597829353529, + "grad_norm": 0.0007988035795278847, + "learning_rate": 2.3846830454541534e-05, + "loss": 0.09450948238372803, + "step": 177620 + }, + { + "epoch": 0.7626027150253729, + "grad_norm": 0.010584760457277298, + "learning_rate": 2.384251873442391e-05, + "loss": 0.3373741626739502, + "step": 177630 + }, + { + "epoch": 0.7626456471153928, + "grad_norm": 0.02437574975192547, + "learning_rate": 2.383820701430629e-05, + "loss": 0.23039453029632567, + "step": 177640 + }, + { + "epoch": 0.7626885792054129, + "grad_norm": 0.001247620559297502, + "learning_rate": 2.3833895294188666e-05, + "loss": 0.27570860385894774, + "step": 177650 + }, + { + "epoch": 0.7627315112954329, + "grad_norm": 2.2336456775665283, + "learning_rate": 2.382958357407104e-05, + "loss": 0.1201433539390564, + "step": 177660 + }, + { + "epoch": 0.7627744433854529, + "grad_norm": 0.146112859249115, + "learning_rate": 2.3825271853953418e-05, + "loss": 0.21582226753234862, + "step": 177670 + }, + { + "epoch": 0.7628173754754729, + "grad_norm": 14.193013191223145, + "learning_rate": 2.382096013383579e-05, + "loss": 0.397243595123291, + "step": 177680 + }, + { + "epoch": 0.762860307565493, + "grad_norm": 0.22348107397556305, + "learning_rate": 2.3816648413718172e-05, + "loss": 0.21426901817321778, + "step": 177690 + }, + { + "epoch": 0.7629032396555129, + "grad_norm": 4.388669967651367, + "learning_rate": 2.3812336693600546e-05, + "loss": 0.3503814697265625, + "step": 177700 + }, + { + "epoch": 0.7629461717455329, + "grad_norm": 0.05588550865650177, + "learning_rate": 2.3808024973482924e-05, + "loss": 0.11665399074554443, + "step": 177710 + }, + { + "epoch": 0.762989103835553, + "grad_norm": 0.00433525163680315, + "learning_rate": 2.3803713253365298e-05, + "loss": 0.2739673137664795, + "step": 177720 + }, + { + "epoch": 0.7630320359255729, + "grad_norm": 0.035902559757232666, + "learning_rate": 2.3799401533247675e-05, + "loss": 0.300280237197876, + "step": 177730 + }, + { + "epoch": 0.7630749680155929, + "grad_norm": 0.34329330921173096, + "learning_rate": 2.379508981313005e-05, + "loss": 0.3793229579925537, + "step": 177740 + }, + { + "epoch": 0.763117900105613, + "grad_norm": 0.04771098494529724, + "learning_rate": 2.379077809301243e-05, + "loss": 0.15737433433532716, + "step": 177750 + }, + { + "epoch": 0.7631608321956329, + "grad_norm": 0.0009794149082154036, + "learning_rate": 2.3786466372894804e-05, + "loss": 0.11524491310119629, + "step": 177760 + }, + { + "epoch": 0.763203764285653, + "grad_norm": 0.09087666869163513, + "learning_rate": 2.378215465277718e-05, + "loss": 0.21049742698669432, + "step": 177770 + }, + { + "epoch": 0.763246696375673, + "grad_norm": 0.0185843575745821, + "learning_rate": 2.3777842932659555e-05, + "loss": 0.24470274448394774, + "step": 177780 + }, + { + "epoch": 0.7632896284656929, + "grad_norm": 0.0009324979619123042, + "learning_rate": 2.3773531212541932e-05, + "loss": 0.08331471681594849, + "step": 177790 + }, + { + "epoch": 0.763332560555713, + "grad_norm": 0.2116546779870987, + "learning_rate": 2.3769219492424306e-05, + "loss": 0.37836732864379885, + "step": 177800 + }, + { + "epoch": 0.763375492645733, + "grad_norm": 0.2698652446269989, + "learning_rate": 2.3764907772306687e-05, + "loss": 0.2392141342163086, + "step": 177810 + }, + { + "epoch": 0.7634184247357529, + "grad_norm": 0.10046686977148056, + "learning_rate": 2.376059605218906e-05, + "loss": 0.02579231858253479, + "step": 177820 + }, + { + "epoch": 0.763461356825773, + "grad_norm": 0.1942238211631775, + "learning_rate": 2.3756284332071438e-05, + "loss": 0.31575570106506345, + "step": 177830 + }, + { + "epoch": 0.763504288915793, + "grad_norm": 0.0024036553222686052, + "learning_rate": 2.3751972611953812e-05, + "loss": 0.25040130615234374, + "step": 177840 + }, + { + "epoch": 0.763547221005813, + "grad_norm": 8.009116172790527, + "learning_rate": 2.374766089183619e-05, + "loss": 0.21138277053833007, + "step": 177850 + }, + { + "epoch": 0.763590153095833, + "grad_norm": 1.8110581636428833, + "learning_rate": 2.3743349171718567e-05, + "loss": 0.3237518310546875, + "step": 177860 + }, + { + "epoch": 0.763633085185853, + "grad_norm": 1.874503254890442, + "learning_rate": 2.3739037451600944e-05, + "loss": 0.2857369422912598, + "step": 177870 + }, + { + "epoch": 0.763676017275873, + "grad_norm": 1.8950402736663818, + "learning_rate": 2.373472573148332e-05, + "loss": 0.14816755056381226, + "step": 177880 + }, + { + "epoch": 0.763718949365893, + "grad_norm": 1.536193609237671, + "learning_rate": 2.3730414011365695e-05, + "loss": 0.3412646293640137, + "step": 177890 + }, + { + "epoch": 0.7637618814559131, + "grad_norm": 0.028296170756220818, + "learning_rate": 2.3726102291248073e-05, + "loss": 0.07663151621818542, + "step": 177900 + }, + { + "epoch": 0.763804813545933, + "grad_norm": 0.1083562821149826, + "learning_rate": 2.3721790571130447e-05, + "loss": 0.4823110580444336, + "step": 177910 + }, + { + "epoch": 0.763847745635953, + "grad_norm": 0.038458842784166336, + "learning_rate": 2.3717478851012824e-05, + "loss": 0.2413325786590576, + "step": 177920 + }, + { + "epoch": 0.7638906777259731, + "grad_norm": 1.106160044670105, + "learning_rate": 2.37131671308952e-05, + "loss": 0.21324872970581055, + "step": 177930 + }, + { + "epoch": 0.763933609815993, + "grad_norm": 1.998532772064209, + "learning_rate": 2.370885541077758e-05, + "loss": 0.14935051202774047, + "step": 177940 + }, + { + "epoch": 0.763976541906013, + "grad_norm": 0.047563642263412476, + "learning_rate": 2.3704543690659953e-05, + "loss": 0.09624691009521484, + "step": 177950 + }, + { + "epoch": 0.7640194739960331, + "grad_norm": 0.014821537770330906, + "learning_rate": 2.370023197054233e-05, + "loss": 0.05763006806373596, + "step": 177960 + }, + { + "epoch": 0.7640624060860531, + "grad_norm": 2.6112759113311768, + "learning_rate": 2.3695920250424704e-05, + "loss": 0.23190555572509766, + "step": 177970 + }, + { + "epoch": 0.7641053381760731, + "grad_norm": 0.008518679067492485, + "learning_rate": 2.369160853030708e-05, + "loss": 0.29477295875549314, + "step": 177980 + }, + { + "epoch": 0.7641482702660931, + "grad_norm": 1.1182646751403809, + "learning_rate": 2.368729681018946e-05, + "loss": 0.19260014295578004, + "step": 177990 + }, + { + "epoch": 0.7641912023561132, + "grad_norm": 8.637264251708984, + "learning_rate": 2.3682985090071836e-05, + "loss": 0.22437336444854736, + "step": 178000 + }, + { + "epoch": 0.7641912023561132, + "eval_loss": 0.38519030809402466, + "eval_runtime": 27.4348, + "eval_samples_per_second": 3.645, + "eval_steps_per_second": 3.645, + "step": 178000 + }, + { + "epoch": 0.7642341344461331, + "grad_norm": 0.12673498690128326, + "learning_rate": 2.367867336995421e-05, + "loss": 0.05941138863563537, + "step": 178010 + }, + { + "epoch": 0.7642770665361531, + "grad_norm": 0.019599629566073418, + "learning_rate": 2.3674361649836587e-05, + "loss": 0.048000562191009524, + "step": 178020 + }, + { + "epoch": 0.7643199986261732, + "grad_norm": 1.9546705484390259, + "learning_rate": 2.367004992971896e-05, + "loss": 0.22433915138244628, + "step": 178030 + }, + { + "epoch": 0.7643629307161931, + "grad_norm": 1.5347076654434204, + "learning_rate": 2.366573820960134e-05, + "loss": 0.28104162216186523, + "step": 178040 + }, + { + "epoch": 0.7644058628062131, + "grad_norm": 0.35148516297340393, + "learning_rate": 2.3661426489483716e-05, + "loss": 0.35195064544677734, + "step": 178050 + }, + { + "epoch": 0.7644487948962332, + "grad_norm": 0.2593337297439575, + "learning_rate": 2.3657114769366093e-05, + "loss": 0.15546387434005737, + "step": 178060 + }, + { + "epoch": 0.7644917269862531, + "grad_norm": 1.2594435214996338, + "learning_rate": 2.3652803049248467e-05, + "loss": 0.2298504114151001, + "step": 178070 + }, + { + "epoch": 0.7645346590762732, + "grad_norm": 0.08988411724567413, + "learning_rate": 2.3648491329130845e-05, + "loss": 0.4057119369506836, + "step": 178080 + }, + { + "epoch": 0.7645775911662932, + "grad_norm": 1.5320706367492676, + "learning_rate": 2.364417960901322e-05, + "loss": 0.31486694812774657, + "step": 178090 + }, + { + "epoch": 0.7646205232563131, + "grad_norm": 0.11509314924478531, + "learning_rate": 2.36398678888956e-05, + "loss": 0.35809285640716554, + "step": 178100 + }, + { + "epoch": 0.7646634553463332, + "grad_norm": 0.002720639342442155, + "learning_rate": 2.3635556168777973e-05, + "loss": 0.14641090631484985, + "step": 178110 + }, + { + "epoch": 0.7647063874363532, + "grad_norm": 0.6865724325180054, + "learning_rate": 2.363124444866035e-05, + "loss": 0.23801584243774415, + "step": 178120 + }, + { + "epoch": 0.7647493195263732, + "grad_norm": 0.005314926616847515, + "learning_rate": 2.3626932728542725e-05, + "loss": 0.19524000883102416, + "step": 178130 + }, + { + "epoch": 0.7647922516163932, + "grad_norm": 0.13690844178199768, + "learning_rate": 2.3622621008425102e-05, + "loss": 0.19549834728240967, + "step": 178140 + }, + { + "epoch": 0.7648351837064132, + "grad_norm": 0.011456413194537163, + "learning_rate": 2.3618309288307476e-05, + "loss": 0.312131667137146, + "step": 178150 + }, + { + "epoch": 0.7648781157964332, + "grad_norm": 0.0010632037883624434, + "learning_rate": 2.3613997568189857e-05, + "loss": 0.014251169562339783, + "step": 178160 + }, + { + "epoch": 0.7649210478864532, + "grad_norm": 0.006948573049157858, + "learning_rate": 2.360968584807223e-05, + "loss": 0.26813473701477053, + "step": 178170 + }, + { + "epoch": 0.7649639799764733, + "grad_norm": 0.06771288067102432, + "learning_rate": 2.3605374127954608e-05, + "loss": 0.15589020252227784, + "step": 178180 + }, + { + "epoch": 0.7650069120664932, + "grad_norm": 0.03058125637471676, + "learning_rate": 2.3601062407836982e-05, + "loss": 0.13075129985809325, + "step": 178190 + }, + { + "epoch": 0.7650498441565132, + "grad_norm": 0.023907043039798737, + "learning_rate": 2.359675068771936e-05, + "loss": 0.02735612988471985, + "step": 178200 + }, + { + "epoch": 0.7650927762465333, + "grad_norm": 0.032358940690755844, + "learning_rate": 2.3592438967601736e-05, + "loss": 0.1290936827659607, + "step": 178210 + }, + { + "epoch": 0.7651357083365532, + "grad_norm": 1.0458128452301025, + "learning_rate": 2.3588127247484114e-05, + "loss": 0.19473346471786498, + "step": 178220 + }, + { + "epoch": 0.7651786404265732, + "grad_norm": 0.03492206335067749, + "learning_rate": 2.3583815527366488e-05, + "loss": 0.27468626499176024, + "step": 178230 + }, + { + "epoch": 0.7652215725165933, + "grad_norm": 0.02318352460861206, + "learning_rate": 2.3579503807248865e-05, + "loss": 0.18723065853118898, + "step": 178240 + }, + { + "epoch": 0.7652645046066132, + "grad_norm": 5.709285259246826, + "learning_rate": 2.3575192087131242e-05, + "loss": 0.11720331907272338, + "step": 178250 + }, + { + "epoch": 0.7653074366966333, + "grad_norm": 1.5659072399139404, + "learning_rate": 2.3570880367013616e-05, + "loss": 0.16660025119781494, + "step": 178260 + }, + { + "epoch": 0.7653503687866533, + "grad_norm": 0.004724172875285149, + "learning_rate": 2.3566568646895994e-05, + "loss": 0.14990419149398804, + "step": 178270 + }, + { + "epoch": 0.7653933008766732, + "grad_norm": 0.00437846640124917, + "learning_rate": 2.356225692677837e-05, + "loss": 0.32151336669921876, + "step": 178280 + }, + { + "epoch": 0.7654362329666933, + "grad_norm": 0.3685600459575653, + "learning_rate": 2.355794520666075e-05, + "loss": 0.09072909355163575, + "step": 178290 + }, + { + "epoch": 0.7654791650567133, + "grad_norm": 0.0566171258687973, + "learning_rate": 2.3553633486543122e-05, + "loss": 0.23811821937561034, + "step": 178300 + }, + { + "epoch": 0.7655220971467332, + "grad_norm": 3.1550590991973877, + "learning_rate": 2.35493217664255e-05, + "loss": 0.3866184949874878, + "step": 178310 + }, + { + "epoch": 0.7655650292367533, + "grad_norm": 0.04981329292058945, + "learning_rate": 2.3545010046307874e-05, + "loss": 0.24460201263427733, + "step": 178320 + }, + { + "epoch": 0.7656079613267733, + "grad_norm": 0.008631882257759571, + "learning_rate": 2.354069832619025e-05, + "loss": 0.14547290802001953, + "step": 178330 + }, + { + "epoch": 0.7656508934167933, + "grad_norm": 0.17233169078826904, + "learning_rate": 2.353638660607263e-05, + "loss": 0.12638088464736938, + "step": 178340 + }, + { + "epoch": 0.7656938255068133, + "grad_norm": 0.03211379051208496, + "learning_rate": 2.3532074885955006e-05, + "loss": 0.2023622751235962, + "step": 178350 + }, + { + "epoch": 0.7657367575968334, + "grad_norm": 0.06673210114240646, + "learning_rate": 2.352776316583738e-05, + "loss": 0.2539444208145142, + "step": 178360 + }, + { + "epoch": 0.7657796896868533, + "grad_norm": 2.9048259258270264, + "learning_rate": 2.3523451445719757e-05, + "loss": 0.1878517746925354, + "step": 178370 + }, + { + "epoch": 0.7658226217768733, + "grad_norm": 0.19632039964199066, + "learning_rate": 2.351913972560213e-05, + "loss": 0.38019933700561526, + "step": 178380 + }, + { + "epoch": 0.7658655538668934, + "grad_norm": 0.12534579634666443, + "learning_rate": 2.351482800548451e-05, + "loss": 0.39364616870880126, + "step": 178390 + }, + { + "epoch": 0.7659084859569134, + "grad_norm": 0.040436238050460815, + "learning_rate": 2.3510516285366886e-05, + "loss": 0.1989324927330017, + "step": 178400 + }, + { + "epoch": 0.7659514180469333, + "grad_norm": 0.3849923312664032, + "learning_rate": 2.3506204565249263e-05, + "loss": 0.229500150680542, + "step": 178410 + }, + { + "epoch": 0.7659943501369534, + "grad_norm": 0.6619948148727417, + "learning_rate": 2.3501892845131637e-05, + "loss": 0.15900182723999023, + "step": 178420 + }, + { + "epoch": 0.7660372822269734, + "grad_norm": 1.1308156251907349, + "learning_rate": 2.3497581125014014e-05, + "loss": 0.2050173759460449, + "step": 178430 + }, + { + "epoch": 0.7660802143169934, + "grad_norm": 0.0016266998136416078, + "learning_rate": 2.3493269404896388e-05, + "loss": 0.2516968250274658, + "step": 178440 + }, + { + "epoch": 0.7661231464070134, + "grad_norm": 0.3307945430278778, + "learning_rate": 2.348895768477877e-05, + "loss": 0.19992436170578004, + "step": 178450 + }, + { + "epoch": 0.7661660784970334, + "grad_norm": 1.8131606578826904, + "learning_rate": 2.3484645964661143e-05, + "loss": 0.23348846435546874, + "step": 178460 + }, + { + "epoch": 0.7662090105870534, + "grad_norm": 2.5765650272369385, + "learning_rate": 2.348033424454352e-05, + "loss": 0.14857852458953857, + "step": 178470 + }, + { + "epoch": 0.7662519426770734, + "grad_norm": 0.001548268715851009, + "learning_rate": 2.3476022524425894e-05, + "loss": 0.32491796016693114, + "step": 178480 + }, + { + "epoch": 0.7662948747670935, + "grad_norm": 0.004323096945881844, + "learning_rate": 2.347171080430827e-05, + "loss": 0.07983020544052125, + "step": 178490 + }, + { + "epoch": 0.7663378068571134, + "grad_norm": 0.33664846420288086, + "learning_rate": 2.3467399084190646e-05, + "loss": 0.2000946044921875, + "step": 178500 + }, + { + "epoch": 0.7663807389471334, + "grad_norm": 3.2330658435821533, + "learning_rate": 2.3463087364073026e-05, + "loss": 0.30185685157775877, + "step": 178510 + }, + { + "epoch": 0.7664236710371535, + "grad_norm": 0.001679239678196609, + "learning_rate": 2.34587756439554e-05, + "loss": 0.11961598396301269, + "step": 178520 + }, + { + "epoch": 0.7664666031271734, + "grad_norm": 0.001713413163088262, + "learning_rate": 2.3454463923837778e-05, + "loss": 0.26211502552032473, + "step": 178530 + }, + { + "epoch": 0.7665095352171934, + "grad_norm": 1.5684071779251099, + "learning_rate": 2.345015220372015e-05, + "loss": 0.1904462218284607, + "step": 178540 + }, + { + "epoch": 0.7665524673072135, + "grad_norm": 0.0025774992536753416, + "learning_rate": 2.344584048360253e-05, + "loss": 0.19834427833557128, + "step": 178550 + }, + { + "epoch": 0.7665953993972334, + "grad_norm": 0.013022633269429207, + "learning_rate": 2.3441528763484906e-05, + "loss": 0.11189095973968506, + "step": 178560 + }, + { + "epoch": 0.7666383314872535, + "grad_norm": 0.1410759687423706, + "learning_rate": 2.3437217043367284e-05, + "loss": 0.07068445682525634, + "step": 178570 + }, + { + "epoch": 0.7666812635772735, + "grad_norm": 0.12566423416137695, + "learning_rate": 2.3432905323249658e-05, + "loss": 0.1137201189994812, + "step": 178580 + }, + { + "epoch": 0.7667241956672934, + "grad_norm": 0.004756872076541185, + "learning_rate": 2.3428593603132035e-05, + "loss": 0.1246252179145813, + "step": 178590 + }, + { + "epoch": 0.7667671277573135, + "grad_norm": 2.4529027938842773, + "learning_rate": 2.342428188301441e-05, + "loss": 0.11557846069335938, + "step": 178600 + }, + { + "epoch": 0.7668100598473335, + "grad_norm": 0.004438826348632574, + "learning_rate": 2.3419970162896786e-05, + "loss": 0.25914185047149657, + "step": 178610 + }, + { + "epoch": 0.7668529919373535, + "grad_norm": 4.537032127380371, + "learning_rate": 2.3415658442779164e-05, + "loss": 0.2596637010574341, + "step": 178620 + }, + { + "epoch": 0.7668959240273735, + "grad_norm": 0.0016977523919194937, + "learning_rate": 2.341134672266154e-05, + "loss": 0.11433390378952027, + "step": 178630 + }, + { + "epoch": 0.7669388561173935, + "grad_norm": 0.6404827237129211, + "learning_rate": 2.3407035002543918e-05, + "loss": 0.3891470193862915, + "step": 178640 + }, + { + "epoch": 0.7669817882074135, + "grad_norm": 1.4652893543243408, + "learning_rate": 2.3402723282426292e-05, + "loss": 0.24137496948242188, + "step": 178650 + }, + { + "epoch": 0.7670247202974335, + "grad_norm": 0.006902490276843309, + "learning_rate": 2.339841156230867e-05, + "loss": 0.2944831848144531, + "step": 178660 + }, + { + "epoch": 0.7670676523874536, + "grad_norm": 2.101875066757202, + "learning_rate": 2.3394099842191043e-05, + "loss": 0.1436405062675476, + "step": 178670 + }, + { + "epoch": 0.7671105844774735, + "grad_norm": 0.07013654708862305, + "learning_rate": 2.338978812207342e-05, + "loss": 0.13763973712921143, + "step": 178680 + }, + { + "epoch": 0.7671535165674935, + "grad_norm": 0.21145546436309814, + "learning_rate": 2.3385476401955798e-05, + "loss": 0.12384222745895386, + "step": 178690 + }, + { + "epoch": 0.7671964486575136, + "grad_norm": 1.3103197813034058, + "learning_rate": 2.3381164681838175e-05, + "loss": 0.16632239818572997, + "step": 178700 + }, + { + "epoch": 0.7672393807475335, + "grad_norm": 0.02879696898162365, + "learning_rate": 2.337685296172055e-05, + "loss": 0.20476338863372803, + "step": 178710 + }, + { + "epoch": 0.7672823128375535, + "grad_norm": 1.125725269317627, + "learning_rate": 2.3372541241602927e-05, + "loss": 0.3404285192489624, + "step": 178720 + }, + { + "epoch": 0.7673252449275736, + "grad_norm": 0.10981670767068863, + "learning_rate": 2.33682295214853e-05, + "loss": 0.01966983377933502, + "step": 178730 + }, + { + "epoch": 0.7673681770175935, + "grad_norm": 0.002532815095037222, + "learning_rate": 2.3363917801367678e-05, + "loss": 0.1262107491493225, + "step": 178740 + }, + { + "epoch": 0.7674111091076136, + "grad_norm": 1.2392613887786865, + "learning_rate": 2.3359606081250055e-05, + "loss": 0.1985929489135742, + "step": 178750 + }, + { + "epoch": 0.7674540411976336, + "grad_norm": 1.4831242561340332, + "learning_rate": 2.3355294361132433e-05, + "loss": 0.07167594432830811, + "step": 178760 + }, + { + "epoch": 0.7674969732876535, + "grad_norm": 0.013040987774729729, + "learning_rate": 2.3350982641014807e-05, + "loss": 0.016568221151828766, + "step": 178770 + }, + { + "epoch": 0.7675399053776736, + "grad_norm": 0.09118735790252686, + "learning_rate": 2.3346670920897184e-05, + "loss": 0.203193998336792, + "step": 178780 + }, + { + "epoch": 0.7675828374676936, + "grad_norm": 2.505168914794922, + "learning_rate": 2.3342359200779558e-05, + "loss": 0.41781888008117674, + "step": 178790 + }, + { + "epoch": 0.7676257695577136, + "grad_norm": 0.9963707327842712, + "learning_rate": 2.3338047480661935e-05, + "loss": 0.2993806838989258, + "step": 178800 + }, + { + "epoch": 0.7676687016477336, + "grad_norm": 0.0019730194471776485, + "learning_rate": 2.3333735760544313e-05, + "loss": 0.2103712558746338, + "step": 178810 + }, + { + "epoch": 0.7677116337377536, + "grad_norm": 0.0005218818550929427, + "learning_rate": 2.332942404042669e-05, + "loss": 0.1815126657485962, + "step": 178820 + }, + { + "epoch": 0.7677545658277737, + "grad_norm": 12.162750244140625, + "learning_rate": 2.3325112320309064e-05, + "loss": 0.10047402381896972, + "step": 178830 + }, + { + "epoch": 0.7677974979177936, + "grad_norm": 1.1582043170928955, + "learning_rate": 2.332080060019144e-05, + "loss": 0.12247195243835449, + "step": 178840 + }, + { + "epoch": 0.7678404300078137, + "grad_norm": 0.007292480207979679, + "learning_rate": 2.3316488880073815e-05, + "loss": 0.1326340913772583, + "step": 178850 + }, + { + "epoch": 0.7678833620978337, + "grad_norm": 0.11198096722364426, + "learning_rate": 2.3312177159956196e-05, + "loss": 0.14960025548934935, + "step": 178860 + }, + { + "epoch": 0.7679262941878536, + "grad_norm": 0.0018669597338885069, + "learning_rate": 2.330786543983857e-05, + "loss": 0.08662062883377075, + "step": 178870 + }, + { + "epoch": 0.7679692262778737, + "grad_norm": 0.0002546081959735602, + "learning_rate": 2.3303553719720947e-05, + "loss": 0.16194459199905395, + "step": 178880 + }, + { + "epoch": 0.7680121583678937, + "grad_norm": 0.004007402341812849, + "learning_rate": 2.329924199960332e-05, + "loss": 0.20729336738586426, + "step": 178890 + }, + { + "epoch": 0.7680550904579136, + "grad_norm": 0.017977435141801834, + "learning_rate": 2.32949302794857e-05, + "loss": 0.1353507161140442, + "step": 178900 + }, + { + "epoch": 0.7680980225479337, + "grad_norm": 0.009154192171990871, + "learning_rate": 2.3290618559368073e-05, + "loss": 0.17526334524154663, + "step": 178910 + }, + { + "epoch": 0.7681409546379537, + "grad_norm": 0.003638830967247486, + "learning_rate": 2.3286306839250453e-05, + "loss": 0.05437243580818176, + "step": 178920 + }, + { + "epoch": 0.7681838867279737, + "grad_norm": 0.0003096268919762224, + "learning_rate": 2.3281995119132827e-05, + "loss": 0.12296555042266846, + "step": 178930 + }, + { + "epoch": 0.7682268188179937, + "grad_norm": 0.004455664660781622, + "learning_rate": 2.3277683399015205e-05, + "loss": 0.03397051095962524, + "step": 178940 + }, + { + "epoch": 0.7682697509080137, + "grad_norm": 0.08765816688537598, + "learning_rate": 2.327337167889758e-05, + "loss": 0.17958006858825684, + "step": 178950 + }, + { + "epoch": 0.7683126829980337, + "grad_norm": 0.005225532688200474, + "learning_rate": 2.3269059958779956e-05, + "loss": 0.43357043266296386, + "step": 178960 + }, + { + "epoch": 0.7683556150880537, + "grad_norm": 10.482624053955078, + "learning_rate": 2.3264748238662333e-05, + "loss": 0.30485365390777586, + "step": 178970 + }, + { + "epoch": 0.7683985471780738, + "grad_norm": 0.008869537152349949, + "learning_rate": 2.326043651854471e-05, + "loss": 0.06108769774436951, + "step": 178980 + }, + { + "epoch": 0.7684414792680937, + "grad_norm": 0.00020321154443081468, + "learning_rate": 2.3256124798427088e-05, + "loss": 0.2938662052154541, + "step": 178990 + }, + { + "epoch": 0.7684844113581137, + "grad_norm": 0.019519424065947533, + "learning_rate": 2.3251813078309462e-05, + "loss": 0.08139798045158386, + "step": 179000 + }, + { + "epoch": 0.7684844113581137, + "eval_loss": 0.3833867907524109, + "eval_runtime": 27.4034, + "eval_samples_per_second": 3.649, + "eval_steps_per_second": 3.649, + "step": 179000 + }, + { + "epoch": 0.7685273434481338, + "grad_norm": 0.006098241079598665, + "learning_rate": 2.324750135819184e-05, + "loss": 0.1604252576828003, + "step": 179010 + }, + { + "epoch": 0.7685702755381537, + "grad_norm": 0.01255844533443451, + "learning_rate": 2.3243189638074213e-05, + "loss": 0.07683556675910949, + "step": 179020 + }, + { + "epoch": 0.7686132076281738, + "grad_norm": 0.09523410350084305, + "learning_rate": 2.323887791795659e-05, + "loss": 0.24342586994171142, + "step": 179030 + }, + { + "epoch": 0.7686561397181938, + "grad_norm": 3.1891558170318604, + "learning_rate": 2.3234566197838968e-05, + "loss": 0.3374955177307129, + "step": 179040 + }, + { + "epoch": 0.7686990718082137, + "grad_norm": 4.197160720825195, + "learning_rate": 2.3230254477721345e-05, + "loss": 0.34609458446502683, + "step": 179050 + }, + { + "epoch": 0.7687420038982338, + "grad_norm": 0.15234249830245972, + "learning_rate": 2.322594275760372e-05, + "loss": 0.25315892696380615, + "step": 179060 + }, + { + "epoch": 0.7687849359882538, + "grad_norm": 0.11703097075223923, + "learning_rate": 2.3221631037486097e-05, + "loss": 0.021952293813228607, + "step": 179070 + }, + { + "epoch": 0.7688278680782737, + "grad_norm": 0.0015012217918410897, + "learning_rate": 2.321731931736847e-05, + "loss": 0.08591393828392029, + "step": 179080 + }, + { + "epoch": 0.7688708001682938, + "grad_norm": 0.0504002645611763, + "learning_rate": 2.3213007597250848e-05, + "loss": 0.20234971046447753, + "step": 179090 + }, + { + "epoch": 0.7689137322583138, + "grad_norm": 1.278030276298523, + "learning_rate": 2.3208695877133225e-05, + "loss": 0.30761592388153075, + "step": 179100 + }, + { + "epoch": 0.7689566643483338, + "grad_norm": 0.025709327310323715, + "learning_rate": 2.3204384157015602e-05, + "loss": 0.1279531955718994, + "step": 179110 + }, + { + "epoch": 0.7689995964383538, + "grad_norm": 2.917078971862793, + "learning_rate": 2.3200072436897976e-05, + "loss": 0.4120755195617676, + "step": 179120 + }, + { + "epoch": 0.7690425285283738, + "grad_norm": 0.0006804691511206329, + "learning_rate": 2.3195760716780354e-05, + "loss": 0.09456906318664551, + "step": 179130 + }, + { + "epoch": 0.7690854606183938, + "grad_norm": 0.006927388720214367, + "learning_rate": 2.3191448996662728e-05, + "loss": 0.31485681533813475, + "step": 179140 + }, + { + "epoch": 0.7691283927084138, + "grad_norm": 8.310165405273438, + "learning_rate": 2.3187137276545105e-05, + "loss": 0.40290584564208987, + "step": 179150 + }, + { + "epoch": 0.7691713247984339, + "grad_norm": 1.0132633447647095, + "learning_rate": 2.3182825556427482e-05, + "loss": 0.326165771484375, + "step": 179160 + }, + { + "epoch": 0.7692142568884538, + "grad_norm": 0.004784280899912119, + "learning_rate": 2.317851383630986e-05, + "loss": 0.3097829341888428, + "step": 179170 + }, + { + "epoch": 0.7692571889784738, + "grad_norm": 0.0012556664878502488, + "learning_rate": 2.3174202116192234e-05, + "loss": 0.118812096118927, + "step": 179180 + }, + { + "epoch": 0.7693001210684939, + "grad_norm": 2.7564048767089844, + "learning_rate": 2.316989039607461e-05, + "loss": 0.06245466470718384, + "step": 179190 + }, + { + "epoch": 0.7693430531585138, + "grad_norm": 2.507434129714966, + "learning_rate": 2.3165578675956985e-05, + "loss": 0.1164400339126587, + "step": 179200 + }, + { + "epoch": 0.7693859852485339, + "grad_norm": 0.6105691194534302, + "learning_rate": 2.3161266955839366e-05, + "loss": 0.20296921730041503, + "step": 179210 + }, + { + "epoch": 0.7694289173385539, + "grad_norm": 2.032741069793701, + "learning_rate": 2.315695523572174e-05, + "loss": 0.28797986507415774, + "step": 179220 + }, + { + "epoch": 0.7694718494285738, + "grad_norm": 0.03527984768152237, + "learning_rate": 2.3152643515604117e-05, + "loss": 0.10299174785614014, + "step": 179230 + }, + { + "epoch": 0.7695147815185939, + "grad_norm": 0.0007198863895609975, + "learning_rate": 2.314833179548649e-05, + "loss": 0.13453786373138427, + "step": 179240 + }, + { + "epoch": 0.7695577136086139, + "grad_norm": 0.7343387007713318, + "learning_rate": 2.314402007536887e-05, + "loss": 0.06165881752967835, + "step": 179250 + }, + { + "epoch": 0.769600645698634, + "grad_norm": 0.2602272629737854, + "learning_rate": 2.3139708355251242e-05, + "loss": 0.2444288730621338, + "step": 179260 + }, + { + "epoch": 0.7696435777886539, + "grad_norm": 3.470984935760498, + "learning_rate": 2.3135396635133623e-05, + "loss": 0.1751070499420166, + "step": 179270 + }, + { + "epoch": 0.7696865098786739, + "grad_norm": 0.012675684876739979, + "learning_rate": 2.3131084915015997e-05, + "loss": 0.12060407400131226, + "step": 179280 + }, + { + "epoch": 0.769729441968694, + "grad_norm": 8.048802375793457, + "learning_rate": 2.3126773194898374e-05, + "loss": 0.31142096519470214, + "step": 179290 + }, + { + "epoch": 0.7697723740587139, + "grad_norm": 0.052960868924856186, + "learning_rate": 2.3122461474780748e-05, + "loss": 0.11074914932250976, + "step": 179300 + }, + { + "epoch": 0.769815306148734, + "grad_norm": 0.0007027122192084789, + "learning_rate": 2.3118149754663126e-05, + "loss": 0.1947923183441162, + "step": 179310 + }, + { + "epoch": 0.769858238238754, + "grad_norm": 0.8784052729606628, + "learning_rate": 2.3113838034545503e-05, + "loss": 0.30508151054382326, + "step": 179320 + }, + { + "epoch": 0.7699011703287739, + "grad_norm": 0.06336329132318497, + "learning_rate": 2.310952631442788e-05, + "loss": 0.17809114456176758, + "step": 179330 + }, + { + "epoch": 0.769944102418794, + "grad_norm": 0.02893325686454773, + "learning_rate": 2.3105214594310258e-05, + "loss": 0.21597938537597655, + "step": 179340 + }, + { + "epoch": 0.769987034508814, + "grad_norm": 0.0019647746812552214, + "learning_rate": 2.310090287419263e-05, + "loss": 0.07605461478233337, + "step": 179350 + }, + { + "epoch": 0.7700299665988339, + "grad_norm": 0.0028181010857224464, + "learning_rate": 2.309659115407501e-05, + "loss": 0.17536162137985228, + "step": 179360 + }, + { + "epoch": 0.770072898688854, + "grad_norm": 1.347222924232483, + "learning_rate": 2.3092279433957383e-05, + "loss": 0.22230615615844726, + "step": 179370 + }, + { + "epoch": 0.770115830778874, + "grad_norm": 0.07306050509214401, + "learning_rate": 2.308796771383976e-05, + "loss": 0.31887292861938477, + "step": 179380 + }, + { + "epoch": 0.770158762868894, + "grad_norm": 2.1165270805358887, + "learning_rate": 2.3083655993722138e-05, + "loss": 0.34431474208831786, + "step": 179390 + }, + { + "epoch": 0.770201694958914, + "grad_norm": 0.15936022996902466, + "learning_rate": 2.3079344273604515e-05, + "loss": 0.1585230588912964, + "step": 179400 + }, + { + "epoch": 0.770244627048934, + "grad_norm": 1.2283518314361572, + "learning_rate": 2.307503255348689e-05, + "loss": 0.36916613578796387, + "step": 179410 + }, + { + "epoch": 0.770287559138954, + "grad_norm": 0.015880318358540535, + "learning_rate": 2.3070720833369266e-05, + "loss": 0.3850353717803955, + "step": 179420 + }, + { + "epoch": 0.770330491228974, + "grad_norm": 6.375476837158203, + "learning_rate": 2.306640911325164e-05, + "loss": 0.3537035703659058, + "step": 179430 + }, + { + "epoch": 0.770373423318994, + "grad_norm": 2.0374484062194824, + "learning_rate": 2.3062097393134018e-05, + "loss": 0.46723246574401855, + "step": 179440 + }, + { + "epoch": 0.770416355409014, + "grad_norm": 0.9383795857429504, + "learning_rate": 2.3057785673016395e-05, + "loss": 0.11651902198791504, + "step": 179450 + }, + { + "epoch": 0.770459287499034, + "grad_norm": 0.7634946703910828, + "learning_rate": 2.3053473952898772e-05, + "loss": 0.1451859712600708, + "step": 179460 + }, + { + "epoch": 0.7705022195890541, + "grad_norm": 6.315857887268066, + "learning_rate": 2.3049162232781146e-05, + "loss": 0.3340526342391968, + "step": 179470 + }, + { + "epoch": 0.770545151679074, + "grad_norm": 1.4309190511703491, + "learning_rate": 2.3044850512663524e-05, + "loss": 0.12667789459228515, + "step": 179480 + }, + { + "epoch": 0.770588083769094, + "grad_norm": 1.7753784656524658, + "learning_rate": 2.3040538792545897e-05, + "loss": 0.3001526355743408, + "step": 179490 + }, + { + "epoch": 0.7706310158591141, + "grad_norm": 1.0547752380371094, + "learning_rate": 2.3036227072428275e-05, + "loss": 0.3086581230163574, + "step": 179500 + }, + { + "epoch": 0.770673947949134, + "grad_norm": 3.6708216667175293, + "learning_rate": 2.3031915352310652e-05, + "loss": 0.4584041595458984, + "step": 179510 + }, + { + "epoch": 0.7707168800391541, + "grad_norm": 1.5989513397216797, + "learning_rate": 2.302760363219303e-05, + "loss": 0.2776602506637573, + "step": 179520 + }, + { + "epoch": 0.7707598121291741, + "grad_norm": 0.2675705850124359, + "learning_rate": 2.3023291912075403e-05, + "loss": 0.11531891822814941, + "step": 179530 + }, + { + "epoch": 0.770802744219194, + "grad_norm": 0.14942918717861176, + "learning_rate": 2.301898019195778e-05, + "loss": 0.13127764463424682, + "step": 179540 + }, + { + "epoch": 0.7708456763092141, + "grad_norm": 0.02816365659236908, + "learning_rate": 2.3014668471840155e-05, + "loss": 0.17667512893676757, + "step": 179550 + }, + { + "epoch": 0.7708886083992341, + "grad_norm": 0.006725494284182787, + "learning_rate": 2.3010356751722532e-05, + "loss": 0.3269664287567139, + "step": 179560 + }, + { + "epoch": 0.770931540489254, + "grad_norm": 0.016622474417090416, + "learning_rate": 2.300604503160491e-05, + "loss": 0.15728062391281128, + "step": 179570 + }, + { + "epoch": 0.7709744725792741, + "grad_norm": 20.81556510925293, + "learning_rate": 2.3001733311487287e-05, + "loss": 0.17218732833862305, + "step": 179580 + }, + { + "epoch": 0.7710174046692941, + "grad_norm": 0.0007839555619284511, + "learning_rate": 2.299742159136966e-05, + "loss": 0.01884029060602188, + "step": 179590 + }, + { + "epoch": 0.7710603367593141, + "grad_norm": 0.43381252884864807, + "learning_rate": 2.2993109871252038e-05, + "loss": 0.18695248365402223, + "step": 179600 + }, + { + "epoch": 0.7711032688493341, + "grad_norm": 5.024713039398193, + "learning_rate": 2.2988798151134412e-05, + "loss": 0.3955603361129761, + "step": 179610 + }, + { + "epoch": 0.7711462009393542, + "grad_norm": 1.1479946374893188, + "learning_rate": 2.2984486431016793e-05, + "loss": 0.27515287399291993, + "step": 179620 + }, + { + "epoch": 0.7711891330293741, + "grad_norm": 0.017682382836937904, + "learning_rate": 2.2980174710899167e-05, + "loss": 0.20232205390930175, + "step": 179630 + }, + { + "epoch": 0.7712320651193941, + "grad_norm": 0.0059524052776396275, + "learning_rate": 2.2975862990781544e-05, + "loss": 0.2792728185653687, + "step": 179640 + }, + { + "epoch": 0.7712749972094142, + "grad_norm": 0.006699263118207455, + "learning_rate": 2.2971551270663918e-05, + "loss": 0.16753827333450316, + "step": 179650 + }, + { + "epoch": 0.7713179292994341, + "grad_norm": 0.029053257778286934, + "learning_rate": 2.2967239550546295e-05, + "loss": 0.2607213258743286, + "step": 179660 + }, + { + "epoch": 0.7713608613894541, + "grad_norm": 1.745116114616394, + "learning_rate": 2.296292783042867e-05, + "loss": 0.11559852361679077, + "step": 179670 + }, + { + "epoch": 0.7714037934794742, + "grad_norm": 1.3347855806350708, + "learning_rate": 2.295861611031105e-05, + "loss": 0.35223388671875, + "step": 179680 + }, + { + "epoch": 0.7714467255694942, + "grad_norm": 2.656464099884033, + "learning_rate": 2.2954304390193424e-05, + "loss": 0.33100650310516355, + "step": 179690 + }, + { + "epoch": 0.7714896576595142, + "grad_norm": 7.4256391525268555, + "learning_rate": 2.29499926700758e-05, + "loss": 0.35502123832702637, + "step": 179700 + }, + { + "epoch": 0.7715325897495342, + "grad_norm": 0.007397874724119902, + "learning_rate": 2.294568094995818e-05, + "loss": 0.2101654291152954, + "step": 179710 + }, + { + "epoch": 0.7715755218395542, + "grad_norm": 0.0042721061035990715, + "learning_rate": 2.2941369229840553e-05, + "loss": 0.009166765213012695, + "step": 179720 + }, + { + "epoch": 0.7716184539295742, + "grad_norm": 0.07033202797174454, + "learning_rate": 2.293705750972293e-05, + "loss": 0.13368966579437255, + "step": 179730 + }, + { + "epoch": 0.7716613860195942, + "grad_norm": 1.5344808101654053, + "learning_rate": 2.2932745789605307e-05, + "loss": 0.3444932222366333, + "step": 179740 + }, + { + "epoch": 0.7717043181096143, + "grad_norm": 0.7983564138412476, + "learning_rate": 2.2928434069487685e-05, + "loss": 0.033365219831466675, + "step": 179750 + }, + { + "epoch": 0.7717472501996342, + "grad_norm": 3.2751638889312744, + "learning_rate": 2.292412234937006e-05, + "loss": 0.20990748405456544, + "step": 179760 + }, + { + "epoch": 0.7717901822896542, + "grad_norm": 1.1416313648223877, + "learning_rate": 2.2919810629252436e-05, + "loss": 0.15725255012512207, + "step": 179770 + }, + { + "epoch": 0.7718331143796743, + "grad_norm": 8.257407188415527, + "learning_rate": 2.291549890913481e-05, + "loss": 0.17753384113311768, + "step": 179780 + }, + { + "epoch": 0.7718760464696942, + "grad_norm": 0.5357545018196106, + "learning_rate": 2.2911187189017187e-05, + "loss": 0.14562582969665527, + "step": 179790 + }, + { + "epoch": 0.7719189785597143, + "grad_norm": 1.7779492139816284, + "learning_rate": 2.2906875468899565e-05, + "loss": 0.13873924016952516, + "step": 179800 + }, + { + "epoch": 0.7719619106497343, + "grad_norm": 1.9327869415283203, + "learning_rate": 2.2902563748781942e-05, + "loss": 0.37070517539978026, + "step": 179810 + }, + { + "epoch": 0.7720048427397542, + "grad_norm": 2.53825306892395, + "learning_rate": 2.2898252028664316e-05, + "loss": 0.36956005096435546, + "step": 179820 + }, + { + "epoch": 0.7720477748297743, + "grad_norm": 0.028061002492904663, + "learning_rate": 2.2893940308546693e-05, + "loss": 0.05154078602790833, + "step": 179830 + }, + { + "epoch": 0.7720907069197943, + "grad_norm": 0.0300615057349205, + "learning_rate": 2.2889628588429067e-05, + "loss": 0.19126038551330565, + "step": 179840 + }, + { + "epoch": 0.7721336390098142, + "grad_norm": 12.17878246307373, + "learning_rate": 2.2885316868311445e-05, + "loss": 0.2945890426635742, + "step": 179850 + }, + { + "epoch": 0.7721765710998343, + "grad_norm": 5.995707035064697, + "learning_rate": 2.2881005148193822e-05, + "loss": 0.09305886626243591, + "step": 179860 + }, + { + "epoch": 0.7722195031898543, + "grad_norm": 0.06677145510911942, + "learning_rate": 2.28766934280762e-05, + "loss": 0.3105530500411987, + "step": 179870 + }, + { + "epoch": 0.7722624352798743, + "grad_norm": 2.9172847270965576, + "learning_rate": 2.2872381707958573e-05, + "loss": 0.13742368221282958, + "step": 179880 + }, + { + "epoch": 0.7723053673698943, + "grad_norm": 0.015876207500696182, + "learning_rate": 2.286806998784095e-05, + "loss": 0.09454439282417297, + "step": 179890 + }, + { + "epoch": 0.7723482994599143, + "grad_norm": 0.019737502560019493, + "learning_rate": 2.2863758267723324e-05, + "loss": 0.29710509777069094, + "step": 179900 + }, + { + "epoch": 0.7723912315499343, + "grad_norm": 1.5522518157958984, + "learning_rate": 2.2859446547605702e-05, + "loss": 0.12715576887130736, + "step": 179910 + }, + { + "epoch": 0.7724341636399543, + "grad_norm": 2.2886972427368164, + "learning_rate": 2.285513482748808e-05, + "loss": 0.41290721893310545, + "step": 179920 + }, + { + "epoch": 0.7724770957299744, + "grad_norm": 0.04067216068506241, + "learning_rate": 2.2850823107370457e-05, + "loss": 0.11689453125, + "step": 179930 + }, + { + "epoch": 0.7725200278199943, + "grad_norm": 0.002634049393236637, + "learning_rate": 2.284651138725283e-05, + "loss": 0.25855841636657717, + "step": 179940 + }, + { + "epoch": 0.7725629599100143, + "grad_norm": 1.6041786670684814, + "learning_rate": 2.2842199667135208e-05, + "loss": 0.21327719688415528, + "step": 179950 + }, + { + "epoch": 0.7726058920000344, + "grad_norm": 0.007944965735077858, + "learning_rate": 2.2837887947017582e-05, + "loss": 0.22927529811859132, + "step": 179960 + }, + { + "epoch": 0.7726488240900543, + "grad_norm": 0.025026388466358185, + "learning_rate": 2.2833576226899963e-05, + "loss": 0.09137169122695923, + "step": 179970 + }, + { + "epoch": 0.7726917561800744, + "grad_norm": 0.0005246539367362857, + "learning_rate": 2.2829264506782336e-05, + "loss": 0.24379174709320067, + "step": 179980 + }, + { + "epoch": 0.7727346882700944, + "grad_norm": 0.014372066594660282, + "learning_rate": 2.2824952786664714e-05, + "loss": 0.286321234703064, + "step": 179990 + }, + { + "epoch": 0.7727776203601143, + "grad_norm": 0.008647691458463669, + "learning_rate": 2.2820641066547088e-05, + "loss": 0.13580285310745238, + "step": 180000 + }, + { + "epoch": 0.7727776203601143, + "eval_loss": 0.38957008719444275, + "eval_runtime": 27.4406, + "eval_samples_per_second": 3.644, + "eval_steps_per_second": 3.644, + "step": 180000 + }, + { + "epoch": 0.7728205524501344, + "grad_norm": 0.9259838461875916, + "learning_rate": 2.2816329346429465e-05, + "loss": 0.5427755832672119, + "step": 180010 + }, + { + "epoch": 0.7728634845401544, + "grad_norm": 41.22263717651367, + "learning_rate": 2.281201762631184e-05, + "loss": 0.22616727352142335, + "step": 180020 + }, + { + "epoch": 0.7729064166301743, + "grad_norm": 1.4104810953140259, + "learning_rate": 2.280770590619422e-05, + "loss": 0.31243743896484377, + "step": 180030 + }, + { + "epoch": 0.7729493487201944, + "grad_norm": 2.4222309589385986, + "learning_rate": 2.2803394186076594e-05, + "loss": 0.30273444652557374, + "step": 180040 + }, + { + "epoch": 0.7729922808102144, + "grad_norm": 0.28040480613708496, + "learning_rate": 2.279908246595897e-05, + "loss": 0.07433147430419922, + "step": 180050 + }, + { + "epoch": 0.7730352129002344, + "grad_norm": 2.304126739501953, + "learning_rate": 2.279477074584135e-05, + "loss": 0.17433713674545287, + "step": 180060 + }, + { + "epoch": 0.7730781449902544, + "grad_norm": 0.02427615225315094, + "learning_rate": 2.2790459025723722e-05, + "loss": 0.24402711391448975, + "step": 180070 + }, + { + "epoch": 0.7731210770802744, + "grad_norm": 0.0444793738424778, + "learning_rate": 2.27861473056061e-05, + "loss": 0.014486879110336304, + "step": 180080 + }, + { + "epoch": 0.7731640091702944, + "grad_norm": 0.5242799520492554, + "learning_rate": 2.2781835585488477e-05, + "loss": 0.18714048862457275, + "step": 180090 + }, + { + "epoch": 0.7732069412603144, + "grad_norm": 2.1512036323547363, + "learning_rate": 2.2777523865370854e-05, + "loss": 0.14754347801208495, + "step": 180100 + }, + { + "epoch": 0.7732498733503345, + "grad_norm": 0.0005996071267873049, + "learning_rate": 2.277321214525323e-05, + "loss": 0.17570135593414307, + "step": 180110 + }, + { + "epoch": 0.7732928054403545, + "grad_norm": 5.650803565979004, + "learning_rate": 2.2768900425135606e-05, + "loss": 0.2217705011367798, + "step": 180120 + }, + { + "epoch": 0.7733357375303744, + "grad_norm": 2.09537410736084, + "learning_rate": 2.276458870501798e-05, + "loss": 0.15717799663543702, + "step": 180130 + }, + { + "epoch": 0.7733786696203945, + "grad_norm": 0.002615907695144415, + "learning_rate": 2.2760276984900357e-05, + "loss": 0.21549594402313232, + "step": 180140 + }, + { + "epoch": 0.7734216017104145, + "grad_norm": 0.013990242965519428, + "learning_rate": 2.2755965264782734e-05, + "loss": 0.010225190967321395, + "step": 180150 + }, + { + "epoch": 0.7734645338004345, + "grad_norm": 0.33775243163108826, + "learning_rate": 2.2751653544665112e-05, + "loss": 0.34395158290863037, + "step": 180160 + }, + { + "epoch": 0.7735074658904545, + "grad_norm": 0.20428995788097382, + "learning_rate": 2.2747341824547486e-05, + "loss": 0.08059185743331909, + "step": 180170 + }, + { + "epoch": 0.7735503979804745, + "grad_norm": 2.344538450241089, + "learning_rate": 2.2743030104429863e-05, + "loss": 0.4059558391571045, + "step": 180180 + }, + { + "epoch": 0.7735933300704945, + "grad_norm": 2.440988540649414, + "learning_rate": 2.2738718384312237e-05, + "loss": 0.2932596206665039, + "step": 180190 + }, + { + "epoch": 0.7736362621605145, + "grad_norm": 0.0016101880464702845, + "learning_rate": 2.2734406664194614e-05, + "loss": 0.2458946466445923, + "step": 180200 + }, + { + "epoch": 0.7736791942505346, + "grad_norm": 0.2261928915977478, + "learning_rate": 2.273009494407699e-05, + "loss": 0.06624903678894042, + "step": 180210 + }, + { + "epoch": 0.7737221263405545, + "grad_norm": 0.02708575315773487, + "learning_rate": 2.272578322395937e-05, + "loss": 0.11959649324417114, + "step": 180220 + }, + { + "epoch": 0.7737650584305745, + "grad_norm": 11.702322959899902, + "learning_rate": 2.2721471503841743e-05, + "loss": 0.2761650323867798, + "step": 180230 + }, + { + "epoch": 0.7738079905205946, + "grad_norm": 0.006343924440443516, + "learning_rate": 2.271715978372412e-05, + "loss": 0.16064642667770385, + "step": 180240 + }, + { + "epoch": 0.7738509226106145, + "grad_norm": 0.14308874309062958, + "learning_rate": 2.2712848063606494e-05, + "loss": 0.23857133388519286, + "step": 180250 + }, + { + "epoch": 0.7738938547006345, + "grad_norm": 8.168606758117676, + "learning_rate": 2.270853634348887e-05, + "loss": 0.18918099403381347, + "step": 180260 + }, + { + "epoch": 0.7739367867906546, + "grad_norm": 0.03108370304107666, + "learning_rate": 2.270422462337125e-05, + "loss": 0.19499192237854004, + "step": 180270 + }, + { + "epoch": 0.7739797188806745, + "grad_norm": 1.625861644744873, + "learning_rate": 2.2699912903253626e-05, + "loss": 0.39890251159667967, + "step": 180280 + }, + { + "epoch": 0.7740226509706946, + "grad_norm": 0.5996838212013245, + "learning_rate": 2.2695601183136e-05, + "loss": 0.08134393692016602, + "step": 180290 + }, + { + "epoch": 0.7740655830607146, + "grad_norm": 2.1445157527923584, + "learning_rate": 2.2691289463018378e-05, + "loss": 0.22284488677978515, + "step": 180300 + }, + { + "epoch": 0.7741085151507345, + "grad_norm": 0.04387044161558151, + "learning_rate": 2.268697774290075e-05, + "loss": 0.25220816135406493, + "step": 180310 + }, + { + "epoch": 0.7741514472407546, + "grad_norm": 1.3502665758132935, + "learning_rate": 2.2682666022783132e-05, + "loss": 0.2518911600112915, + "step": 180320 + }, + { + "epoch": 0.7741943793307746, + "grad_norm": 2.0550084114074707, + "learning_rate": 2.2678354302665506e-05, + "loss": 0.1281597137451172, + "step": 180330 + }, + { + "epoch": 0.7742373114207945, + "grad_norm": 2.0173451900482178, + "learning_rate": 2.2674042582547884e-05, + "loss": 0.07103241682052612, + "step": 180340 + }, + { + "epoch": 0.7742802435108146, + "grad_norm": 1.8583507537841797, + "learning_rate": 2.2669730862430257e-05, + "loss": 0.2636585235595703, + "step": 180350 + }, + { + "epoch": 0.7743231756008346, + "grad_norm": 0.00575015926733613, + "learning_rate": 2.2665419142312635e-05, + "loss": 0.3370833873748779, + "step": 180360 + }, + { + "epoch": 0.7743661076908546, + "grad_norm": 1.0144314765930176, + "learning_rate": 2.266110742219501e-05, + "loss": 0.16626391410827637, + "step": 180370 + }, + { + "epoch": 0.7744090397808746, + "grad_norm": 0.0012669252464547753, + "learning_rate": 2.265679570207739e-05, + "loss": 0.11038153171539307, + "step": 180380 + }, + { + "epoch": 0.7744519718708947, + "grad_norm": 0.08255119621753693, + "learning_rate": 2.2652483981959763e-05, + "loss": 0.12617795467376708, + "step": 180390 + }, + { + "epoch": 0.7744949039609146, + "grad_norm": 0.012879389338195324, + "learning_rate": 2.264817226184214e-05, + "loss": 0.3277858018875122, + "step": 180400 + }, + { + "epoch": 0.7745378360509346, + "grad_norm": 9.729223251342773, + "learning_rate": 2.2643860541724515e-05, + "loss": 0.20663437843322754, + "step": 180410 + }, + { + "epoch": 0.7745807681409547, + "grad_norm": 0.3886057734489441, + "learning_rate": 2.2639548821606892e-05, + "loss": 0.14941807985305786, + "step": 180420 + }, + { + "epoch": 0.7746237002309746, + "grad_norm": 0.0036090456414967775, + "learning_rate": 2.263523710148927e-05, + "loss": 0.30705966949462893, + "step": 180430 + }, + { + "epoch": 0.7746666323209946, + "grad_norm": 0.02179667167365551, + "learning_rate": 2.2630925381371647e-05, + "loss": 0.13955130577087402, + "step": 180440 + }, + { + "epoch": 0.7747095644110147, + "grad_norm": 0.003469049697741866, + "learning_rate": 2.2626613661254024e-05, + "loss": 0.08940157890319825, + "step": 180450 + }, + { + "epoch": 0.7747524965010346, + "grad_norm": 1.649953007698059, + "learning_rate": 2.2622301941136398e-05, + "loss": 0.2585058450698853, + "step": 180460 + }, + { + "epoch": 0.7747954285910547, + "grad_norm": 0.0027396460063755512, + "learning_rate": 2.2617990221018775e-05, + "loss": 0.20617377758026123, + "step": 180470 + }, + { + "epoch": 0.7748383606810747, + "grad_norm": 0.0049470034427940845, + "learning_rate": 2.261367850090115e-05, + "loss": 0.05881868600845337, + "step": 180480 + }, + { + "epoch": 0.7748812927710946, + "grad_norm": 4.35863733291626, + "learning_rate": 2.2609366780783527e-05, + "loss": 0.24249038696289063, + "step": 180490 + }, + { + "epoch": 0.7749242248611147, + "grad_norm": 2.722135305404663, + "learning_rate": 2.2605055060665904e-05, + "loss": 0.32448766231536863, + "step": 180500 + }, + { + "epoch": 0.7749671569511347, + "grad_norm": 0.010333052836358547, + "learning_rate": 2.260074334054828e-05, + "loss": 0.32258646488189696, + "step": 180510 + }, + { + "epoch": 0.7750100890411546, + "grad_norm": 0.0034809063654392958, + "learning_rate": 2.2596431620430655e-05, + "loss": 0.09903744459152222, + "step": 180520 + }, + { + "epoch": 0.7750530211311747, + "grad_norm": 0.9046080112457275, + "learning_rate": 2.2592119900313033e-05, + "loss": 0.39251620769500734, + "step": 180530 + }, + { + "epoch": 0.7750959532211947, + "grad_norm": 0.0639759823679924, + "learning_rate": 2.2587808180195407e-05, + "loss": 0.021153028309345245, + "step": 180540 + }, + { + "epoch": 0.7751388853112148, + "grad_norm": 3.202190637588501, + "learning_rate": 2.2583496460077784e-05, + "loss": 0.14328465461730958, + "step": 180550 + }, + { + "epoch": 0.7751818174012347, + "grad_norm": 2.1860907077789307, + "learning_rate": 2.257918473996016e-05, + "loss": 0.4133021831512451, + "step": 180560 + }, + { + "epoch": 0.7752247494912547, + "grad_norm": 34.30323791503906, + "learning_rate": 2.257487301984254e-05, + "loss": 0.08191108703613281, + "step": 180570 + }, + { + "epoch": 0.7752676815812748, + "grad_norm": 0.9189937114715576, + "learning_rate": 2.2570561299724913e-05, + "loss": 0.32755978107452394, + "step": 180580 + }, + { + "epoch": 0.7753106136712947, + "grad_norm": 0.04545053094625473, + "learning_rate": 2.256624957960729e-05, + "loss": 0.07546036839485168, + "step": 180590 + }, + { + "epoch": 0.7753535457613148, + "grad_norm": 0.4012831151485443, + "learning_rate": 2.2561937859489664e-05, + "loss": 0.2035548448562622, + "step": 180600 + }, + { + "epoch": 0.7753964778513348, + "grad_norm": 0.5780662298202515, + "learning_rate": 2.255762613937204e-05, + "loss": 0.02327096462249756, + "step": 180610 + }, + { + "epoch": 0.7754394099413547, + "grad_norm": 1.632012963294983, + "learning_rate": 2.255331441925442e-05, + "loss": 0.2648672580718994, + "step": 180620 + }, + { + "epoch": 0.7754823420313748, + "grad_norm": 0.15308120846748352, + "learning_rate": 2.2549002699136796e-05, + "loss": 0.17612071037292482, + "step": 180630 + }, + { + "epoch": 0.7755252741213948, + "grad_norm": 0.047732140868902206, + "learning_rate": 2.254469097901917e-05, + "loss": 0.08916597366333008, + "step": 180640 + }, + { + "epoch": 0.7755682062114148, + "grad_norm": 5.136704444885254, + "learning_rate": 2.2540379258901547e-05, + "loss": 0.42125658988952636, + "step": 180650 + }, + { + "epoch": 0.7756111383014348, + "grad_norm": 0.7114421725273132, + "learning_rate": 2.253606753878392e-05, + "loss": 0.37050158977508546, + "step": 180660 + }, + { + "epoch": 0.7756540703914548, + "grad_norm": 0.008971183560788631, + "learning_rate": 2.25317558186663e-05, + "loss": 0.18429280519485475, + "step": 180670 + }, + { + "epoch": 0.7756970024814748, + "grad_norm": 0.017602117732167244, + "learning_rate": 2.2527444098548676e-05, + "loss": 0.0649482250213623, + "step": 180680 + }, + { + "epoch": 0.7757399345714948, + "grad_norm": 1.1628626585006714, + "learning_rate": 2.2523132378431053e-05, + "loss": 0.2136044979095459, + "step": 180690 + }, + { + "epoch": 0.7757828666615149, + "grad_norm": 0.6008757948875427, + "learning_rate": 2.2518820658313427e-05, + "loss": 0.2509660243988037, + "step": 180700 + }, + { + "epoch": 0.7758257987515348, + "grad_norm": 0.016465460881590843, + "learning_rate": 2.2514508938195805e-05, + "loss": 0.1221784234046936, + "step": 180710 + }, + { + "epoch": 0.7758687308415548, + "grad_norm": 0.5145441889762878, + "learning_rate": 2.251019721807818e-05, + "loss": 0.22736752033233643, + "step": 180720 + }, + { + "epoch": 0.7759116629315749, + "grad_norm": 3.371095895767212, + "learning_rate": 2.250588549796056e-05, + "loss": 0.3835928201675415, + "step": 180730 + }, + { + "epoch": 0.7759545950215948, + "grad_norm": 3.624058485031128, + "learning_rate": 2.2501573777842933e-05, + "loss": 0.15203168392181396, + "step": 180740 + }, + { + "epoch": 0.7759975271116148, + "grad_norm": 6.183959484100342, + "learning_rate": 2.249726205772531e-05, + "loss": 0.13569035530090331, + "step": 180750 + }, + { + "epoch": 0.7760404592016349, + "grad_norm": 0.5464549660682678, + "learning_rate": 2.2492950337607685e-05, + "loss": 0.08163414001464844, + "step": 180760 + }, + { + "epoch": 0.7760833912916548, + "grad_norm": 2.6773977279663086, + "learning_rate": 2.2488638617490062e-05, + "loss": 0.20119915008544922, + "step": 180770 + }, + { + "epoch": 0.7761263233816749, + "grad_norm": 0.17884625494480133, + "learning_rate": 2.2484326897372436e-05, + "loss": 0.31024243831634524, + "step": 180780 + }, + { + "epoch": 0.7761692554716949, + "grad_norm": 0.003842623671516776, + "learning_rate": 2.2480015177254817e-05, + "loss": 0.18495142459869385, + "step": 180790 + }, + { + "epoch": 0.7762121875617148, + "grad_norm": 0.0021976360585540533, + "learning_rate": 2.2475703457137194e-05, + "loss": 0.15571430921554566, + "step": 180800 + }, + { + "epoch": 0.7762551196517349, + "grad_norm": 0.04043019562959671, + "learning_rate": 2.2471391737019568e-05, + "loss": 0.12830135822296143, + "step": 180810 + }, + { + "epoch": 0.7762980517417549, + "grad_norm": 0.7248302698135376, + "learning_rate": 2.2467080016901945e-05, + "loss": 0.2668464183807373, + "step": 180820 + }, + { + "epoch": 0.7763409838317749, + "grad_norm": 0.02985418029129505, + "learning_rate": 2.246276829678432e-05, + "loss": 0.23797941207885742, + "step": 180830 + }, + { + "epoch": 0.7763839159217949, + "grad_norm": 0.005406526383012533, + "learning_rate": 2.2458456576666696e-05, + "loss": 0.13979263305664064, + "step": 180840 + }, + { + "epoch": 0.7764268480118149, + "grad_norm": 0.7003358602523804, + "learning_rate": 2.2454144856549074e-05, + "loss": 0.2024545907974243, + "step": 180850 + }, + { + "epoch": 0.7764697801018349, + "grad_norm": 2.006132125854492, + "learning_rate": 2.244983313643145e-05, + "loss": 0.36295666694641116, + "step": 180860 + }, + { + "epoch": 0.7765127121918549, + "grad_norm": 0.3677394986152649, + "learning_rate": 2.2445521416313825e-05, + "loss": 0.21423032283782958, + "step": 180870 + }, + { + "epoch": 0.776555644281875, + "grad_norm": 0.0009157925960607827, + "learning_rate": 2.2441209696196202e-05, + "loss": 0.19082083702087402, + "step": 180880 + }, + { + "epoch": 0.7765985763718949, + "grad_norm": 0.44521912932395935, + "learning_rate": 2.2436897976078576e-05, + "loss": 0.39336583614349363, + "step": 180890 + }, + { + "epoch": 0.7766415084619149, + "grad_norm": 0.010876310989260674, + "learning_rate": 2.2432586255960954e-05, + "loss": 0.3106126308441162, + "step": 180900 + }, + { + "epoch": 0.776684440551935, + "grad_norm": 2.960257053375244, + "learning_rate": 2.242827453584333e-05, + "loss": 0.27347769737243655, + "step": 180910 + }, + { + "epoch": 0.7767273726419549, + "grad_norm": 0.3393663465976715, + "learning_rate": 2.242396281572571e-05, + "loss": 0.27551591396331787, + "step": 180920 + }, + { + "epoch": 0.776770304731975, + "grad_norm": 1.4277352094650269, + "learning_rate": 2.2419651095608082e-05, + "loss": 0.12645928859710692, + "step": 180930 + }, + { + "epoch": 0.776813236821995, + "grad_norm": 0.007761645596474409, + "learning_rate": 2.241533937549046e-05, + "loss": 0.20199737548828126, + "step": 180940 + }, + { + "epoch": 0.7768561689120149, + "grad_norm": 0.00503843417391181, + "learning_rate": 2.2411027655372834e-05, + "loss": 0.05249316096305847, + "step": 180950 + }, + { + "epoch": 0.776899101002035, + "grad_norm": 0.013779071159660816, + "learning_rate": 2.240671593525521e-05, + "loss": 0.0818875014781952, + "step": 180960 + }, + { + "epoch": 0.776942033092055, + "grad_norm": 0.001981938723474741, + "learning_rate": 2.240240421513759e-05, + "loss": 0.12001742124557495, + "step": 180970 + }, + { + "epoch": 0.776984965182075, + "grad_norm": 0.10726061463356018, + "learning_rate": 2.2398092495019966e-05, + "loss": 0.11583765745162963, + "step": 180980 + }, + { + "epoch": 0.777027897272095, + "grad_norm": 0.0004066908441018313, + "learning_rate": 2.239378077490234e-05, + "loss": 0.07836989164352418, + "step": 180990 + }, + { + "epoch": 0.777070829362115, + "grad_norm": 0.5633981227874756, + "learning_rate": 2.2389469054784717e-05, + "loss": 0.08607017993927002, + "step": 181000 + }, + { + "epoch": 0.777070829362115, + "eval_loss": 0.38105422258377075, + "eval_runtime": 27.5029, + "eval_samples_per_second": 3.636, + "eval_steps_per_second": 3.636, + "step": 181000 + }, + { + "epoch": 0.7771137614521351, + "grad_norm": 0.2542254328727722, + "learning_rate": 2.238515733466709e-05, + "loss": 0.24805686473846436, + "step": 181010 + }, + { + "epoch": 0.777156693542155, + "grad_norm": 2.633180856704712, + "learning_rate": 2.238084561454947e-05, + "loss": 0.12436635494232177, + "step": 181020 + }, + { + "epoch": 0.777199625632175, + "grad_norm": 0.5145475268363953, + "learning_rate": 2.2376533894431846e-05, + "loss": 0.13324408531188964, + "step": 181030 + }, + { + "epoch": 0.7772425577221951, + "grad_norm": 0.002931904746219516, + "learning_rate": 2.2372222174314223e-05, + "loss": 0.13255962133407592, + "step": 181040 + }, + { + "epoch": 0.777285489812215, + "grad_norm": 0.047408297657966614, + "learning_rate": 2.2367910454196597e-05, + "loss": 0.13203340768814087, + "step": 181050 + }, + { + "epoch": 0.7773284219022351, + "grad_norm": 3.426299810409546, + "learning_rate": 2.2363598734078974e-05, + "loss": 0.4311577796936035, + "step": 181060 + }, + { + "epoch": 0.7773713539922551, + "grad_norm": 0.04267265275120735, + "learning_rate": 2.2359287013961348e-05, + "loss": 0.3033830404281616, + "step": 181070 + }, + { + "epoch": 0.777414286082275, + "grad_norm": 1.0460861921310425, + "learning_rate": 2.235497529384373e-05, + "loss": 0.2784090995788574, + "step": 181080 + }, + { + "epoch": 0.7774572181722951, + "grad_norm": 0.10757939517498016, + "learning_rate": 2.2350663573726103e-05, + "loss": 0.2997272491455078, + "step": 181090 + }, + { + "epoch": 0.7775001502623151, + "grad_norm": 0.29957115650177, + "learning_rate": 2.234635185360848e-05, + "loss": 0.16533089876174928, + "step": 181100 + }, + { + "epoch": 0.777543082352335, + "grad_norm": 0.01229582354426384, + "learning_rate": 2.2342040133490854e-05, + "loss": 0.14539920091629027, + "step": 181110 + }, + { + "epoch": 0.7775860144423551, + "grad_norm": 5.837769508361816, + "learning_rate": 2.233772841337323e-05, + "loss": 0.22391026020050048, + "step": 181120 + }, + { + "epoch": 0.7776289465323751, + "grad_norm": 0.005174816586077213, + "learning_rate": 2.2333416693255606e-05, + "loss": 0.1318308472633362, + "step": 181130 + }, + { + "epoch": 0.7776718786223951, + "grad_norm": 1.294174075126648, + "learning_rate": 2.2329104973137986e-05, + "loss": 0.1779860258102417, + "step": 181140 + }, + { + "epoch": 0.7777148107124151, + "grad_norm": 1.2440794706344604, + "learning_rate": 2.2324793253020364e-05, + "loss": 0.5334236145019531, + "step": 181150 + }, + { + "epoch": 0.7777577428024351, + "grad_norm": 59.03296661376953, + "learning_rate": 2.2320481532902738e-05, + "loss": 0.30062379837036135, + "step": 181160 + }, + { + "epoch": 0.7778006748924551, + "grad_norm": 2.072064161300659, + "learning_rate": 2.2316169812785115e-05, + "loss": 0.15776560306549073, + "step": 181170 + }, + { + "epoch": 0.7778436069824751, + "grad_norm": 0.0024535886477679014, + "learning_rate": 2.231185809266749e-05, + "loss": 0.04311685562133789, + "step": 181180 + }, + { + "epoch": 0.7778865390724952, + "grad_norm": 6.453382968902588, + "learning_rate": 2.2307546372549866e-05, + "loss": 0.30128116607666017, + "step": 181190 + }, + { + "epoch": 0.7779294711625151, + "grad_norm": 0.003028257517144084, + "learning_rate": 2.2303234652432244e-05, + "loss": 0.24861340522766112, + "step": 181200 + }, + { + "epoch": 0.7779724032525351, + "grad_norm": 0.5890544056892395, + "learning_rate": 2.229892293231462e-05, + "loss": 0.20256829261779785, + "step": 181210 + }, + { + "epoch": 0.7780153353425552, + "grad_norm": 0.1589733362197876, + "learning_rate": 2.2294611212196995e-05, + "loss": 0.031123009324073792, + "step": 181220 + }, + { + "epoch": 0.7780582674325751, + "grad_norm": 0.0035427564289420843, + "learning_rate": 2.2290299492079372e-05, + "loss": 0.1692456603050232, + "step": 181230 + }, + { + "epoch": 0.7781011995225952, + "grad_norm": 1.0639280080795288, + "learning_rate": 2.2285987771961746e-05, + "loss": 0.19282352924346924, + "step": 181240 + }, + { + "epoch": 0.7781441316126152, + "grad_norm": 0.019336223602294922, + "learning_rate": 2.2281676051844123e-05, + "loss": 0.11884394884109498, + "step": 181250 + }, + { + "epoch": 0.7781870637026351, + "grad_norm": 3.3598930835723877, + "learning_rate": 2.22773643317265e-05, + "loss": 0.11887574195861816, + "step": 181260 + }, + { + "epoch": 0.7782299957926552, + "grad_norm": 0.0136167136952281, + "learning_rate": 2.2273052611608878e-05, + "loss": 0.321718430519104, + "step": 181270 + }, + { + "epoch": 0.7782729278826752, + "grad_norm": 0.06143486872315407, + "learning_rate": 2.2268740891491252e-05, + "loss": 0.01671365797519684, + "step": 181280 + }, + { + "epoch": 0.7783158599726951, + "grad_norm": 0.07697362452745438, + "learning_rate": 2.226442917137363e-05, + "loss": 0.1757973313331604, + "step": 181290 + }, + { + "epoch": 0.7783587920627152, + "grad_norm": 4.821767330169678, + "learning_rate": 2.2260117451256003e-05, + "loss": 0.3337341070175171, + "step": 181300 + }, + { + "epoch": 0.7784017241527352, + "grad_norm": 0.8795390129089355, + "learning_rate": 2.225580573113838e-05, + "loss": 0.24267683029174805, + "step": 181310 + }, + { + "epoch": 0.7784446562427552, + "grad_norm": 6.890802383422852, + "learning_rate": 2.2251494011020758e-05, + "loss": 0.19483572244644165, + "step": 181320 + }, + { + "epoch": 0.7784875883327752, + "grad_norm": 1.7252296209335327, + "learning_rate": 2.2247182290903135e-05, + "loss": 0.3330058574676514, + "step": 181330 + }, + { + "epoch": 0.7785305204227952, + "grad_norm": 0.007271029055118561, + "learning_rate": 2.224287057078551e-05, + "loss": 0.13872404098510743, + "step": 181340 + }, + { + "epoch": 0.7785734525128152, + "grad_norm": 0.10837399959564209, + "learning_rate": 2.2238558850667887e-05, + "loss": 0.08872743844985961, + "step": 181350 + }, + { + "epoch": 0.7786163846028352, + "grad_norm": 0.012545960955321789, + "learning_rate": 2.223424713055026e-05, + "loss": 0.1828855037689209, + "step": 181360 + }, + { + "epoch": 0.7786593166928553, + "grad_norm": 1.8288440704345703, + "learning_rate": 2.2229935410432638e-05, + "loss": 0.32746448516845705, + "step": 181370 + }, + { + "epoch": 0.7787022487828752, + "grad_norm": 0.5537493824958801, + "learning_rate": 2.2225623690315015e-05, + "loss": 0.37675645351409914, + "step": 181380 + }, + { + "epoch": 0.7787451808728952, + "grad_norm": 0.10458727180957794, + "learning_rate": 2.2221311970197393e-05, + "loss": 0.12987738847732544, + "step": 181390 + }, + { + "epoch": 0.7787881129629153, + "grad_norm": 0.005267029628157616, + "learning_rate": 2.2217000250079767e-05, + "loss": 0.10624191761016846, + "step": 181400 + }, + { + "epoch": 0.7788310450529353, + "grad_norm": 13.237812995910645, + "learning_rate": 2.2212688529962144e-05, + "loss": 0.28482346534729003, + "step": 181410 + }, + { + "epoch": 0.7788739771429553, + "grad_norm": 0.005658499430865049, + "learning_rate": 2.2208376809844518e-05, + "loss": 0.0005017845891416073, + "step": 181420 + }, + { + "epoch": 0.7789169092329753, + "grad_norm": 0.040372833609580994, + "learning_rate": 2.2204065089726895e-05, + "loss": 0.0969342827796936, + "step": 181430 + }, + { + "epoch": 0.7789598413229953, + "grad_norm": 0.20668336749076843, + "learning_rate": 2.2199753369609273e-05, + "loss": 0.12974029779434204, + "step": 181440 + }, + { + "epoch": 0.7790027734130153, + "grad_norm": 0.0021882448345422745, + "learning_rate": 2.219544164949165e-05, + "loss": 0.07826072573661805, + "step": 181450 + }, + { + "epoch": 0.7790457055030353, + "grad_norm": 0.03661274537444115, + "learning_rate": 2.2191129929374024e-05, + "loss": 0.1869038701057434, + "step": 181460 + }, + { + "epoch": 0.7790886375930554, + "grad_norm": 0.0007741366280242801, + "learning_rate": 2.21868182092564e-05, + "loss": 0.369673752784729, + "step": 181470 + }, + { + "epoch": 0.7791315696830753, + "grad_norm": 0.009072545915842056, + "learning_rate": 2.2182506489138775e-05, + "loss": 0.10426502227783203, + "step": 181480 + }, + { + "epoch": 0.7791745017730953, + "grad_norm": 0.010089321993291378, + "learning_rate": 2.2178194769021156e-05, + "loss": 0.19302576780319214, + "step": 181490 + }, + { + "epoch": 0.7792174338631154, + "grad_norm": 0.07819779217243195, + "learning_rate": 2.217388304890353e-05, + "loss": 0.1887624144554138, + "step": 181500 + }, + { + "epoch": 0.7792603659531353, + "grad_norm": 1.0896580219268799, + "learning_rate": 2.2169571328785907e-05, + "loss": 0.17373030185699462, + "step": 181510 + }, + { + "epoch": 0.7793032980431553, + "grad_norm": 9.74447830230929e-05, + "learning_rate": 2.2165259608668285e-05, + "loss": 0.0991288185119629, + "step": 181520 + }, + { + "epoch": 0.7793462301331754, + "grad_norm": 0.029820624738931656, + "learning_rate": 2.216094788855066e-05, + "loss": 0.1576171636581421, + "step": 181530 + }, + { + "epoch": 0.7793891622231953, + "grad_norm": 0.5509734153747559, + "learning_rate": 2.2156636168433036e-05, + "loss": 0.18902976512908937, + "step": 181540 + }, + { + "epoch": 0.7794320943132154, + "grad_norm": 5.502527713775635, + "learning_rate": 2.2152324448315413e-05, + "loss": 0.3472958326339722, + "step": 181550 + }, + { + "epoch": 0.7794750264032354, + "grad_norm": 3.362891435623169, + "learning_rate": 2.214801272819779e-05, + "loss": 0.3285074234008789, + "step": 181560 + }, + { + "epoch": 0.7795179584932553, + "grad_norm": 0.019354185089468956, + "learning_rate": 2.2143701008080165e-05, + "loss": 0.11236345767974854, + "step": 181570 + }, + { + "epoch": 0.7795608905832754, + "grad_norm": 0.0021464480087161064, + "learning_rate": 2.2139389287962542e-05, + "loss": 0.05739705562591553, + "step": 181580 + }, + { + "epoch": 0.7796038226732954, + "grad_norm": 2.1688601970672607, + "learning_rate": 2.2135077567844916e-05, + "loss": 0.2590325832366943, + "step": 181590 + }, + { + "epoch": 0.7796467547633154, + "grad_norm": 0.0046370062045753, + "learning_rate": 2.2130765847727293e-05, + "loss": 0.06834167838096619, + "step": 181600 + }, + { + "epoch": 0.7796896868533354, + "grad_norm": 7.68601655960083, + "learning_rate": 2.212645412760967e-05, + "loss": 0.256929874420166, + "step": 181610 + }, + { + "epoch": 0.7797326189433554, + "grad_norm": 0.034900128841400146, + "learning_rate": 2.2122142407492048e-05, + "loss": 0.2113412380218506, + "step": 181620 + }, + { + "epoch": 0.7797755510333754, + "grad_norm": 0.0029798184987157583, + "learning_rate": 2.2117830687374422e-05, + "loss": 0.14006327390670775, + "step": 181630 + }, + { + "epoch": 0.7798184831233954, + "grad_norm": 0.0022323820739984512, + "learning_rate": 2.21135189672568e-05, + "loss": 0.07900501489639282, + "step": 181640 + }, + { + "epoch": 0.7798614152134155, + "grad_norm": 1.9272010326385498, + "learning_rate": 2.2109207247139173e-05, + "loss": 0.33671183586120607, + "step": 181650 + }, + { + "epoch": 0.7799043473034354, + "grad_norm": 0.0035930187441408634, + "learning_rate": 2.210489552702155e-05, + "loss": 0.19001634120941163, + "step": 181660 + }, + { + "epoch": 0.7799472793934554, + "grad_norm": 1.291403889656067, + "learning_rate": 2.2100583806903928e-05, + "loss": 0.39527206420898436, + "step": 181670 + }, + { + "epoch": 0.7799902114834755, + "grad_norm": 0.18798917531967163, + "learning_rate": 2.2096272086786305e-05, + "loss": 0.2213963270187378, + "step": 181680 + }, + { + "epoch": 0.7800331435734954, + "grad_norm": 4.843586444854736, + "learning_rate": 2.209196036666868e-05, + "loss": 0.2306389331817627, + "step": 181690 + }, + { + "epoch": 0.7800760756635154, + "grad_norm": 0.06972838193178177, + "learning_rate": 2.2087648646551056e-05, + "loss": 0.17254316806793213, + "step": 181700 + }, + { + "epoch": 0.7801190077535355, + "grad_norm": 0.00852520577609539, + "learning_rate": 2.208333692643343e-05, + "loss": 0.018256105482578278, + "step": 181710 + }, + { + "epoch": 0.7801619398435554, + "grad_norm": 0.006862110458314419, + "learning_rate": 2.2079025206315808e-05, + "loss": 0.21431200504302977, + "step": 181720 + }, + { + "epoch": 0.7802048719335755, + "grad_norm": 0.39576256275177, + "learning_rate": 2.2074713486198185e-05, + "loss": 0.31564376354217527, + "step": 181730 + }, + { + "epoch": 0.7802478040235955, + "grad_norm": 3.400718927383423, + "learning_rate": 2.2070401766080562e-05, + "loss": 0.22937994003295897, + "step": 181740 + }, + { + "epoch": 0.7802907361136154, + "grad_norm": 0.1543576568365097, + "learning_rate": 2.2066090045962936e-05, + "loss": 0.2346953868865967, + "step": 181750 + }, + { + "epoch": 0.7803336682036355, + "grad_norm": 1.9610964059829712, + "learning_rate": 2.2061778325845314e-05, + "loss": 0.2245539665222168, + "step": 181760 + }, + { + "epoch": 0.7803766002936555, + "grad_norm": 10.084091186523438, + "learning_rate": 2.2057466605727688e-05, + "loss": 0.3631103038787842, + "step": 181770 + }, + { + "epoch": 0.7804195323836755, + "grad_norm": 0.3080121874809265, + "learning_rate": 2.2053154885610065e-05, + "loss": 0.2070967435836792, + "step": 181780 + }, + { + "epoch": 0.7804624644736955, + "grad_norm": 0.010580839589238167, + "learning_rate": 2.2048843165492442e-05, + "loss": 0.3262079477310181, + "step": 181790 + }, + { + "epoch": 0.7805053965637155, + "grad_norm": 0.000504847674164921, + "learning_rate": 2.204453144537482e-05, + "loss": 0.41712441444396975, + "step": 181800 + }, + { + "epoch": 0.7805483286537355, + "grad_norm": 0.0038607046008110046, + "learning_rate": 2.2040219725257194e-05, + "loss": 0.15046488046646117, + "step": 181810 + }, + { + "epoch": 0.7805912607437555, + "grad_norm": 0.027759751304984093, + "learning_rate": 2.203590800513957e-05, + "loss": 0.16718239784240724, + "step": 181820 + }, + { + "epoch": 0.7806341928337756, + "grad_norm": 0.004910132847726345, + "learning_rate": 2.2031596285021945e-05, + "loss": 0.37359812259674074, + "step": 181830 + }, + { + "epoch": 0.7806771249237956, + "grad_norm": 0.013715947978198528, + "learning_rate": 2.2027284564904326e-05, + "loss": 0.19829065799713136, + "step": 181840 + }, + { + "epoch": 0.7807200570138155, + "grad_norm": 0.6874580979347229, + "learning_rate": 2.20229728447867e-05, + "loss": 0.15471491813659669, + "step": 181850 + }, + { + "epoch": 0.7807629891038356, + "grad_norm": 0.02267182432115078, + "learning_rate": 2.2018661124669077e-05, + "loss": 0.12024468183517456, + "step": 181860 + }, + { + "epoch": 0.7808059211938556, + "grad_norm": 0.0022773402743041515, + "learning_rate": 2.201434940455145e-05, + "loss": 0.1695494294166565, + "step": 181870 + }, + { + "epoch": 0.7808488532838755, + "grad_norm": 1.056784749031067, + "learning_rate": 2.201003768443383e-05, + "loss": 0.1925033450126648, + "step": 181880 + }, + { + "epoch": 0.7808917853738956, + "grad_norm": 0.05219695717096329, + "learning_rate": 2.2005725964316206e-05, + "loss": 0.20744595527648926, + "step": 181890 + }, + { + "epoch": 0.7809347174639156, + "grad_norm": 1.8505367040634155, + "learning_rate": 2.2001414244198583e-05, + "loss": 0.38707144260406495, + "step": 181900 + }, + { + "epoch": 0.7809776495539356, + "grad_norm": 0.035705603659152985, + "learning_rate": 2.199710252408096e-05, + "loss": 0.12455270290374756, + "step": 181910 + }, + { + "epoch": 0.7810205816439556, + "grad_norm": 0.4392683804035187, + "learning_rate": 2.1992790803963334e-05, + "loss": 0.22299594879150392, + "step": 181920 + }, + { + "epoch": 0.7810635137339756, + "grad_norm": 14.645524024963379, + "learning_rate": 2.198847908384571e-05, + "loss": 0.17089887857437133, + "step": 181930 + }, + { + "epoch": 0.7811064458239956, + "grad_norm": 0.014898652210831642, + "learning_rate": 2.1984167363728086e-05, + "loss": 0.22336406707763673, + "step": 181940 + }, + { + "epoch": 0.7811493779140156, + "grad_norm": 0.2620161771774292, + "learning_rate": 2.1979855643610463e-05, + "loss": 0.11998844146728516, + "step": 181950 + }, + { + "epoch": 0.7811923100040357, + "grad_norm": 0.5950635075569153, + "learning_rate": 2.197554392349284e-05, + "loss": 0.09730787873268128, + "step": 181960 + }, + { + "epoch": 0.7812352420940556, + "grad_norm": 0.036811236292123795, + "learning_rate": 2.1971232203375218e-05, + "loss": 0.20459918975830077, + "step": 181970 + }, + { + "epoch": 0.7812781741840756, + "grad_norm": 1.3045154809951782, + "learning_rate": 2.196692048325759e-05, + "loss": 0.1823878765106201, + "step": 181980 + }, + { + "epoch": 0.7813211062740957, + "grad_norm": 0.025310710072517395, + "learning_rate": 2.196260876313997e-05, + "loss": 0.11046243906021118, + "step": 181990 + }, + { + "epoch": 0.7813640383641156, + "grad_norm": 0.0037307622842490673, + "learning_rate": 2.1958297043022343e-05, + "loss": 0.2978527307510376, + "step": 182000 + }, + { + "epoch": 0.7813640383641156, + "eval_loss": 0.3825055658817291, + "eval_runtime": 27.4746, + "eval_samples_per_second": 3.64, + "eval_steps_per_second": 3.64, + "step": 182000 + }, + { + "epoch": 0.7814069704541357, + "grad_norm": 0.23781545460224152, + "learning_rate": 2.195398532290472e-05, + "loss": 0.09041933417320251, + "step": 182010 + }, + { + "epoch": 0.7814499025441557, + "grad_norm": 0.013464709743857384, + "learning_rate": 2.1949673602787098e-05, + "loss": 0.08655680418014526, + "step": 182020 + }, + { + "epoch": 0.7814928346341756, + "grad_norm": 0.0028776293620467186, + "learning_rate": 2.1945361882669475e-05, + "loss": 0.17070761919021607, + "step": 182030 + }, + { + "epoch": 0.7815357667241957, + "grad_norm": 0.023576032370328903, + "learning_rate": 2.194105016255185e-05, + "loss": 0.1503755569458008, + "step": 182040 + }, + { + "epoch": 0.7815786988142157, + "grad_norm": 4.2293853759765625, + "learning_rate": 2.1936738442434226e-05, + "loss": 0.24403553009033202, + "step": 182050 + }, + { + "epoch": 0.7816216309042356, + "grad_norm": 8.121377944946289, + "learning_rate": 2.19324267223166e-05, + "loss": 0.396437668800354, + "step": 182060 + }, + { + "epoch": 0.7816645629942557, + "grad_norm": 2.9311256408691406, + "learning_rate": 2.1928115002198978e-05, + "loss": 0.24542434215545655, + "step": 182070 + }, + { + "epoch": 0.7817074950842757, + "grad_norm": 1.1242650747299194, + "learning_rate": 2.1923803282081355e-05, + "loss": 0.20290498733520507, + "step": 182080 + }, + { + "epoch": 0.7817504271742957, + "grad_norm": 0.004131468012928963, + "learning_rate": 2.1919491561963732e-05, + "loss": 0.29813075065612793, + "step": 182090 + }, + { + "epoch": 0.7817933592643157, + "grad_norm": 0.9652837514877319, + "learning_rate": 2.1915179841846106e-05, + "loss": 0.05880681276321411, + "step": 182100 + }, + { + "epoch": 0.7818362913543357, + "grad_norm": 0.8078187704086304, + "learning_rate": 2.1910868121728484e-05, + "loss": 0.22447094917297364, + "step": 182110 + }, + { + "epoch": 0.7818792234443557, + "grad_norm": 0.006744046695530415, + "learning_rate": 2.1906556401610857e-05, + "loss": 0.17717311382293702, + "step": 182120 + }, + { + "epoch": 0.7819221555343757, + "grad_norm": 0.02256331779062748, + "learning_rate": 2.1902244681493235e-05, + "loss": 0.08970722556114197, + "step": 182130 + }, + { + "epoch": 0.7819650876243958, + "grad_norm": 7.174070835113525, + "learning_rate": 2.1897932961375612e-05, + "loss": 0.3284448623657227, + "step": 182140 + }, + { + "epoch": 0.7820080197144157, + "grad_norm": 0.005719190463423729, + "learning_rate": 2.189362124125799e-05, + "loss": 0.30143983364105226, + "step": 182150 + }, + { + "epoch": 0.7820509518044357, + "grad_norm": 2.5708513259887695, + "learning_rate": 2.1889309521140363e-05, + "loss": 0.13481109142303466, + "step": 182160 + }, + { + "epoch": 0.7820938838944558, + "grad_norm": 0.009437017142772675, + "learning_rate": 2.188499780102274e-05, + "loss": 0.07504384517669678, + "step": 182170 + }, + { + "epoch": 0.7821368159844757, + "grad_norm": 0.4418356716632843, + "learning_rate": 2.1880686080905115e-05, + "loss": 0.18481459617614746, + "step": 182180 + }, + { + "epoch": 0.7821797480744958, + "grad_norm": 0.9562405347824097, + "learning_rate": 2.1876374360787495e-05, + "loss": 0.2165064811706543, + "step": 182190 + }, + { + "epoch": 0.7822226801645158, + "grad_norm": 1.1468602418899536, + "learning_rate": 2.187206264066987e-05, + "loss": 0.2354745388031006, + "step": 182200 + }, + { + "epoch": 0.7822656122545357, + "grad_norm": 0.0384535938501358, + "learning_rate": 2.1867750920552247e-05, + "loss": 0.2891047477722168, + "step": 182210 + }, + { + "epoch": 0.7823085443445558, + "grad_norm": 0.06366550922393799, + "learning_rate": 2.186343920043462e-05, + "loss": 0.1543177843093872, + "step": 182220 + }, + { + "epoch": 0.7823514764345758, + "grad_norm": 0.36118966341018677, + "learning_rate": 2.1859127480316998e-05, + "loss": 0.16045292615890502, + "step": 182230 + }, + { + "epoch": 0.7823944085245957, + "grad_norm": 0.000742246164008975, + "learning_rate": 2.1854815760199372e-05, + "loss": 0.18205565214157104, + "step": 182240 + }, + { + "epoch": 0.7824373406146158, + "grad_norm": 0.21004563570022583, + "learning_rate": 2.1850504040081753e-05, + "loss": 0.17873560190200805, + "step": 182250 + }, + { + "epoch": 0.7824802727046358, + "grad_norm": 0.27570030093193054, + "learning_rate": 2.184619231996413e-05, + "loss": 0.2319551467895508, + "step": 182260 + }, + { + "epoch": 0.7825232047946559, + "grad_norm": 0.9478753805160522, + "learning_rate": 2.1841880599846504e-05, + "loss": 0.3653576374053955, + "step": 182270 + }, + { + "epoch": 0.7825661368846758, + "grad_norm": 0.0772586390376091, + "learning_rate": 2.183756887972888e-05, + "loss": 0.256859564781189, + "step": 182280 + }, + { + "epoch": 0.7826090689746958, + "grad_norm": 0.24786922335624695, + "learning_rate": 2.1833257159611255e-05, + "loss": 0.06968159675598144, + "step": 182290 + }, + { + "epoch": 0.7826520010647159, + "grad_norm": 1.7795507907867432, + "learning_rate": 2.1828945439493633e-05, + "loss": 0.3153455018997192, + "step": 182300 + }, + { + "epoch": 0.7826949331547358, + "grad_norm": 4.49236536026001, + "learning_rate": 2.182463371937601e-05, + "loss": 0.3862607002258301, + "step": 182310 + }, + { + "epoch": 0.7827378652447559, + "grad_norm": 1.5710570812225342, + "learning_rate": 2.1820321999258387e-05, + "loss": 0.1828963041305542, + "step": 182320 + }, + { + "epoch": 0.7827807973347759, + "grad_norm": 0.00569881172850728, + "learning_rate": 2.181601027914076e-05, + "loss": 0.0007293551228940487, + "step": 182330 + }, + { + "epoch": 0.7828237294247958, + "grad_norm": 0.00949194747954607, + "learning_rate": 2.181169855902314e-05, + "loss": 0.16933388710021974, + "step": 182340 + }, + { + "epoch": 0.7828666615148159, + "grad_norm": 0.008789801970124245, + "learning_rate": 2.1807386838905513e-05, + "loss": 0.23258440494537352, + "step": 182350 + }, + { + "epoch": 0.7829095936048359, + "grad_norm": 2.179302453994751, + "learning_rate": 2.180307511878789e-05, + "loss": 0.19624111652374268, + "step": 182360 + }, + { + "epoch": 0.7829525256948558, + "grad_norm": 4.362630844116211, + "learning_rate": 2.1798763398670267e-05, + "loss": 0.23598828315734863, + "step": 182370 + }, + { + "epoch": 0.7829954577848759, + "grad_norm": 0.09223562479019165, + "learning_rate": 2.1794451678552645e-05, + "loss": 0.0645039439201355, + "step": 182380 + }, + { + "epoch": 0.7830383898748959, + "grad_norm": 0.004430851899087429, + "learning_rate": 2.179013995843502e-05, + "loss": 0.1624495267868042, + "step": 182390 + }, + { + "epoch": 0.7830813219649159, + "grad_norm": 0.00718493340536952, + "learning_rate": 2.1785828238317396e-05, + "loss": 0.20069551467895508, + "step": 182400 + }, + { + "epoch": 0.7831242540549359, + "grad_norm": 3.395489454269409, + "learning_rate": 2.178151651819977e-05, + "loss": 0.17186505794525148, + "step": 182410 + }, + { + "epoch": 0.783167186144956, + "grad_norm": 1.021973729133606, + "learning_rate": 2.1777204798082147e-05, + "loss": 0.1961965322494507, + "step": 182420 + }, + { + "epoch": 0.7832101182349759, + "grad_norm": 0.0013628596207126975, + "learning_rate": 2.1772893077964525e-05, + "loss": 0.1489197015762329, + "step": 182430 + }, + { + "epoch": 0.7832530503249959, + "grad_norm": 0.028677189722657204, + "learning_rate": 2.1768581357846902e-05, + "loss": 0.2267765998840332, + "step": 182440 + }, + { + "epoch": 0.783295982415016, + "grad_norm": 1.3386242389678955, + "learning_rate": 2.1764269637729276e-05, + "loss": 0.35156545639038084, + "step": 182450 + }, + { + "epoch": 0.7833389145050359, + "grad_norm": 0.007181175053119659, + "learning_rate": 2.1759957917611653e-05, + "loss": 0.2567557096481323, + "step": 182460 + }, + { + "epoch": 0.7833818465950559, + "grad_norm": 0.3704771101474762, + "learning_rate": 2.1755646197494027e-05, + "loss": 0.0791439414024353, + "step": 182470 + }, + { + "epoch": 0.783424778685076, + "grad_norm": 0.7793884873390198, + "learning_rate": 2.1751334477376405e-05, + "loss": 0.14850038290023804, + "step": 182480 + }, + { + "epoch": 0.7834677107750959, + "grad_norm": 0.00027104900800623, + "learning_rate": 2.1747022757258782e-05, + "loss": 0.08147647380828857, + "step": 182490 + }, + { + "epoch": 0.783510642865116, + "grad_norm": 0.00521089369431138, + "learning_rate": 2.174271103714116e-05, + "loss": 0.08911651372909546, + "step": 182500 + }, + { + "epoch": 0.783553574955136, + "grad_norm": 0.01887751929461956, + "learning_rate": 2.1738399317023533e-05, + "loss": 0.304192852973938, + "step": 182510 + }, + { + "epoch": 0.7835965070451559, + "grad_norm": 0.02301798388361931, + "learning_rate": 2.173408759690591e-05, + "loss": 0.22890405654907225, + "step": 182520 + }, + { + "epoch": 0.783639439135176, + "grad_norm": 0.018653536215424538, + "learning_rate": 2.1729775876788284e-05, + "loss": 0.19519143104553222, + "step": 182530 + }, + { + "epoch": 0.783682371225196, + "grad_norm": 0.030346734449267387, + "learning_rate": 2.1725464156670662e-05, + "loss": 0.41370511054992676, + "step": 182540 + }, + { + "epoch": 0.783725303315216, + "grad_norm": 0.8571396470069885, + "learning_rate": 2.172115243655304e-05, + "loss": 0.38896913528442384, + "step": 182550 + }, + { + "epoch": 0.783768235405236, + "grad_norm": 0.00412337388843298, + "learning_rate": 2.1716840716435417e-05, + "loss": 0.10326659679412842, + "step": 182560 + }, + { + "epoch": 0.783811167495256, + "grad_norm": 0.6427186727523804, + "learning_rate": 2.171252899631779e-05, + "loss": 0.25321714878082274, + "step": 182570 + }, + { + "epoch": 0.783854099585276, + "grad_norm": 0.001954421168193221, + "learning_rate": 2.1708217276200168e-05, + "loss": 0.05052712559700012, + "step": 182580 + }, + { + "epoch": 0.783897031675296, + "grad_norm": 1.9638227224349976, + "learning_rate": 2.1703905556082542e-05, + "loss": 0.4849149227142334, + "step": 182590 + }, + { + "epoch": 0.783939963765316, + "grad_norm": 0.020015276968479156, + "learning_rate": 2.1699593835964922e-05, + "loss": 0.04313889741897583, + "step": 182600 + }, + { + "epoch": 0.783982895855336, + "grad_norm": 0.020334254950284958, + "learning_rate": 2.16952821158473e-05, + "loss": 0.15066235065460204, + "step": 182610 + }, + { + "epoch": 0.784025827945356, + "grad_norm": 6.323507785797119, + "learning_rate": 2.1690970395729674e-05, + "loss": 0.1465320110321045, + "step": 182620 + }, + { + "epoch": 0.7840687600353761, + "grad_norm": 0.030544867739081383, + "learning_rate": 2.168665867561205e-05, + "loss": 0.06789618730545044, + "step": 182630 + }, + { + "epoch": 0.784111692125396, + "grad_norm": 0.03058551624417305, + "learning_rate": 2.1682346955494425e-05, + "loss": 0.20928442478179932, + "step": 182640 + }, + { + "epoch": 0.784154624215416, + "grad_norm": 0.4168550372123718, + "learning_rate": 2.1678035235376802e-05, + "loss": 0.2982338428497314, + "step": 182650 + }, + { + "epoch": 0.7841975563054361, + "grad_norm": 12.178466796875, + "learning_rate": 2.167372351525918e-05, + "loss": 0.27531182765960693, + "step": 182660 + }, + { + "epoch": 0.784240488395456, + "grad_norm": 0.009957391768693924, + "learning_rate": 2.1669411795141557e-05, + "loss": 0.2303466320037842, + "step": 182670 + }, + { + "epoch": 0.7842834204854761, + "grad_norm": 0.0028455080464482307, + "learning_rate": 2.166510007502393e-05, + "loss": 0.17376034259796141, + "step": 182680 + }, + { + "epoch": 0.7843263525754961, + "grad_norm": 8.281001091003418, + "learning_rate": 2.166078835490631e-05, + "loss": 0.18689534664154053, + "step": 182690 + }, + { + "epoch": 0.7843692846655161, + "grad_norm": 0.005231133662164211, + "learning_rate": 2.1656476634788682e-05, + "loss": 0.2655553102493286, + "step": 182700 + }, + { + "epoch": 0.7844122167555361, + "grad_norm": 5.244409561157227, + "learning_rate": 2.165216491467106e-05, + "loss": 0.32235958576202395, + "step": 182710 + }, + { + "epoch": 0.7844551488455561, + "grad_norm": 0.15298572182655334, + "learning_rate": 2.1647853194553437e-05, + "loss": 0.36844849586486816, + "step": 182720 + }, + { + "epoch": 0.7844980809355762, + "grad_norm": 0.020973147824406624, + "learning_rate": 2.1643541474435814e-05, + "loss": 0.2034377336502075, + "step": 182730 + }, + { + "epoch": 0.7845410130255961, + "grad_norm": 2.0955288410186768, + "learning_rate": 2.163922975431819e-05, + "loss": 0.3749516487121582, + "step": 182740 + }, + { + "epoch": 0.7845839451156161, + "grad_norm": 0.053335897624492645, + "learning_rate": 2.1634918034200566e-05, + "loss": 0.22016558647155762, + "step": 182750 + }, + { + "epoch": 0.7846268772056362, + "grad_norm": 10.606240272521973, + "learning_rate": 2.163060631408294e-05, + "loss": 0.2146967649459839, + "step": 182760 + }, + { + "epoch": 0.7846698092956561, + "grad_norm": 1.1345289945602417, + "learning_rate": 2.1626294593965317e-05, + "loss": 0.4405350685119629, + "step": 182770 + }, + { + "epoch": 0.7847127413856761, + "grad_norm": 0.028475617989897728, + "learning_rate": 2.1621982873847694e-05, + "loss": 0.1836310863494873, + "step": 182780 + }, + { + "epoch": 0.7847556734756962, + "grad_norm": 0.7680911421775818, + "learning_rate": 2.161767115373007e-05, + "loss": 0.19354742765426636, + "step": 182790 + }, + { + "epoch": 0.7847986055657161, + "grad_norm": 0.04010359197854996, + "learning_rate": 2.1613359433612446e-05, + "loss": 0.2988368272781372, + "step": 182800 + }, + { + "epoch": 0.7848415376557362, + "grad_norm": 0.0036592287942767143, + "learning_rate": 2.1609047713494823e-05, + "loss": 0.17033002376556397, + "step": 182810 + }, + { + "epoch": 0.7848844697457562, + "grad_norm": 0.014441991224884987, + "learning_rate": 2.1604735993377197e-05, + "loss": 0.11610093116760253, + "step": 182820 + }, + { + "epoch": 0.7849274018357761, + "grad_norm": 0.3196814954280853, + "learning_rate": 2.1600424273259574e-05, + "loss": 0.23743658065795897, + "step": 182830 + }, + { + "epoch": 0.7849703339257962, + "grad_norm": 1.738743782043457, + "learning_rate": 2.159611255314195e-05, + "loss": 0.36977529525756836, + "step": 182840 + }, + { + "epoch": 0.7850132660158162, + "grad_norm": 0.5812845230102539, + "learning_rate": 2.159180083302433e-05, + "loss": 0.25585949420928955, + "step": 182850 + }, + { + "epoch": 0.7850561981058362, + "grad_norm": 0.008618000894784927, + "learning_rate": 2.1587489112906703e-05, + "loss": 0.17306852340698242, + "step": 182860 + }, + { + "epoch": 0.7850991301958562, + "grad_norm": 3.9255237579345703, + "learning_rate": 2.158317739278908e-05, + "loss": 0.20480165481567383, + "step": 182870 + }, + { + "epoch": 0.7851420622858762, + "grad_norm": 0.004135298077017069, + "learning_rate": 2.1578865672671454e-05, + "loss": 0.37045023441314695, + "step": 182880 + }, + { + "epoch": 0.7851849943758962, + "grad_norm": 0.5705690979957581, + "learning_rate": 2.157455395255383e-05, + "loss": 0.2988840818405151, + "step": 182890 + }, + { + "epoch": 0.7852279264659162, + "grad_norm": 0.005285890772938728, + "learning_rate": 2.157024223243621e-05, + "loss": 0.12949843406677247, + "step": 182900 + }, + { + "epoch": 0.7852708585559363, + "grad_norm": 0.09740712493658066, + "learning_rate": 2.1565930512318586e-05, + "loss": 0.05606417059898376, + "step": 182910 + }, + { + "epoch": 0.7853137906459562, + "grad_norm": 0.09031404554843903, + "learning_rate": 2.156161879220096e-05, + "loss": 0.003790769726037979, + "step": 182920 + }, + { + "epoch": 0.7853567227359762, + "grad_norm": 0.2656407356262207, + "learning_rate": 2.1557307072083338e-05, + "loss": 0.22708549499511718, + "step": 182930 + }, + { + "epoch": 0.7853996548259963, + "grad_norm": 0.6343486309051514, + "learning_rate": 2.155299535196571e-05, + "loss": 0.14608529806137086, + "step": 182940 + }, + { + "epoch": 0.7854425869160162, + "grad_norm": 0.0007862726924940944, + "learning_rate": 2.1548683631848092e-05, + "loss": 0.08486260771751404, + "step": 182950 + }, + { + "epoch": 0.7854855190060362, + "grad_norm": 0.04844307526946068, + "learning_rate": 2.1544371911730466e-05, + "loss": 0.33439657688140867, + "step": 182960 + }, + { + "epoch": 0.7855284510960563, + "grad_norm": 0.0017822891240939498, + "learning_rate": 2.1540060191612844e-05, + "loss": 0.04367310702800751, + "step": 182970 + }, + { + "epoch": 0.7855713831860762, + "grad_norm": 0.01023387722671032, + "learning_rate": 2.153574847149522e-05, + "loss": 0.30081963539123535, + "step": 182980 + }, + { + "epoch": 0.7856143152760963, + "grad_norm": 0.15752474963665009, + "learning_rate": 2.1531436751377595e-05, + "loss": 0.10231451988220215, + "step": 182990 + }, + { + "epoch": 0.7856572473661163, + "grad_norm": 0.0028393957763910294, + "learning_rate": 2.1527125031259972e-05, + "loss": 0.18005791902542115, + "step": 183000 + }, + { + "epoch": 0.7856572473661163, + "eval_loss": 0.3797743618488312, + "eval_runtime": 27.4618, + "eval_samples_per_second": 3.641, + "eval_steps_per_second": 3.641, + "step": 183000 + }, + { + "epoch": 0.7857001794561362, + "grad_norm": 0.007138405926525593, + "learning_rate": 2.152281331114235e-05, + "loss": 0.2358877420425415, + "step": 183010 + }, + { + "epoch": 0.7857431115461563, + "grad_norm": 0.0003113711136393249, + "learning_rate": 2.1518501591024727e-05, + "loss": 0.1500988006591797, + "step": 183020 + }, + { + "epoch": 0.7857860436361763, + "grad_norm": 0.007665345445275307, + "learning_rate": 2.15141898709071e-05, + "loss": 0.14550169706344604, + "step": 183030 + }, + { + "epoch": 0.7858289757261963, + "grad_norm": 0.008687633089721203, + "learning_rate": 2.1509878150789478e-05, + "loss": 0.2093245267868042, + "step": 183040 + }, + { + "epoch": 0.7858719078162163, + "grad_norm": 0.0004379673337098211, + "learning_rate": 2.1505566430671852e-05, + "loss": 0.21798477172851563, + "step": 183050 + }, + { + "epoch": 0.7859148399062363, + "grad_norm": 0.05952138453722, + "learning_rate": 2.150125471055423e-05, + "loss": 0.2020576000213623, + "step": 183060 + }, + { + "epoch": 0.7859577719962563, + "grad_norm": 3.7258222103118896, + "learning_rate": 2.1496942990436607e-05, + "loss": 0.3716374635696411, + "step": 183070 + }, + { + "epoch": 0.7860007040862763, + "grad_norm": 0.0039495136588811874, + "learning_rate": 2.1492631270318984e-05, + "loss": 0.1307750701904297, + "step": 183080 + }, + { + "epoch": 0.7860436361762964, + "grad_norm": 0.09609965980052948, + "learning_rate": 2.1488319550201358e-05, + "loss": 0.24323813915252684, + "step": 183090 + }, + { + "epoch": 0.7860865682663163, + "grad_norm": 0.009046729654073715, + "learning_rate": 2.1484007830083735e-05, + "loss": 0.10413626432418824, + "step": 183100 + }, + { + "epoch": 0.7861295003563363, + "grad_norm": 0.0018896028632298112, + "learning_rate": 2.147969610996611e-05, + "loss": 0.27177743911743163, + "step": 183110 + }, + { + "epoch": 0.7861724324463564, + "grad_norm": 3.522463321685791, + "learning_rate": 2.1475384389848487e-05, + "loss": 0.24144039154052735, + "step": 183120 + }, + { + "epoch": 0.7862153645363763, + "grad_norm": 11.466240882873535, + "learning_rate": 2.1471072669730864e-05, + "loss": 0.33338441848754885, + "step": 183130 + }, + { + "epoch": 0.7862582966263963, + "grad_norm": 2.057114362716675, + "learning_rate": 2.146676094961324e-05, + "loss": 0.45223298072814944, + "step": 183140 + }, + { + "epoch": 0.7863012287164164, + "grad_norm": 0.05960441380739212, + "learning_rate": 2.1462449229495615e-05, + "loss": 0.22052621841430664, + "step": 183150 + }, + { + "epoch": 0.7863441608064364, + "grad_norm": 1.7557612657546997, + "learning_rate": 2.1458137509377993e-05, + "loss": 0.2731237173080444, + "step": 183160 + }, + { + "epoch": 0.7863870928964564, + "grad_norm": 1.4419736862182617, + "learning_rate": 2.1453825789260367e-05, + "loss": 0.11441911458969116, + "step": 183170 + }, + { + "epoch": 0.7864300249864764, + "grad_norm": 0.00411709351465106, + "learning_rate": 2.1449514069142744e-05, + "loss": 0.033696231245994565, + "step": 183180 + }, + { + "epoch": 0.7864729570764964, + "grad_norm": 0.04816296696662903, + "learning_rate": 2.144520234902512e-05, + "loss": 0.16660306453704835, + "step": 183190 + }, + { + "epoch": 0.7865158891665164, + "grad_norm": 0.9707366228103638, + "learning_rate": 2.14408906289075e-05, + "loss": 0.24993407726287842, + "step": 183200 + }, + { + "epoch": 0.7865588212565364, + "grad_norm": 0.6969446539878845, + "learning_rate": 2.1436578908789873e-05, + "loss": 0.13606752157211305, + "step": 183210 + }, + { + "epoch": 0.7866017533465565, + "grad_norm": 0.27022868394851685, + "learning_rate": 2.143226718867225e-05, + "loss": 0.13606159687042235, + "step": 183220 + }, + { + "epoch": 0.7866446854365764, + "grad_norm": 1.8256360292434692, + "learning_rate": 2.1427955468554624e-05, + "loss": 0.17958106994628906, + "step": 183230 + }, + { + "epoch": 0.7866876175265964, + "grad_norm": 0.04855459928512573, + "learning_rate": 2.1423643748437e-05, + "loss": 0.1294558048248291, + "step": 183240 + }, + { + "epoch": 0.7867305496166165, + "grad_norm": 0.9606425166130066, + "learning_rate": 2.141933202831938e-05, + "loss": 0.21541991233825683, + "step": 183250 + }, + { + "epoch": 0.7867734817066364, + "grad_norm": 0.0012368131428956985, + "learning_rate": 2.1415020308201756e-05, + "loss": 0.09298962354660034, + "step": 183260 + }, + { + "epoch": 0.7868164137966565, + "grad_norm": 0.004385500680655241, + "learning_rate": 2.141070858808413e-05, + "loss": 0.11553781032562256, + "step": 183270 + }, + { + "epoch": 0.7868593458866765, + "grad_norm": 0.003434704151004553, + "learning_rate": 2.1406396867966507e-05, + "loss": 0.16233421564102174, + "step": 183280 + }, + { + "epoch": 0.7869022779766964, + "grad_norm": 0.028390705585479736, + "learning_rate": 2.140208514784888e-05, + "loss": 0.19993984699249268, + "step": 183290 + }, + { + "epoch": 0.7869452100667165, + "grad_norm": 4.946071624755859, + "learning_rate": 2.139777342773126e-05, + "loss": 0.1852961778640747, + "step": 183300 + }, + { + "epoch": 0.7869881421567365, + "grad_norm": 0.0032075869385153055, + "learning_rate": 2.1393461707613636e-05, + "loss": 0.1300378918647766, + "step": 183310 + }, + { + "epoch": 0.7870310742467564, + "grad_norm": 0.0008391632582060993, + "learning_rate": 2.1389149987496013e-05, + "loss": 0.19580576419830323, + "step": 183320 + }, + { + "epoch": 0.7870740063367765, + "grad_norm": 0.006768613122403622, + "learning_rate": 2.1384838267378387e-05, + "loss": 0.3424041271209717, + "step": 183330 + }, + { + "epoch": 0.7871169384267965, + "grad_norm": 2.0839383602142334, + "learning_rate": 2.1380526547260765e-05, + "loss": 0.1107265830039978, + "step": 183340 + }, + { + "epoch": 0.7871598705168165, + "grad_norm": 1.0905482769012451, + "learning_rate": 2.1376214827143142e-05, + "loss": 0.2123729705810547, + "step": 183350 + }, + { + "epoch": 0.7872028026068365, + "grad_norm": 0.0029551531188189983, + "learning_rate": 2.137190310702552e-05, + "loss": 0.08570262193679809, + "step": 183360 + }, + { + "epoch": 0.7872457346968565, + "grad_norm": 2.3354883193969727, + "learning_rate": 2.1367591386907897e-05, + "loss": 0.3122261047363281, + "step": 183370 + }, + { + "epoch": 0.7872886667868765, + "grad_norm": 0.8970825672149658, + "learning_rate": 2.136327966679027e-05, + "loss": 0.1956690549850464, + "step": 183380 + }, + { + "epoch": 0.7873315988768965, + "grad_norm": 5.209467887878418, + "learning_rate": 2.1358967946672648e-05, + "loss": 0.22057158946990968, + "step": 183390 + }, + { + "epoch": 0.7873745309669166, + "grad_norm": 0.0026834437157958746, + "learning_rate": 2.1354656226555022e-05, + "loss": 0.13024570941925048, + "step": 183400 + }, + { + "epoch": 0.7874174630569365, + "grad_norm": 0.0007249237387441099, + "learning_rate": 2.13503445064374e-05, + "loss": 0.08153098225593566, + "step": 183410 + }, + { + "epoch": 0.7874603951469565, + "grad_norm": 1.7533392906188965, + "learning_rate": 2.1346032786319777e-05, + "loss": 0.275898814201355, + "step": 183420 + }, + { + "epoch": 0.7875033272369766, + "grad_norm": 1.7578963041305542, + "learning_rate": 2.1341721066202154e-05, + "loss": 0.3079784393310547, + "step": 183430 + }, + { + "epoch": 0.7875462593269965, + "grad_norm": 0.024534443393349648, + "learning_rate": 2.1337409346084528e-05, + "loss": 0.216939377784729, + "step": 183440 + }, + { + "epoch": 0.7875891914170166, + "grad_norm": 0.002587397349998355, + "learning_rate": 2.1333097625966905e-05, + "loss": 0.11361846923828126, + "step": 183450 + }, + { + "epoch": 0.7876321235070366, + "grad_norm": 0.0038166409358382225, + "learning_rate": 2.132878590584928e-05, + "loss": 0.07297162413597107, + "step": 183460 + }, + { + "epoch": 0.7876750555970565, + "grad_norm": 0.0017480719834566116, + "learning_rate": 2.1324474185731656e-05, + "loss": 0.11760724782943725, + "step": 183470 + }, + { + "epoch": 0.7877179876870766, + "grad_norm": 93.22285461425781, + "learning_rate": 2.1320162465614034e-05, + "loss": 0.36112000942230227, + "step": 183480 + }, + { + "epoch": 0.7877609197770966, + "grad_norm": 0.9116643667221069, + "learning_rate": 2.131585074549641e-05, + "loss": 0.08869249820709228, + "step": 183490 + }, + { + "epoch": 0.7878038518671165, + "grad_norm": 1.2129038572311401, + "learning_rate": 2.1311539025378785e-05, + "loss": 0.2812972545623779, + "step": 183500 + }, + { + "epoch": 0.7878467839571366, + "grad_norm": 0.002802674425765872, + "learning_rate": 2.1307227305261162e-05, + "loss": 0.17642027139663696, + "step": 183510 + }, + { + "epoch": 0.7878897160471566, + "grad_norm": 0.13792453706264496, + "learning_rate": 2.1302915585143536e-05, + "loss": 0.43254504203796384, + "step": 183520 + }, + { + "epoch": 0.7879326481371766, + "grad_norm": 0.0027651230338960886, + "learning_rate": 2.1298603865025914e-05, + "loss": 0.09086210131645203, + "step": 183530 + }, + { + "epoch": 0.7879755802271966, + "grad_norm": 0.8143436908721924, + "learning_rate": 2.129429214490829e-05, + "loss": 0.22465338706970214, + "step": 183540 + }, + { + "epoch": 0.7880185123172166, + "grad_norm": 1.933145523071289, + "learning_rate": 2.128998042479067e-05, + "loss": 0.2547163486480713, + "step": 183550 + }, + { + "epoch": 0.7880614444072366, + "grad_norm": 7.512142181396484, + "learning_rate": 2.1285668704673042e-05, + "loss": 0.5128835201263428, + "step": 183560 + }, + { + "epoch": 0.7881043764972566, + "grad_norm": 5.832769870758057, + "learning_rate": 2.128135698455542e-05, + "loss": 0.16420665979385377, + "step": 183570 + }, + { + "epoch": 0.7881473085872767, + "grad_norm": 1.8669154644012451, + "learning_rate": 2.1277045264437794e-05, + "loss": 0.41615657806396483, + "step": 183580 + }, + { + "epoch": 0.7881902406772967, + "grad_norm": 0.020469048991799355, + "learning_rate": 2.127273354432017e-05, + "loss": 0.14705488681793213, + "step": 183590 + }, + { + "epoch": 0.7882331727673166, + "grad_norm": 1.6848324537277222, + "learning_rate": 2.126842182420255e-05, + "loss": 0.34337496757507324, + "step": 183600 + }, + { + "epoch": 0.7882761048573367, + "grad_norm": 0.003964392002671957, + "learning_rate": 2.1264110104084926e-05, + "loss": 0.2729069709777832, + "step": 183610 + }, + { + "epoch": 0.7883190369473567, + "grad_norm": 2.254751205444336, + "learning_rate": 2.12597983839673e-05, + "loss": 0.2700617790222168, + "step": 183620 + }, + { + "epoch": 0.7883619690373767, + "grad_norm": 0.0009603060898371041, + "learning_rate": 2.1255486663849677e-05, + "loss": 0.1181606411933899, + "step": 183630 + }, + { + "epoch": 0.7884049011273967, + "grad_norm": 0.055945176631212234, + "learning_rate": 2.125117494373205e-05, + "loss": 0.20723607540130615, + "step": 183640 + }, + { + "epoch": 0.7884478332174167, + "grad_norm": 0.18929432332515717, + "learning_rate": 2.1246863223614428e-05, + "loss": 0.05493208169937134, + "step": 183650 + }, + { + "epoch": 0.7884907653074367, + "grad_norm": 0.05633874610066414, + "learning_rate": 2.1242551503496806e-05, + "loss": 0.1731419086456299, + "step": 183660 + }, + { + "epoch": 0.7885336973974567, + "grad_norm": 0.02658155746757984, + "learning_rate": 2.1238239783379183e-05, + "loss": 0.22599170207977295, + "step": 183670 + }, + { + "epoch": 0.7885766294874768, + "grad_norm": 0.3270963132381439, + "learning_rate": 2.1233928063261557e-05, + "loss": 0.09019438028335572, + "step": 183680 + }, + { + "epoch": 0.7886195615774967, + "grad_norm": 0.005451427306979895, + "learning_rate": 2.1229616343143934e-05, + "loss": 0.11634416580200195, + "step": 183690 + }, + { + "epoch": 0.7886624936675167, + "grad_norm": 1.4958854913711548, + "learning_rate": 2.122530462302631e-05, + "loss": 0.192735755443573, + "step": 183700 + }, + { + "epoch": 0.7887054257575368, + "grad_norm": 0.0020073908381164074, + "learning_rate": 2.122099290290869e-05, + "loss": 0.05960498452186584, + "step": 183710 + }, + { + "epoch": 0.7887483578475567, + "grad_norm": 1.2685023546218872, + "learning_rate": 2.1216681182791066e-05, + "loss": 0.23755450248718263, + "step": 183720 + }, + { + "epoch": 0.7887912899375767, + "grad_norm": 0.039844296872615814, + "learning_rate": 2.121236946267344e-05, + "loss": 0.20982720851898193, + "step": 183730 + }, + { + "epoch": 0.7888342220275968, + "grad_norm": 0.04778163507580757, + "learning_rate": 2.1208057742555818e-05, + "loss": 0.17580852508544922, + "step": 183740 + }, + { + "epoch": 0.7888771541176167, + "grad_norm": 1.0229040384292603, + "learning_rate": 2.120374602243819e-05, + "loss": 0.30660076141357423, + "step": 183750 + }, + { + "epoch": 0.7889200862076368, + "grad_norm": 0.021670211106538773, + "learning_rate": 2.119943430232057e-05, + "loss": 0.08490864038467408, + "step": 183760 + }, + { + "epoch": 0.7889630182976568, + "grad_norm": 7.117254426702857e-05, + "learning_rate": 2.1195122582202946e-05, + "loss": 0.10266214609146118, + "step": 183770 + }, + { + "epoch": 0.7890059503876767, + "grad_norm": 0.0019227155717089772, + "learning_rate": 2.1190810862085324e-05, + "loss": 0.008311575651168824, + "step": 183780 + }, + { + "epoch": 0.7890488824776968, + "grad_norm": 8.819829940795898, + "learning_rate": 2.1186499141967698e-05, + "loss": 0.23230252265930176, + "step": 183790 + }, + { + "epoch": 0.7890918145677168, + "grad_norm": 1.0761563777923584, + "learning_rate": 2.1182187421850075e-05, + "loss": 0.26760454177856446, + "step": 183800 + }, + { + "epoch": 0.7891347466577368, + "grad_norm": 1.6653600931167603, + "learning_rate": 2.117787570173245e-05, + "loss": 0.2435081720352173, + "step": 183810 + }, + { + "epoch": 0.7891776787477568, + "grad_norm": 3.4551095962524414, + "learning_rate": 2.1173563981614826e-05, + "loss": 0.425843334197998, + "step": 183820 + }, + { + "epoch": 0.7892206108377768, + "grad_norm": 1.0570393800735474, + "learning_rate": 2.1169252261497204e-05, + "loss": 0.23131773471832276, + "step": 183830 + }, + { + "epoch": 0.7892635429277968, + "grad_norm": 0.0038749901577830315, + "learning_rate": 2.116494054137958e-05, + "loss": 0.26228177547454834, + "step": 183840 + }, + { + "epoch": 0.7893064750178168, + "grad_norm": 0.29532575607299805, + "learning_rate": 2.1160628821261955e-05, + "loss": 0.28396048545837405, + "step": 183850 + }, + { + "epoch": 0.7893494071078369, + "grad_norm": 0.40147167444229126, + "learning_rate": 2.1156317101144332e-05, + "loss": 0.16280757188796996, + "step": 183860 + }, + { + "epoch": 0.7893923391978568, + "grad_norm": 0.008659133687615395, + "learning_rate": 2.1152005381026706e-05, + "loss": 0.06703677773475647, + "step": 183870 + }, + { + "epoch": 0.7894352712878768, + "grad_norm": 5.348660469055176, + "learning_rate": 2.1147693660909083e-05, + "loss": 0.14614660739898683, + "step": 183880 + }, + { + "epoch": 0.7894782033778969, + "grad_norm": 2.7472760677337646, + "learning_rate": 2.114338194079146e-05, + "loss": 0.27347257137298586, + "step": 183890 + }, + { + "epoch": 0.7895211354679168, + "grad_norm": 1.1811401844024658, + "learning_rate": 2.1139070220673838e-05, + "loss": 0.21110949516296387, + "step": 183900 + }, + { + "epoch": 0.7895640675579368, + "grad_norm": 0.01332367118448019, + "learning_rate": 2.1134758500556212e-05, + "loss": 0.03304066956043243, + "step": 183910 + }, + { + "epoch": 0.7896069996479569, + "grad_norm": 0.013469953089952469, + "learning_rate": 2.113044678043859e-05, + "loss": 0.1550193428993225, + "step": 183920 + }, + { + "epoch": 0.7896499317379768, + "grad_norm": 0.001539850840345025, + "learning_rate": 2.1126135060320963e-05, + "loss": 0.10376846790313721, + "step": 183930 + }, + { + "epoch": 0.7896928638279969, + "grad_norm": 0.7918428182601929, + "learning_rate": 2.112182334020334e-05, + "loss": 0.1784311294555664, + "step": 183940 + }, + { + "epoch": 0.7897357959180169, + "grad_norm": 0.0009629686828702688, + "learning_rate": 2.1117511620085718e-05, + "loss": 0.1645200252532959, + "step": 183950 + }, + { + "epoch": 0.7897787280080368, + "grad_norm": 1.0543266534805298, + "learning_rate": 2.1113199899968095e-05, + "loss": 0.14095029830932618, + "step": 183960 + }, + { + "epoch": 0.7898216600980569, + "grad_norm": 4.017325401306152, + "learning_rate": 2.110888817985047e-05, + "loss": 0.24089696407318115, + "step": 183970 + }, + { + "epoch": 0.7898645921880769, + "grad_norm": 0.9489465355873108, + "learning_rate": 2.1104576459732847e-05, + "loss": 0.5302737712860107, + "step": 183980 + }, + { + "epoch": 0.7899075242780969, + "grad_norm": 0.004499488044530153, + "learning_rate": 2.110026473961522e-05, + "loss": 0.2945434808731079, + "step": 183990 + }, + { + "epoch": 0.7899504563681169, + "grad_norm": 0.016302023082971573, + "learning_rate": 2.1095953019497598e-05, + "loss": 0.19677678346633912, + "step": 184000 + }, + { + "epoch": 0.7899504563681169, + "eval_loss": 0.3860042691230774, + "eval_runtime": 27.41, + "eval_samples_per_second": 3.648, + "eval_steps_per_second": 3.648, + "step": 184000 + }, + { + "epoch": 0.7899933884581369, + "grad_norm": 0.002552248304709792, + "learning_rate": 2.1091641299379975e-05, + "loss": 0.1127932071685791, + "step": 184010 + }, + { + "epoch": 0.790036320548157, + "grad_norm": 2.2927207946777344, + "learning_rate": 2.1087329579262353e-05, + "loss": 0.3413818359375, + "step": 184020 + }, + { + "epoch": 0.7900792526381769, + "grad_norm": 2.3446779251098633, + "learning_rate": 2.1083017859144727e-05, + "loss": 0.1625274896621704, + "step": 184030 + }, + { + "epoch": 0.790122184728197, + "grad_norm": 0.10824047774076462, + "learning_rate": 2.1078706139027104e-05, + "loss": 0.20035688877105712, + "step": 184040 + }, + { + "epoch": 0.790165116818217, + "grad_norm": 0.07350625097751617, + "learning_rate": 2.1074394418909478e-05, + "loss": 0.26237874031066893, + "step": 184050 + }, + { + "epoch": 0.7902080489082369, + "grad_norm": 0.0038361994083970785, + "learning_rate": 2.107008269879186e-05, + "loss": 0.12209371328353882, + "step": 184060 + }, + { + "epoch": 0.790250980998257, + "grad_norm": 0.0032284085173159838, + "learning_rate": 2.1065770978674236e-05, + "loss": 0.20432050228118898, + "step": 184070 + }, + { + "epoch": 0.790293913088277, + "grad_norm": 0.007364768069237471, + "learning_rate": 2.106145925855661e-05, + "loss": 0.05040127635002136, + "step": 184080 + }, + { + "epoch": 0.7903368451782969, + "grad_norm": 0.07383093982934952, + "learning_rate": 2.1057147538438987e-05, + "loss": 0.08585337996482849, + "step": 184090 + }, + { + "epoch": 0.790379777268317, + "grad_norm": 2.5571935176849365, + "learning_rate": 2.105283581832136e-05, + "loss": 0.13882994651794434, + "step": 184100 + }, + { + "epoch": 0.790422709358337, + "grad_norm": 0.917072594165802, + "learning_rate": 2.104852409820374e-05, + "loss": 0.4125373363494873, + "step": 184110 + }, + { + "epoch": 0.790465641448357, + "grad_norm": 0.0016729761846363544, + "learning_rate": 2.1044212378086116e-05, + "loss": 0.38989236354827883, + "step": 184120 + }, + { + "epoch": 0.790508573538377, + "grad_norm": 0.15144653618335724, + "learning_rate": 2.1039900657968493e-05, + "loss": 0.23984091281890868, + "step": 184130 + }, + { + "epoch": 0.790551505628397, + "grad_norm": 0.026266876608133316, + "learning_rate": 2.1035588937850867e-05, + "loss": 0.2721916437149048, + "step": 184140 + }, + { + "epoch": 0.790594437718417, + "grad_norm": 0.0340094156563282, + "learning_rate": 2.1031277217733245e-05, + "loss": 0.10305203199386596, + "step": 184150 + }, + { + "epoch": 0.790637369808437, + "grad_norm": 0.008664273656904697, + "learning_rate": 2.102696549761562e-05, + "loss": 0.2137474536895752, + "step": 184160 + }, + { + "epoch": 0.7906803018984571, + "grad_norm": 0.029949234798550606, + "learning_rate": 2.1022653777497996e-05, + "loss": 0.11488361358642578, + "step": 184170 + }, + { + "epoch": 0.790723233988477, + "grad_norm": 0.003333107102662325, + "learning_rate": 2.1018342057380373e-05, + "loss": 0.21800367832183837, + "step": 184180 + }, + { + "epoch": 0.790766166078497, + "grad_norm": 0.007137620355933905, + "learning_rate": 2.101403033726275e-05, + "loss": 0.2649306058883667, + "step": 184190 + }, + { + "epoch": 0.7908090981685171, + "grad_norm": 1.8278257846832275, + "learning_rate": 2.1009718617145125e-05, + "loss": 0.1403980016708374, + "step": 184200 + }, + { + "epoch": 0.790852030258537, + "grad_norm": 0.0005646710051223636, + "learning_rate": 2.1005406897027502e-05, + "loss": 0.04360083937644958, + "step": 184210 + }, + { + "epoch": 0.790894962348557, + "grad_norm": 0.802178680896759, + "learning_rate": 2.1001095176909876e-05, + "loss": 0.3105415105819702, + "step": 184220 + }, + { + "epoch": 0.7909378944385771, + "grad_norm": 1.269438624382019, + "learning_rate": 2.0996783456792253e-05, + "loss": 0.14836931228637695, + "step": 184230 + }, + { + "epoch": 0.790980826528597, + "grad_norm": 1.6588866710662842, + "learning_rate": 2.099247173667463e-05, + "loss": 0.5411814212799072, + "step": 184240 + }, + { + "epoch": 0.7910237586186171, + "grad_norm": 3.9255900382995605, + "learning_rate": 2.0988160016557008e-05, + "loss": 0.03738404214382172, + "step": 184250 + }, + { + "epoch": 0.7910666907086371, + "grad_norm": 2.704301595687866, + "learning_rate": 2.0983848296439382e-05, + "loss": 0.37105047702789307, + "step": 184260 + }, + { + "epoch": 0.791109622798657, + "grad_norm": 0.024229900911450386, + "learning_rate": 2.097953657632176e-05, + "loss": 0.04928521811962128, + "step": 184270 + }, + { + "epoch": 0.7911525548886771, + "grad_norm": 0.053558625280857086, + "learning_rate": 2.0975224856204133e-05, + "loss": 0.08247345685958862, + "step": 184280 + }, + { + "epoch": 0.7911954869786971, + "grad_norm": 0.04510665684938431, + "learning_rate": 2.097091313608651e-05, + "loss": 0.08157772421836854, + "step": 184290 + }, + { + "epoch": 0.7912384190687171, + "grad_norm": 0.0015568624949082732, + "learning_rate": 2.0966601415968888e-05, + "loss": 0.07714862227439881, + "step": 184300 + }, + { + "epoch": 0.7912813511587371, + "grad_norm": 0.010974083095788956, + "learning_rate": 2.0962289695851265e-05, + "loss": 0.1628153681755066, + "step": 184310 + }, + { + "epoch": 0.7913242832487571, + "grad_norm": 1.7074087858200073, + "learning_rate": 2.095797797573364e-05, + "loss": 0.1585795760154724, + "step": 184320 + }, + { + "epoch": 0.7913672153387771, + "grad_norm": 0.026802221313118935, + "learning_rate": 2.0953666255616016e-05, + "loss": 0.08797412514686584, + "step": 184330 + }, + { + "epoch": 0.7914101474287971, + "grad_norm": 1.5584458112716675, + "learning_rate": 2.094935453549839e-05, + "loss": 0.13183501958847046, + "step": 184340 + }, + { + "epoch": 0.7914530795188172, + "grad_norm": 0.004854326136410236, + "learning_rate": 2.0945042815380768e-05, + "loss": 0.2509061574935913, + "step": 184350 + }, + { + "epoch": 0.7914960116088371, + "grad_norm": 2.4723963737487793, + "learning_rate": 2.0940731095263145e-05, + "loss": 0.2575789451599121, + "step": 184360 + }, + { + "epoch": 0.7915389436988571, + "grad_norm": 10.56302261352539, + "learning_rate": 2.0936419375145522e-05, + "loss": 0.27280776500701903, + "step": 184370 + }, + { + "epoch": 0.7915818757888772, + "grad_norm": 0.034737855195999146, + "learning_rate": 2.0932107655027896e-05, + "loss": 0.020511318743228913, + "step": 184380 + }, + { + "epoch": 0.7916248078788971, + "grad_norm": 8.125020027160645, + "learning_rate": 2.0927795934910274e-05, + "loss": 0.5077376365661621, + "step": 184390 + }, + { + "epoch": 0.7916677399689172, + "grad_norm": 0.2507530450820923, + "learning_rate": 2.0923484214792648e-05, + "loss": 0.15186481475830077, + "step": 184400 + }, + { + "epoch": 0.7917106720589372, + "grad_norm": 0.10388506948947906, + "learning_rate": 2.0919172494675025e-05, + "loss": 0.21182787418365479, + "step": 184410 + }, + { + "epoch": 0.7917536041489571, + "grad_norm": 0.005574772600084543, + "learning_rate": 2.0914860774557402e-05, + "loss": 0.3856783628463745, + "step": 184420 + }, + { + "epoch": 0.7917965362389772, + "grad_norm": 1.1669795513153076, + "learning_rate": 2.091054905443978e-05, + "loss": 0.29858415126800536, + "step": 184430 + }, + { + "epoch": 0.7918394683289972, + "grad_norm": 3.4067726135253906, + "learning_rate": 2.0906237334322157e-05, + "loss": 0.2626990079879761, + "step": 184440 + }, + { + "epoch": 0.7918824004190173, + "grad_norm": 0.049436818808317184, + "learning_rate": 2.090192561420453e-05, + "loss": 0.09174957871437073, + "step": 184450 + }, + { + "epoch": 0.7919253325090372, + "grad_norm": 0.007332088891416788, + "learning_rate": 2.089761389408691e-05, + "loss": 0.17745136022567748, + "step": 184460 + }, + { + "epoch": 0.7919682645990572, + "grad_norm": 1.9990302324295044, + "learning_rate": 2.0893302173969286e-05, + "loss": 0.3661717414855957, + "step": 184470 + }, + { + "epoch": 0.7920111966890773, + "grad_norm": 0.04143689200282097, + "learning_rate": 2.0888990453851663e-05, + "loss": 0.31041841506958007, + "step": 184480 + }, + { + "epoch": 0.7920541287790972, + "grad_norm": 0.012891510501503944, + "learning_rate": 2.0884678733734037e-05, + "loss": 0.3005017042160034, + "step": 184490 + }, + { + "epoch": 0.7920970608691172, + "grad_norm": 4.862680912017822, + "learning_rate": 2.0880367013616414e-05, + "loss": 0.12409036159515381, + "step": 184500 + }, + { + "epoch": 0.7921399929591373, + "grad_norm": 3.8281655311584473, + "learning_rate": 2.087605529349879e-05, + "loss": 0.41316800117492675, + "step": 184510 + }, + { + "epoch": 0.7921829250491572, + "grad_norm": 0.008535422384738922, + "learning_rate": 2.0871743573381166e-05, + "loss": 0.27865374088287354, + "step": 184520 + }, + { + "epoch": 0.7922258571391773, + "grad_norm": 0.008231055922806263, + "learning_rate": 2.0867431853263543e-05, + "loss": 0.3190565586090088, + "step": 184530 + }, + { + "epoch": 0.7922687892291973, + "grad_norm": 1.9232498407363892, + "learning_rate": 2.086312013314592e-05, + "loss": 0.1800924062728882, + "step": 184540 + }, + { + "epoch": 0.7923117213192172, + "grad_norm": 0.0029449264984577894, + "learning_rate": 2.0858808413028294e-05, + "loss": 0.28110227584838865, + "step": 184550 + }, + { + "epoch": 0.7923546534092373, + "grad_norm": 0.4471541941165924, + "learning_rate": 2.085449669291067e-05, + "loss": 0.11063566207885742, + "step": 184560 + }, + { + "epoch": 0.7923975854992573, + "grad_norm": 0.8572136163711548, + "learning_rate": 2.0850184972793046e-05, + "loss": 0.1575225830078125, + "step": 184570 + }, + { + "epoch": 0.7924405175892772, + "grad_norm": 0.008033351972699165, + "learning_rate": 2.0845873252675423e-05, + "loss": 0.1626001477241516, + "step": 184580 + }, + { + "epoch": 0.7924834496792973, + "grad_norm": 2.196404218673706, + "learning_rate": 2.08415615325578e-05, + "loss": 0.30686063766479493, + "step": 184590 + }, + { + "epoch": 0.7925263817693173, + "grad_norm": 0.0012454588431864977, + "learning_rate": 2.0837249812440178e-05, + "loss": 0.14365462064743043, + "step": 184600 + }, + { + "epoch": 0.7925693138593373, + "grad_norm": 0.08527855575084686, + "learning_rate": 2.083293809232255e-05, + "loss": 0.09844621419906616, + "step": 184610 + }, + { + "epoch": 0.7926122459493573, + "grad_norm": 0.22390778362751007, + "learning_rate": 2.082862637220493e-05, + "loss": 0.11322391033172607, + "step": 184620 + }, + { + "epoch": 0.7926551780393774, + "grad_norm": 1.483320951461792, + "learning_rate": 2.0824314652087303e-05, + "loss": 0.2445986747741699, + "step": 184630 + }, + { + "epoch": 0.7926981101293973, + "grad_norm": 0.9645312428474426, + "learning_rate": 2.082000293196968e-05, + "loss": 0.21833391189575196, + "step": 184640 + }, + { + "epoch": 0.7927410422194173, + "grad_norm": 3.154022455215454, + "learning_rate": 2.0815691211852058e-05, + "loss": 0.139963960647583, + "step": 184650 + }, + { + "epoch": 0.7927839743094374, + "grad_norm": 0.00142047053668648, + "learning_rate": 2.0811379491734435e-05, + "loss": 0.16589174270629883, + "step": 184660 + }, + { + "epoch": 0.7928269063994573, + "grad_norm": 0.013610593043267727, + "learning_rate": 2.080706777161681e-05, + "loss": 0.040987375378608706, + "step": 184670 + }, + { + "epoch": 0.7928698384894773, + "grad_norm": 0.0035771746188402176, + "learning_rate": 2.0802756051499186e-05, + "loss": 0.175575852394104, + "step": 184680 + }, + { + "epoch": 0.7929127705794974, + "grad_norm": 24.877038955688477, + "learning_rate": 2.079844433138156e-05, + "loss": 0.01081414669752121, + "step": 184690 + }, + { + "epoch": 0.7929557026695173, + "grad_norm": 0.0351494662463665, + "learning_rate": 2.0794132611263937e-05, + "loss": 0.058581531047821045, + "step": 184700 + }, + { + "epoch": 0.7929986347595374, + "grad_norm": 0.016925616189837456, + "learning_rate": 2.0789820891146315e-05, + "loss": 0.22958390712738036, + "step": 184710 + }, + { + "epoch": 0.7930415668495574, + "grad_norm": 1.6746351718902588, + "learning_rate": 2.0785509171028692e-05, + "loss": 0.2561425924301147, + "step": 184720 + }, + { + "epoch": 0.7930844989395773, + "grad_norm": 0.0013505280949175358, + "learning_rate": 2.0781197450911066e-05, + "loss": 0.09878464341163636, + "step": 184730 + }, + { + "epoch": 0.7931274310295974, + "grad_norm": 0.46645739674568176, + "learning_rate": 2.0776885730793443e-05, + "loss": 0.16808898448944093, + "step": 184740 + }, + { + "epoch": 0.7931703631196174, + "grad_norm": 0.3912404775619507, + "learning_rate": 2.0772574010675817e-05, + "loss": 0.27826414108276365, + "step": 184750 + }, + { + "epoch": 0.7932132952096373, + "grad_norm": 0.05493396148085594, + "learning_rate": 2.0768262290558195e-05, + "loss": 0.11417219638824463, + "step": 184760 + }, + { + "epoch": 0.7932562272996574, + "grad_norm": 3.7654097080230713, + "learning_rate": 2.0763950570440572e-05, + "loss": 0.10978877544403076, + "step": 184770 + }, + { + "epoch": 0.7932991593896774, + "grad_norm": 0.39600008726119995, + "learning_rate": 2.075963885032295e-05, + "loss": 0.08345726132392883, + "step": 184780 + }, + { + "epoch": 0.7933420914796974, + "grad_norm": 1.5910329818725586, + "learning_rate": 2.0755327130205327e-05, + "loss": 0.1979695200920105, + "step": 184790 + }, + { + "epoch": 0.7933850235697174, + "grad_norm": 0.007868641056120396, + "learning_rate": 2.07510154100877e-05, + "loss": 0.16176763772964478, + "step": 184800 + }, + { + "epoch": 0.7934279556597374, + "grad_norm": 0.05180217698216438, + "learning_rate": 2.0746703689970078e-05, + "loss": 0.5233680725097656, + "step": 184810 + }, + { + "epoch": 0.7934708877497574, + "grad_norm": 0.008714994415640831, + "learning_rate": 2.0742391969852455e-05, + "loss": 0.08109192252159118, + "step": 184820 + }, + { + "epoch": 0.7935138198397774, + "grad_norm": 0.012277199886739254, + "learning_rate": 2.0738080249734833e-05, + "loss": 0.30088798999786376, + "step": 184830 + }, + { + "epoch": 0.7935567519297975, + "grad_norm": 0.17794494330883026, + "learning_rate": 2.0733768529617207e-05, + "loss": 0.15534228086471558, + "step": 184840 + }, + { + "epoch": 0.7935996840198174, + "grad_norm": 0.005018056370317936, + "learning_rate": 2.0729456809499584e-05, + "loss": 0.06532618999481202, + "step": 184850 + }, + { + "epoch": 0.7936426161098374, + "grad_norm": 0.010500622913241386, + "learning_rate": 2.0725145089381958e-05, + "loss": 0.2514193534851074, + "step": 184860 + }, + { + "epoch": 0.7936855481998575, + "grad_norm": 11.70617389678955, + "learning_rate": 2.0720833369264335e-05, + "loss": 0.19119045734405518, + "step": 184870 + }, + { + "epoch": 0.7937284802898775, + "grad_norm": 0.007109550293534994, + "learning_rate": 2.0716521649146713e-05, + "loss": 0.037236130237579344, + "step": 184880 + }, + { + "epoch": 0.7937714123798975, + "grad_norm": 12.96017837524414, + "learning_rate": 2.071220992902909e-05, + "loss": 0.19494185447692872, + "step": 184890 + }, + { + "epoch": 0.7938143444699175, + "grad_norm": 0.003496050601825118, + "learning_rate": 2.0707898208911464e-05, + "loss": 0.16270724534988404, + "step": 184900 + }, + { + "epoch": 0.7938572765599375, + "grad_norm": 0.16476179659366608, + "learning_rate": 2.070358648879384e-05, + "loss": 0.2962705850601196, + "step": 184910 + }, + { + "epoch": 0.7939002086499575, + "grad_norm": 2.141551971435547, + "learning_rate": 2.0699274768676215e-05, + "loss": 0.2920737981796265, + "step": 184920 + }, + { + "epoch": 0.7939431407399775, + "grad_norm": 0.0014055408537387848, + "learning_rate": 2.0694963048558593e-05, + "loss": 0.21264221668243408, + "step": 184930 + }, + { + "epoch": 0.7939860728299976, + "grad_norm": 0.008581023663282394, + "learning_rate": 2.069065132844097e-05, + "loss": 0.28402559757232665, + "step": 184940 + }, + { + "epoch": 0.7940290049200175, + "grad_norm": 0.15683463215827942, + "learning_rate": 2.0686339608323347e-05, + "loss": 0.3171902418136597, + "step": 184950 + }, + { + "epoch": 0.7940719370100375, + "grad_norm": 24.406858444213867, + "learning_rate": 2.068202788820572e-05, + "loss": 0.19824904203414917, + "step": 184960 + }, + { + "epoch": 0.7941148691000576, + "grad_norm": 0.009375318884849548, + "learning_rate": 2.06777161680881e-05, + "loss": 0.37527921199798586, + "step": 184970 + }, + { + "epoch": 0.7941578011900775, + "grad_norm": 0.0037970547564327717, + "learning_rate": 2.0673404447970473e-05, + "loss": 0.10710017681121826, + "step": 184980 + }, + { + "epoch": 0.7942007332800975, + "grad_norm": 6.955389499664307, + "learning_rate": 2.066909272785285e-05, + "loss": 0.22487101554870606, + "step": 184990 + }, + { + "epoch": 0.7942436653701176, + "grad_norm": 5.220800876617432, + "learning_rate": 2.0664781007735227e-05, + "loss": 0.08548426628112793, + "step": 185000 + }, + { + "epoch": 0.7942436653701176, + "eval_loss": 0.3935047388076782, + "eval_runtime": 27.4742, + "eval_samples_per_second": 3.64, + "eval_steps_per_second": 3.64, + "step": 185000 + }, + { + "epoch": 0.7942865974601375, + "grad_norm": 0.014097069390118122, + "learning_rate": 2.0660469287617605e-05, + "loss": 0.1502092957496643, + "step": 185010 + }, + { + "epoch": 0.7943295295501576, + "grad_norm": 3.219040632247925, + "learning_rate": 2.065615756749998e-05, + "loss": 0.1983199119567871, + "step": 185020 + }, + { + "epoch": 0.7943724616401776, + "grad_norm": 0.0013536418555304408, + "learning_rate": 2.0651845847382356e-05, + "loss": 0.11366220712661743, + "step": 185030 + }, + { + "epoch": 0.7944153937301975, + "grad_norm": 0.010713169351220131, + "learning_rate": 2.064753412726473e-05, + "loss": 0.2043184757232666, + "step": 185040 + }, + { + "epoch": 0.7944583258202176, + "grad_norm": 0.22274163365364075, + "learning_rate": 2.0643222407147107e-05, + "loss": 0.19996603727340698, + "step": 185050 + }, + { + "epoch": 0.7945012579102376, + "grad_norm": 0.02003192901611328, + "learning_rate": 2.0638910687029485e-05, + "loss": 0.23419508934020997, + "step": 185060 + }, + { + "epoch": 0.7945441900002576, + "grad_norm": 0.2899033725261688, + "learning_rate": 2.0634598966911862e-05, + "loss": 0.3487301588058472, + "step": 185070 + }, + { + "epoch": 0.7945871220902776, + "grad_norm": 0.01600206270813942, + "learning_rate": 2.0630287246794236e-05, + "loss": 0.2106010675430298, + "step": 185080 + }, + { + "epoch": 0.7946300541802976, + "grad_norm": 0.014160433784127235, + "learning_rate": 2.0625975526676613e-05, + "loss": 0.15078703165054322, + "step": 185090 + }, + { + "epoch": 0.7946729862703176, + "grad_norm": 2.848785877227783, + "learning_rate": 2.0621663806558987e-05, + "loss": 0.2850050926208496, + "step": 185100 + }, + { + "epoch": 0.7947159183603376, + "grad_norm": 0.011155441403388977, + "learning_rate": 2.0617352086441365e-05, + "loss": 0.1666271209716797, + "step": 185110 + }, + { + "epoch": 0.7947588504503577, + "grad_norm": 1.6798511743545532, + "learning_rate": 2.0613040366323742e-05, + "loss": 0.10716216564178467, + "step": 185120 + }, + { + "epoch": 0.7948017825403776, + "grad_norm": 1.9502683877944946, + "learning_rate": 2.060872864620612e-05, + "loss": 0.1362156629562378, + "step": 185130 + }, + { + "epoch": 0.7948447146303976, + "grad_norm": 0.025498563423752785, + "learning_rate": 2.0604416926088493e-05, + "loss": 0.2359158515930176, + "step": 185140 + }, + { + "epoch": 0.7948876467204177, + "grad_norm": 1.1539736986160278, + "learning_rate": 2.060010520597087e-05, + "loss": 0.1627048969268799, + "step": 185150 + }, + { + "epoch": 0.7949305788104376, + "grad_norm": 0.023429078981280327, + "learning_rate": 2.0595793485853248e-05, + "loss": 0.26216707229614256, + "step": 185160 + }, + { + "epoch": 0.7949735109004576, + "grad_norm": 0.0036843300331383944, + "learning_rate": 2.0591481765735622e-05, + "loss": 0.21936185359954835, + "step": 185170 + }, + { + "epoch": 0.7950164429904777, + "grad_norm": 0.011735700070858002, + "learning_rate": 2.0587170045618003e-05, + "loss": 0.21217544078826905, + "step": 185180 + }, + { + "epoch": 0.7950593750804976, + "grad_norm": 1.6435482501983643, + "learning_rate": 2.0582858325500376e-05, + "loss": 0.24956004619598388, + "step": 185190 + }, + { + "epoch": 0.7951023071705177, + "grad_norm": 0.008881629444658756, + "learning_rate": 2.0578546605382754e-05, + "loss": 0.18325870037078856, + "step": 185200 + }, + { + "epoch": 0.7951452392605377, + "grad_norm": 0.00953991711139679, + "learning_rate": 2.0574234885265128e-05, + "loss": 0.2940868854522705, + "step": 185210 + }, + { + "epoch": 0.7951881713505576, + "grad_norm": 0.005282262340188026, + "learning_rate": 2.0569923165147505e-05, + "loss": 0.11159526109695435, + "step": 185220 + }, + { + "epoch": 0.7952311034405777, + "grad_norm": 0.004339796025305986, + "learning_rate": 2.0565611445029882e-05, + "loss": 0.17705767154693602, + "step": 185230 + }, + { + "epoch": 0.7952740355305977, + "grad_norm": 0.23181919753551483, + "learning_rate": 2.056129972491226e-05, + "loss": 0.24660682678222656, + "step": 185240 + }, + { + "epoch": 0.7953169676206177, + "grad_norm": 1.4440089464187622, + "learning_rate": 2.0556988004794634e-05, + "loss": 0.30808000564575194, + "step": 185250 + }, + { + "epoch": 0.7953598997106377, + "grad_norm": 0.01273949071764946, + "learning_rate": 2.055267628467701e-05, + "loss": 0.02238984853029251, + "step": 185260 + }, + { + "epoch": 0.7954028318006577, + "grad_norm": 0.07017716765403748, + "learning_rate": 2.0548364564559385e-05, + "loss": 0.08472115397453309, + "step": 185270 + }, + { + "epoch": 0.7954457638906777, + "grad_norm": 0.0008051918121054769, + "learning_rate": 2.0544052844441762e-05, + "loss": 0.2069920063018799, + "step": 185280 + }, + { + "epoch": 0.7954886959806977, + "grad_norm": 0.15069304406642914, + "learning_rate": 2.053974112432414e-05, + "loss": 0.17520055770874024, + "step": 185290 + }, + { + "epoch": 0.7955316280707178, + "grad_norm": 7.917318820953369, + "learning_rate": 2.0535429404206517e-05, + "loss": 0.366973352432251, + "step": 185300 + }, + { + "epoch": 0.7955745601607378, + "grad_norm": 0.0027544747572392225, + "learning_rate": 2.053111768408889e-05, + "loss": 0.1479700803756714, + "step": 185310 + }, + { + "epoch": 0.7956174922507577, + "grad_norm": 0.7745925784111023, + "learning_rate": 2.052680596397127e-05, + "loss": 0.05805274844169617, + "step": 185320 + }, + { + "epoch": 0.7956604243407778, + "grad_norm": 1.471014380455017, + "learning_rate": 2.0522494243853642e-05, + "loss": 0.2297032117843628, + "step": 185330 + }, + { + "epoch": 0.7957033564307978, + "grad_norm": 0.4838404357433319, + "learning_rate": 2.051818252373602e-05, + "loss": 0.2138000726699829, + "step": 185340 + }, + { + "epoch": 0.7957462885208177, + "grad_norm": 1.3738874197006226, + "learning_rate": 2.0513870803618397e-05, + "loss": 0.1628502607345581, + "step": 185350 + }, + { + "epoch": 0.7957892206108378, + "grad_norm": 0.1752384454011917, + "learning_rate": 2.0509559083500774e-05, + "loss": 0.33957724571228026, + "step": 185360 + }, + { + "epoch": 0.7958321527008578, + "grad_norm": 0.13606612384319305, + "learning_rate": 2.050524736338315e-05, + "loss": 0.14007097482681274, + "step": 185370 + }, + { + "epoch": 0.7958750847908778, + "grad_norm": 1.536285638809204, + "learning_rate": 2.0500935643265526e-05, + "loss": 0.1621859550476074, + "step": 185380 + }, + { + "epoch": 0.7959180168808978, + "grad_norm": 0.005415162071585655, + "learning_rate": 2.04966239231479e-05, + "loss": 0.07260684967041016, + "step": 185390 + }, + { + "epoch": 0.7959609489709178, + "grad_norm": 3.600752353668213, + "learning_rate": 2.0492312203030277e-05, + "loss": 0.24089322090148926, + "step": 185400 + }, + { + "epoch": 0.7960038810609378, + "grad_norm": 0.05691635608673096, + "learning_rate": 2.0488000482912654e-05, + "loss": 0.13588875532150269, + "step": 185410 + }, + { + "epoch": 0.7960468131509578, + "grad_norm": 0.01960470899939537, + "learning_rate": 2.048368876279503e-05, + "loss": 0.1699157953262329, + "step": 185420 + }, + { + "epoch": 0.7960897452409779, + "grad_norm": 0.011590172536671162, + "learning_rate": 2.0479377042677406e-05, + "loss": 0.024132239818572997, + "step": 185430 + }, + { + "epoch": 0.7961326773309978, + "grad_norm": 0.003582247532904148, + "learning_rate": 2.0475065322559783e-05, + "loss": 0.25007989406585696, + "step": 185440 + }, + { + "epoch": 0.7961756094210178, + "grad_norm": 2.1678757667541504, + "learning_rate": 2.0470753602442157e-05, + "loss": 0.2906757116317749, + "step": 185450 + }, + { + "epoch": 0.7962185415110379, + "grad_norm": 0.019433652982115746, + "learning_rate": 2.0466441882324534e-05, + "loss": 0.05460849404335022, + "step": 185460 + }, + { + "epoch": 0.7962614736010578, + "grad_norm": 0.0070192208513617516, + "learning_rate": 2.046213016220691e-05, + "loss": 0.18471946716308593, + "step": 185470 + }, + { + "epoch": 0.7963044056910779, + "grad_norm": 3.4451112747192383, + "learning_rate": 2.045781844208929e-05, + "loss": 0.19607019424438477, + "step": 185480 + }, + { + "epoch": 0.7963473377810979, + "grad_norm": 0.08804819732904434, + "learning_rate": 2.0453506721971663e-05, + "loss": 0.1398802638053894, + "step": 185490 + }, + { + "epoch": 0.7963902698711178, + "grad_norm": 0.0010080928914248943, + "learning_rate": 2.044919500185404e-05, + "loss": 0.22251415252685547, + "step": 185500 + }, + { + "epoch": 0.7964332019611379, + "grad_norm": 0.05793009698390961, + "learning_rate": 2.0444883281736414e-05, + "loss": 0.12232472896575927, + "step": 185510 + }, + { + "epoch": 0.7964761340511579, + "grad_norm": 0.0047072130255401134, + "learning_rate": 2.044057156161879e-05, + "loss": 0.29519612789154054, + "step": 185520 + }, + { + "epoch": 0.7965190661411778, + "grad_norm": 3.4594709873199463, + "learning_rate": 2.0436259841501172e-05, + "loss": 0.23673882484436035, + "step": 185530 + }, + { + "epoch": 0.7965619982311979, + "grad_norm": 5.806293964385986, + "learning_rate": 2.0431948121383546e-05, + "loss": 0.2166839361190796, + "step": 185540 + }, + { + "epoch": 0.7966049303212179, + "grad_norm": 0.005207410082221031, + "learning_rate": 2.0427636401265924e-05, + "loss": 0.09798675179481506, + "step": 185550 + }, + { + "epoch": 0.7966478624112379, + "grad_norm": 0.0033032442443072796, + "learning_rate": 2.0423324681148298e-05, + "loss": 0.24011375904083251, + "step": 185560 + }, + { + "epoch": 0.7966907945012579, + "grad_norm": 0.04708894342184067, + "learning_rate": 2.0419012961030675e-05, + "loss": 0.15701087713241577, + "step": 185570 + }, + { + "epoch": 0.796733726591278, + "grad_norm": 2.4670138359069824, + "learning_rate": 2.0414701240913052e-05, + "loss": 0.061458778381347653, + "step": 185580 + }, + { + "epoch": 0.7967766586812979, + "grad_norm": 0.0011994473170489073, + "learning_rate": 2.041038952079543e-05, + "loss": 0.21192600727081298, + "step": 185590 + }, + { + "epoch": 0.7968195907713179, + "grad_norm": 7.147767543792725, + "learning_rate": 2.0406077800677803e-05, + "loss": 0.28967599868774413, + "step": 185600 + }, + { + "epoch": 0.796862522861338, + "grad_norm": 3.6588809490203857, + "learning_rate": 2.040176608056018e-05, + "loss": 0.2007124900817871, + "step": 185610 + }, + { + "epoch": 0.7969054549513579, + "grad_norm": 0.0008074496872723103, + "learning_rate": 2.0397454360442555e-05, + "loss": 0.27359185218811033, + "step": 185620 + }, + { + "epoch": 0.7969483870413779, + "grad_norm": 0.04204615205526352, + "learning_rate": 2.0393142640324932e-05, + "loss": 0.15062224864959717, + "step": 185630 + }, + { + "epoch": 0.796991319131398, + "grad_norm": 0.003354994347319007, + "learning_rate": 2.038883092020731e-05, + "loss": 0.16356064081192018, + "step": 185640 + }, + { + "epoch": 0.7970342512214179, + "grad_norm": 0.9174707531929016, + "learning_rate": 2.0384519200089687e-05, + "loss": 0.4482748508453369, + "step": 185650 + }, + { + "epoch": 0.797077183311438, + "grad_norm": 1.6391700506210327, + "learning_rate": 2.038020747997206e-05, + "loss": 0.2696021795272827, + "step": 185660 + }, + { + "epoch": 0.797120115401458, + "grad_norm": 0.0044967783614993095, + "learning_rate": 2.0375895759854438e-05, + "loss": 0.09318976402282715, + "step": 185670 + }, + { + "epoch": 0.7971630474914779, + "grad_norm": 2.8881003856658936, + "learning_rate": 2.0371584039736812e-05, + "loss": 0.1859677791595459, + "step": 185680 + }, + { + "epoch": 0.797205979581498, + "grad_norm": 2.529358386993408, + "learning_rate": 2.036727231961919e-05, + "loss": 0.05045361518859863, + "step": 185690 + }, + { + "epoch": 0.797248911671518, + "grad_norm": 2.6487114429473877, + "learning_rate": 2.0362960599501567e-05, + "loss": 0.23644933700561524, + "step": 185700 + }, + { + "epoch": 0.7972918437615379, + "grad_norm": 0.009652736596763134, + "learning_rate": 2.0358648879383944e-05, + "loss": 0.31275110244750975, + "step": 185710 + }, + { + "epoch": 0.797334775851558, + "grad_norm": 0.001559227122925222, + "learning_rate": 2.0354337159266318e-05, + "loss": 0.12550193071365356, + "step": 185720 + }, + { + "epoch": 0.797377707941578, + "grad_norm": 0.02826523408293724, + "learning_rate": 2.0350025439148695e-05, + "loss": 0.30759053230285643, + "step": 185730 + }, + { + "epoch": 0.7974206400315981, + "grad_norm": 3.503047227859497, + "learning_rate": 2.034571371903107e-05, + "loss": 0.2664123296737671, + "step": 185740 + }, + { + "epoch": 0.797463572121618, + "grad_norm": 0.008845411241054535, + "learning_rate": 2.0341401998913447e-05, + "loss": 0.24550697803497315, + "step": 185750 + }, + { + "epoch": 0.797506504211638, + "grad_norm": 0.24258193373680115, + "learning_rate": 2.0337090278795824e-05, + "loss": 0.18062769174575805, + "step": 185760 + }, + { + "epoch": 0.7975494363016581, + "grad_norm": 0.05451073870062828, + "learning_rate": 2.03327785586782e-05, + "loss": 0.2377575159072876, + "step": 185770 + }, + { + "epoch": 0.797592368391678, + "grad_norm": 0.006921617779880762, + "learning_rate": 2.0328466838560575e-05, + "loss": 0.004407922551035881, + "step": 185780 + }, + { + "epoch": 0.7976353004816981, + "grad_norm": 0.395277202129364, + "learning_rate": 2.0324155118442953e-05, + "loss": 0.21247997283935546, + "step": 185790 + }, + { + "epoch": 0.7976782325717181, + "grad_norm": 0.0005243797786533833, + "learning_rate": 2.0319843398325327e-05, + "loss": 0.20616483688354492, + "step": 185800 + }, + { + "epoch": 0.797721164661738, + "grad_norm": 0.016554752364754677, + "learning_rate": 2.0315531678207704e-05, + "loss": 0.26093599796295164, + "step": 185810 + }, + { + "epoch": 0.7977640967517581, + "grad_norm": 1.9969154596328735, + "learning_rate": 2.031121995809008e-05, + "loss": 0.20317411422729492, + "step": 185820 + }, + { + "epoch": 0.7978070288417781, + "grad_norm": 0.2166758030653, + "learning_rate": 2.030690823797246e-05, + "loss": 0.1044198989868164, + "step": 185830 + }, + { + "epoch": 0.797849960931798, + "grad_norm": 1.184893250465393, + "learning_rate": 2.0302596517854833e-05, + "loss": 0.30167474746704104, + "step": 185840 + }, + { + "epoch": 0.7978928930218181, + "grad_norm": 0.007805278990417719, + "learning_rate": 2.029828479773721e-05, + "loss": 0.09440180659294128, + "step": 185850 + }, + { + "epoch": 0.7979358251118381, + "grad_norm": 0.6124712228775024, + "learning_rate": 2.0293973077619584e-05, + "loss": 0.07313104271888733, + "step": 185860 + }, + { + "epoch": 0.7979787572018581, + "grad_norm": 0.0013098136987537146, + "learning_rate": 2.028966135750196e-05, + "loss": 0.22837140560150146, + "step": 185870 + }, + { + "epoch": 0.7980216892918781, + "grad_norm": 2.9226720333099365, + "learning_rate": 2.0285349637384342e-05, + "loss": 0.34854779243469236, + "step": 185880 + }, + { + "epoch": 0.7980646213818982, + "grad_norm": 2.7549538612365723, + "learning_rate": 2.0281037917266716e-05, + "loss": 0.18072230815887452, + "step": 185890 + }, + { + "epoch": 0.7981075534719181, + "grad_norm": 0.19986467063426971, + "learning_rate": 2.0276726197149093e-05, + "loss": 0.005468511208891869, + "step": 185900 + }, + { + "epoch": 0.7981504855619381, + "grad_norm": 0.000706440070644021, + "learning_rate": 2.0272414477031467e-05, + "loss": 0.30676515102386476, + "step": 185910 + }, + { + "epoch": 0.7981934176519582, + "grad_norm": 9.663665771484375, + "learning_rate": 2.0268102756913845e-05, + "loss": 0.34456477165222166, + "step": 185920 + }, + { + "epoch": 0.7982363497419781, + "grad_norm": 0.51984041929245, + "learning_rate": 2.0263791036796222e-05, + "loss": 0.18212833404541015, + "step": 185930 + }, + { + "epoch": 0.7982792818319981, + "grad_norm": 0.41934898495674133, + "learning_rate": 2.02594793166786e-05, + "loss": 0.05728796124458313, + "step": 185940 + }, + { + "epoch": 0.7983222139220182, + "grad_norm": 0.3425225019454956, + "learning_rate": 2.0255167596560973e-05, + "loss": 0.223008131980896, + "step": 185950 + }, + { + "epoch": 0.7983651460120381, + "grad_norm": 0.0046521383337676525, + "learning_rate": 2.025085587644335e-05, + "loss": 0.31833226680755616, + "step": 185960 + }, + { + "epoch": 0.7984080781020582, + "grad_norm": 1.873736023902893, + "learning_rate": 2.0246544156325725e-05, + "loss": 0.2819650888442993, + "step": 185970 + }, + { + "epoch": 0.7984510101920782, + "grad_norm": 0.053500592708587646, + "learning_rate": 2.0242232436208102e-05, + "loss": 0.15295974016189576, + "step": 185980 + }, + { + "epoch": 0.7984939422820981, + "grad_norm": 2.2874643802642822, + "learning_rate": 2.023792071609048e-05, + "loss": 0.09558444619178771, + "step": 185990 + }, + { + "epoch": 0.7985368743721182, + "grad_norm": 0.42870280146598816, + "learning_rate": 2.0233608995972857e-05, + "loss": 0.14517755508422853, + "step": 186000 + }, + { + "epoch": 0.7985368743721182, + "eval_loss": 0.383025199174881, + "eval_runtime": 27.3953, + "eval_samples_per_second": 3.65, + "eval_steps_per_second": 3.65, + "step": 186000 + }, + { + "epoch": 0.7985798064621382, + "grad_norm": 0.0867031067609787, + "learning_rate": 2.022929727585523e-05, + "loss": 0.09830026030540466, + "step": 186010 + }, + { + "epoch": 0.7986227385521582, + "grad_norm": 1.7900984287261963, + "learning_rate": 2.0224985555737608e-05, + "loss": 0.15121421813964844, + "step": 186020 + }, + { + "epoch": 0.7986656706421782, + "grad_norm": 0.07163318246603012, + "learning_rate": 2.0220673835619982e-05, + "loss": 0.43063783645629883, + "step": 186030 + }, + { + "epoch": 0.7987086027321982, + "grad_norm": 0.010341783985495567, + "learning_rate": 2.021636211550236e-05, + "loss": 0.17103235721588134, + "step": 186040 + }, + { + "epoch": 0.7987515348222182, + "grad_norm": 9.039875984191895, + "learning_rate": 2.0212050395384736e-05, + "loss": 0.3425628185272217, + "step": 186050 + }, + { + "epoch": 0.7987944669122382, + "grad_norm": 0.012275336310267448, + "learning_rate": 2.0207738675267114e-05, + "loss": 0.21186771392822265, + "step": 186060 + }, + { + "epoch": 0.7988373990022583, + "grad_norm": 0.09523310512304306, + "learning_rate": 2.0203426955149488e-05, + "loss": 0.22170612812042237, + "step": 186070 + }, + { + "epoch": 0.7988803310922782, + "grad_norm": 0.8266873955726624, + "learning_rate": 2.0199115235031865e-05, + "loss": 0.21422924995422363, + "step": 186080 + }, + { + "epoch": 0.7989232631822982, + "grad_norm": 1.8576383590698242, + "learning_rate": 2.019480351491424e-05, + "loss": 0.17523318529129028, + "step": 186090 + }, + { + "epoch": 0.7989661952723183, + "grad_norm": 2.276369333267212, + "learning_rate": 2.0190491794796616e-05, + "loss": 0.18289344310760497, + "step": 186100 + }, + { + "epoch": 0.7990091273623382, + "grad_norm": 0.03286011144518852, + "learning_rate": 2.0186180074678994e-05, + "loss": 0.06531772613525391, + "step": 186110 + }, + { + "epoch": 0.7990520594523582, + "grad_norm": 0.5831666588783264, + "learning_rate": 2.018186835456137e-05, + "loss": 0.22013142108917236, + "step": 186120 + }, + { + "epoch": 0.7990949915423783, + "grad_norm": 3.7149910926818848, + "learning_rate": 2.0177556634443745e-05, + "loss": 0.2326035737991333, + "step": 186130 + }, + { + "epoch": 0.7991379236323982, + "grad_norm": 2.462045907974243, + "learning_rate": 2.0173244914326122e-05, + "loss": 0.2145132303237915, + "step": 186140 + }, + { + "epoch": 0.7991808557224183, + "grad_norm": 0.06917181611061096, + "learning_rate": 2.0168933194208496e-05, + "loss": 0.26174378395080566, + "step": 186150 + }, + { + "epoch": 0.7992237878124383, + "grad_norm": 3.2315165996551514, + "learning_rate": 2.0164621474090874e-05, + "loss": 0.3010656118392944, + "step": 186160 + }, + { + "epoch": 0.7992667199024583, + "grad_norm": 1.9338715076446533, + "learning_rate": 2.016030975397325e-05, + "loss": 0.10233894586563111, + "step": 186170 + }, + { + "epoch": 0.7993096519924783, + "grad_norm": 0.004972547292709351, + "learning_rate": 2.015599803385563e-05, + "loss": 0.15367188453674316, + "step": 186180 + }, + { + "epoch": 0.7993525840824983, + "grad_norm": 0.46371036767959595, + "learning_rate": 2.0151686313738002e-05, + "loss": 0.20945146083831787, + "step": 186190 + }, + { + "epoch": 0.7993955161725184, + "grad_norm": 0.004476912785321474, + "learning_rate": 2.014737459362038e-05, + "loss": 0.002007796801626682, + "step": 186200 + }, + { + "epoch": 0.7994384482625383, + "grad_norm": 0.006324408575892448, + "learning_rate": 2.0143062873502754e-05, + "loss": 0.22494516372680665, + "step": 186210 + }, + { + "epoch": 0.7994813803525583, + "grad_norm": 0.007815426215529442, + "learning_rate": 2.013875115338513e-05, + "loss": 0.15336228609085084, + "step": 186220 + }, + { + "epoch": 0.7995243124425784, + "grad_norm": 0.001811459893360734, + "learning_rate": 2.013443943326751e-05, + "loss": 0.31596338748931885, + "step": 186230 + }, + { + "epoch": 0.7995672445325983, + "grad_norm": 0.00441304175183177, + "learning_rate": 2.0130127713149886e-05, + "loss": 0.0335442453622818, + "step": 186240 + }, + { + "epoch": 0.7996101766226184, + "grad_norm": 0.0016258248360827565, + "learning_rate": 2.0125815993032263e-05, + "loss": 0.1325811982154846, + "step": 186250 + }, + { + "epoch": 0.7996531087126384, + "grad_norm": 0.014790796674787998, + "learning_rate": 2.0121504272914637e-05, + "loss": 0.10485789775848389, + "step": 186260 + }, + { + "epoch": 0.7996960408026583, + "grad_norm": 0.02397337555885315, + "learning_rate": 2.0117192552797014e-05, + "loss": 0.08896262049674988, + "step": 186270 + }, + { + "epoch": 0.7997389728926784, + "grad_norm": 0.14343607425689697, + "learning_rate": 2.0112880832679388e-05, + "loss": 0.2278761386871338, + "step": 186280 + }, + { + "epoch": 0.7997819049826984, + "grad_norm": 0.0002532574872020632, + "learning_rate": 2.010856911256177e-05, + "loss": 0.13488831520080566, + "step": 186290 + }, + { + "epoch": 0.7998248370727183, + "grad_norm": 0.0023948042653501034, + "learning_rate": 2.0104257392444143e-05, + "loss": 0.21678998470306396, + "step": 186300 + }, + { + "epoch": 0.7998677691627384, + "grad_norm": 0.10748161375522614, + "learning_rate": 2.009994567232652e-05, + "loss": 0.048986378312110904, + "step": 186310 + }, + { + "epoch": 0.7999107012527584, + "grad_norm": 0.025953242555260658, + "learning_rate": 2.0095633952208894e-05, + "loss": 0.12238609790802002, + "step": 186320 + }, + { + "epoch": 0.7999536333427784, + "grad_norm": 0.04679807275533676, + "learning_rate": 2.009132223209127e-05, + "loss": 0.2825323581695557, + "step": 186330 + }, + { + "epoch": 0.7999965654327984, + "grad_norm": 0.0020087333396077156, + "learning_rate": 2.008701051197365e-05, + "loss": 0.14617905616760254, + "step": 186340 + }, + { + "epoch": 0.8000394975228184, + "grad_norm": 0.02344387024641037, + "learning_rate": 2.0082698791856026e-05, + "loss": 0.11631090641021728, + "step": 186350 + }, + { + "epoch": 0.8000824296128384, + "grad_norm": 2.009197950363159, + "learning_rate": 2.00783870717384e-05, + "loss": 0.17129077911376953, + "step": 186360 + }, + { + "epoch": 0.8001253617028584, + "grad_norm": 0.011406494304537773, + "learning_rate": 2.0074075351620778e-05, + "loss": 0.07285622358322144, + "step": 186370 + }, + { + "epoch": 0.8001682937928785, + "grad_norm": 0.0726480782032013, + "learning_rate": 2.006976363150315e-05, + "loss": 0.1225576400756836, + "step": 186380 + }, + { + "epoch": 0.8002112258828984, + "grad_norm": 0.015434571541845798, + "learning_rate": 2.006545191138553e-05, + "loss": 0.05374734997749329, + "step": 186390 + }, + { + "epoch": 0.8002541579729184, + "grad_norm": 0.10589390993118286, + "learning_rate": 2.0061140191267906e-05, + "loss": 0.16505370140075684, + "step": 186400 + }, + { + "epoch": 0.8002970900629385, + "grad_norm": 0.01193720381706953, + "learning_rate": 2.0056828471150284e-05, + "loss": 0.17347509860992433, + "step": 186410 + }, + { + "epoch": 0.8003400221529584, + "grad_norm": 0.02078915387392044, + "learning_rate": 2.0052516751032658e-05, + "loss": 0.15373240709304808, + "step": 186420 + }, + { + "epoch": 0.8003829542429785, + "grad_norm": 0.003241701051592827, + "learning_rate": 2.0048205030915035e-05, + "loss": 0.15771273374557496, + "step": 186430 + }, + { + "epoch": 0.8004258863329985, + "grad_norm": 0.03669734671711922, + "learning_rate": 2.004389331079741e-05, + "loss": 0.16545580625534057, + "step": 186440 + }, + { + "epoch": 0.8004688184230184, + "grad_norm": 1.4811112880706787, + "learning_rate": 2.0039581590679786e-05, + "loss": 0.3451890468597412, + "step": 186450 + }, + { + "epoch": 0.8005117505130385, + "grad_norm": 0.0017724055796861649, + "learning_rate": 2.0035269870562164e-05, + "loss": 0.14023756980895996, + "step": 186460 + }, + { + "epoch": 0.8005546826030585, + "grad_norm": 0.17874978482723236, + "learning_rate": 2.003095815044454e-05, + "loss": 0.1794173836708069, + "step": 186470 + }, + { + "epoch": 0.8005976146930784, + "grad_norm": 1.1632425785064697, + "learning_rate": 2.0026646430326915e-05, + "loss": 0.25994980335235596, + "step": 186480 + }, + { + "epoch": 0.8006405467830985, + "grad_norm": 1.2274190187454224, + "learning_rate": 2.0022334710209292e-05, + "loss": 0.21253445148468017, + "step": 186490 + }, + { + "epoch": 0.8006834788731185, + "grad_norm": 0.09271473437547684, + "learning_rate": 2.0018022990091666e-05, + "loss": 0.08092796206474304, + "step": 186500 + }, + { + "epoch": 0.8007264109631385, + "grad_norm": 0.025559542700648308, + "learning_rate": 2.0013711269974043e-05, + "loss": 0.3423144817352295, + "step": 186510 + }, + { + "epoch": 0.8007693430531585, + "grad_norm": 0.018450139090418816, + "learning_rate": 2.000939954985642e-05, + "loss": 0.2602881669998169, + "step": 186520 + }, + { + "epoch": 0.8008122751431785, + "grad_norm": 0.0023915015626698732, + "learning_rate": 2.0005087829738798e-05, + "loss": 0.11772937774658203, + "step": 186530 + }, + { + "epoch": 0.8008552072331985, + "grad_norm": 0.9385748505592346, + "learning_rate": 2.0000776109621172e-05, + "loss": 0.21836485862731933, + "step": 186540 + }, + { + "epoch": 0.8008981393232185, + "grad_norm": 0.007567053660750389, + "learning_rate": 1.999646438950355e-05, + "loss": 0.1203912615776062, + "step": 186550 + }, + { + "epoch": 0.8009410714132386, + "grad_norm": 0.0016851389082148671, + "learning_rate": 1.9992152669385923e-05, + "loss": 0.2831920623779297, + "step": 186560 + }, + { + "epoch": 0.8009840035032585, + "grad_norm": 0.2422313392162323, + "learning_rate": 1.99878409492683e-05, + "loss": 0.11574677228927613, + "step": 186570 + }, + { + "epoch": 0.8010269355932785, + "grad_norm": 0.00440719211474061, + "learning_rate": 1.9983529229150678e-05, + "loss": 0.19264068603515624, + "step": 186580 + }, + { + "epoch": 0.8010698676832986, + "grad_norm": 0.12345883995294571, + "learning_rate": 1.9979217509033055e-05, + "loss": 0.2742461204528809, + "step": 186590 + }, + { + "epoch": 0.8011127997733186, + "grad_norm": 0.0027105410117655993, + "learning_rate": 1.997490578891543e-05, + "loss": 0.19218816757202148, + "step": 186600 + }, + { + "epoch": 0.8011557318633385, + "grad_norm": 0.3615066409111023, + "learning_rate": 1.9970594068797807e-05, + "loss": 0.1688565731048584, + "step": 186610 + }, + { + "epoch": 0.8011986639533586, + "grad_norm": 4.431537628173828, + "learning_rate": 1.9966282348680184e-05, + "loss": 0.2278277635574341, + "step": 186620 + }, + { + "epoch": 0.8012415960433786, + "grad_norm": 2.2264537811279297, + "learning_rate": 1.9961970628562558e-05, + "loss": 0.13971056938171386, + "step": 186630 + }, + { + "epoch": 0.8012845281333986, + "grad_norm": 2.352553129196167, + "learning_rate": 1.995765890844494e-05, + "loss": 0.5238365173339844, + "step": 186640 + }, + { + "epoch": 0.8013274602234186, + "grad_norm": 0.05882478877902031, + "learning_rate": 1.9953347188327313e-05, + "loss": 0.2581619739532471, + "step": 186650 + }, + { + "epoch": 0.8013703923134387, + "grad_norm": 0.046139009296894073, + "learning_rate": 1.994903546820969e-05, + "loss": 0.07872686386108399, + "step": 186660 + }, + { + "epoch": 0.8014133244034586, + "grad_norm": 0.03067977912724018, + "learning_rate": 1.9944723748092064e-05, + "loss": 0.34165639877319337, + "step": 186670 + }, + { + "epoch": 0.8014562564934786, + "grad_norm": 0.010820715688169003, + "learning_rate": 1.994041202797444e-05, + "loss": 0.3078367471694946, + "step": 186680 + }, + { + "epoch": 0.8014991885834987, + "grad_norm": 0.08466701209545135, + "learning_rate": 1.993610030785682e-05, + "loss": 0.23001487255096437, + "step": 186690 + }, + { + "epoch": 0.8015421206735186, + "grad_norm": 0.0826585441827774, + "learning_rate": 1.9931788587739196e-05, + "loss": 0.2410907506942749, + "step": 186700 + }, + { + "epoch": 0.8015850527635386, + "grad_norm": 2.120948314666748, + "learning_rate": 1.992747686762157e-05, + "loss": 0.1979671001434326, + "step": 186710 + }, + { + "epoch": 0.8016279848535587, + "grad_norm": 0.002734254812821746, + "learning_rate": 1.9923165147503947e-05, + "loss": 0.36019628047943114, + "step": 186720 + }, + { + "epoch": 0.8016709169435786, + "grad_norm": 1.2035413980484009, + "learning_rate": 1.991885342738632e-05, + "loss": 0.18466417789459227, + "step": 186730 + }, + { + "epoch": 0.8017138490335987, + "grad_norm": 0.09759000688791275, + "learning_rate": 1.99145417072687e-05, + "loss": 0.1547027587890625, + "step": 186740 + }, + { + "epoch": 0.8017567811236187, + "grad_norm": 0.01482541672885418, + "learning_rate": 1.9910229987151076e-05, + "loss": 0.30575516223907473, + "step": 186750 + }, + { + "epoch": 0.8017997132136386, + "grad_norm": 0.03860694542527199, + "learning_rate": 1.9905918267033453e-05, + "loss": 0.33325586318969724, + "step": 186760 + }, + { + "epoch": 0.8018426453036587, + "grad_norm": 0.00036169207305647433, + "learning_rate": 1.9901606546915827e-05, + "loss": 0.1686161994934082, + "step": 186770 + }, + { + "epoch": 0.8018855773936787, + "grad_norm": 1.8329875469207764, + "learning_rate": 1.9897294826798205e-05, + "loss": 0.2805971622467041, + "step": 186780 + }, + { + "epoch": 0.8019285094836986, + "grad_norm": 2.861274242401123, + "learning_rate": 1.989298310668058e-05, + "loss": 0.3051000118255615, + "step": 186790 + }, + { + "epoch": 0.8019714415737187, + "grad_norm": 51.22409439086914, + "learning_rate": 1.9888671386562956e-05, + "loss": 0.158194899559021, + "step": 186800 + }, + { + "epoch": 0.8020143736637387, + "grad_norm": 1.1815828084945679, + "learning_rate": 1.9884359666445333e-05, + "loss": 0.3367149353027344, + "step": 186810 + }, + { + "epoch": 0.8020573057537587, + "grad_norm": 0.05270637199282646, + "learning_rate": 1.988004794632771e-05, + "loss": 0.3387028932571411, + "step": 186820 + }, + { + "epoch": 0.8021002378437787, + "grad_norm": 0.08214150369167328, + "learning_rate": 1.9875736226210085e-05, + "loss": 0.18951514959335328, + "step": 186830 + }, + { + "epoch": 0.8021431699337987, + "grad_norm": 1.066209077835083, + "learning_rate": 1.9871424506092462e-05, + "loss": 0.2531846046447754, + "step": 186840 + }, + { + "epoch": 0.8021861020238187, + "grad_norm": 0.666606068611145, + "learning_rate": 1.9867112785974836e-05, + "loss": 0.14989641904830933, + "step": 186850 + }, + { + "epoch": 0.8022290341138387, + "grad_norm": 0.18978695571422577, + "learning_rate": 1.9862801065857213e-05, + "loss": 0.20271751880645753, + "step": 186860 + }, + { + "epoch": 0.8022719662038588, + "grad_norm": 0.2546761631965637, + "learning_rate": 1.985848934573959e-05, + "loss": 0.315608811378479, + "step": 186870 + }, + { + "epoch": 0.8023148982938787, + "grad_norm": 0.004536745138466358, + "learning_rate": 1.9854177625621968e-05, + "loss": 0.20478010177612305, + "step": 186880 + }, + { + "epoch": 0.8023578303838987, + "grad_norm": 0.11876603215932846, + "learning_rate": 1.9849865905504342e-05, + "loss": 0.03800502419471741, + "step": 186890 + }, + { + "epoch": 0.8024007624739188, + "grad_norm": 2.519597053527832, + "learning_rate": 1.984555418538672e-05, + "loss": 0.2677901268005371, + "step": 186900 + }, + { + "epoch": 0.8024436945639387, + "grad_norm": 0.019811009988188744, + "learning_rate": 1.9841242465269093e-05, + "loss": 0.27901058197021483, + "step": 186910 + }, + { + "epoch": 0.8024866266539588, + "grad_norm": 0.23691457509994507, + "learning_rate": 1.983693074515147e-05, + "loss": 0.24831018447875977, + "step": 186920 + }, + { + "epoch": 0.8025295587439788, + "grad_norm": 2.062676429748535, + "learning_rate": 1.9832619025033848e-05, + "loss": 0.18976922035217286, + "step": 186930 + }, + { + "epoch": 0.8025724908339987, + "grad_norm": 2.7888567447662354, + "learning_rate": 1.9828307304916225e-05, + "loss": 0.1329728126525879, + "step": 186940 + }, + { + "epoch": 0.8026154229240188, + "grad_norm": 0.053935494273900986, + "learning_rate": 1.98239955847986e-05, + "loss": 0.3322454929351807, + "step": 186950 + }, + { + "epoch": 0.8026583550140388, + "grad_norm": 0.01403512991964817, + "learning_rate": 1.9819683864680976e-05, + "loss": 0.28524141311645507, + "step": 186960 + }, + { + "epoch": 0.8027012871040587, + "grad_norm": 1.594042181968689, + "learning_rate": 1.981537214456335e-05, + "loss": 0.13451719284057617, + "step": 186970 + }, + { + "epoch": 0.8027442191940788, + "grad_norm": 1.3952689170837402, + "learning_rate": 1.9811060424445728e-05, + "loss": 0.08733786344528198, + "step": 186980 + }, + { + "epoch": 0.8027871512840988, + "grad_norm": 0.07780808955430984, + "learning_rate": 1.980674870432811e-05, + "loss": 0.1476304292678833, + "step": 186990 + }, + { + "epoch": 0.8028300833741188, + "grad_norm": 1.0146738290786743, + "learning_rate": 1.9802436984210482e-05, + "loss": 0.11834737062454223, + "step": 187000 + }, + { + "epoch": 0.8028300833741188, + "eval_loss": 0.3865559697151184, + "eval_runtime": 27.422, + "eval_samples_per_second": 3.647, + "eval_steps_per_second": 3.647, + "step": 187000 + }, + { + "epoch": 0.8028730154641388, + "grad_norm": 6.436366081237793, + "learning_rate": 1.979812526409286e-05, + "loss": 0.206487774848938, + "step": 187010 + }, + { + "epoch": 0.8029159475541588, + "grad_norm": 0.007422825321555138, + "learning_rate": 1.9793813543975234e-05, + "loss": 0.18670098781585692, + "step": 187020 + }, + { + "epoch": 0.8029588796441789, + "grad_norm": 0.001183122512884438, + "learning_rate": 1.978950182385761e-05, + "loss": 0.278668475151062, + "step": 187030 + }, + { + "epoch": 0.8030018117341988, + "grad_norm": 11.426277160644531, + "learning_rate": 1.9785190103739985e-05, + "loss": 0.3917843818664551, + "step": 187040 + }, + { + "epoch": 0.8030447438242189, + "grad_norm": 1.441265344619751, + "learning_rate": 1.9780878383622366e-05, + "loss": 0.06324902772903443, + "step": 187050 + }, + { + "epoch": 0.8030876759142389, + "grad_norm": 0.1833934485912323, + "learning_rate": 1.977656666350474e-05, + "loss": 0.14141279458999634, + "step": 187060 + }, + { + "epoch": 0.8031306080042588, + "grad_norm": 1.207807183265686, + "learning_rate": 1.9772254943387117e-05, + "loss": 0.15223703384399415, + "step": 187070 + }, + { + "epoch": 0.8031735400942789, + "grad_norm": 0.013842697255313396, + "learning_rate": 1.976794322326949e-05, + "loss": 0.24720118045806885, + "step": 187080 + }, + { + "epoch": 0.8032164721842989, + "grad_norm": 0.0008404516847804189, + "learning_rate": 1.976363150315187e-05, + "loss": 0.1951699137687683, + "step": 187090 + }, + { + "epoch": 0.8032594042743189, + "grad_norm": 1.2827842235565186, + "learning_rate": 1.9759319783034246e-05, + "loss": 0.2023831605911255, + "step": 187100 + }, + { + "epoch": 0.8033023363643389, + "grad_norm": 0.00881370808929205, + "learning_rate": 1.9755008062916623e-05, + "loss": 0.21812007427215577, + "step": 187110 + }, + { + "epoch": 0.8033452684543589, + "grad_norm": 0.0040892851538956165, + "learning_rate": 1.9750696342798997e-05, + "loss": 0.13440535068511963, + "step": 187120 + }, + { + "epoch": 0.8033882005443789, + "grad_norm": 4.08140754699707, + "learning_rate": 1.9746384622681374e-05, + "loss": 0.1526040554046631, + "step": 187130 + }, + { + "epoch": 0.8034311326343989, + "grad_norm": 16.14797019958496, + "learning_rate": 1.9742072902563748e-05, + "loss": 0.19586637020111083, + "step": 187140 + }, + { + "epoch": 0.803474064724419, + "grad_norm": 3.350125789642334, + "learning_rate": 1.9737761182446126e-05, + "loss": 0.10762113332748413, + "step": 187150 + }, + { + "epoch": 0.8035169968144389, + "grad_norm": 0.20307861268520355, + "learning_rate": 1.9733449462328503e-05, + "loss": 0.1554766535758972, + "step": 187160 + }, + { + "epoch": 0.8035599289044589, + "grad_norm": 0.5534129738807678, + "learning_rate": 1.972913774221088e-05, + "loss": 0.20793559551239013, + "step": 187170 + }, + { + "epoch": 0.803602860994479, + "grad_norm": 0.3242033123970032, + "learning_rate": 1.9724826022093254e-05, + "loss": 0.2671414852142334, + "step": 187180 + }, + { + "epoch": 0.8036457930844989, + "grad_norm": 4.204552173614502, + "learning_rate": 1.972051430197563e-05, + "loss": 0.1226415753364563, + "step": 187190 + }, + { + "epoch": 0.803688725174519, + "grad_norm": 0.008825673721730709, + "learning_rate": 1.9716202581858006e-05, + "loss": 0.1894465208053589, + "step": 187200 + }, + { + "epoch": 0.803731657264539, + "grad_norm": 0.07439620047807693, + "learning_rate": 1.9711890861740383e-05, + "loss": 0.03914215862751007, + "step": 187210 + }, + { + "epoch": 0.8037745893545589, + "grad_norm": 0.0796755999326706, + "learning_rate": 1.970757914162276e-05, + "loss": 0.4778743267059326, + "step": 187220 + }, + { + "epoch": 0.803817521444579, + "grad_norm": 0.4283967912197113, + "learning_rate": 1.9703267421505138e-05, + "loss": 0.0604019284248352, + "step": 187230 + }, + { + "epoch": 0.803860453534599, + "grad_norm": 68.16015625, + "learning_rate": 1.969895570138751e-05, + "loss": 0.26521375179290774, + "step": 187240 + }, + { + "epoch": 0.8039033856246189, + "grad_norm": 0.04783042520284653, + "learning_rate": 1.969464398126989e-05, + "loss": 0.35483057498931886, + "step": 187250 + }, + { + "epoch": 0.803946317714639, + "grad_norm": 0.005563311744481325, + "learning_rate": 1.9690332261152263e-05, + "loss": 0.11798173189163208, + "step": 187260 + }, + { + "epoch": 0.803989249804659, + "grad_norm": 0.029166478663682938, + "learning_rate": 1.968602054103464e-05, + "loss": 0.021522310376167298, + "step": 187270 + }, + { + "epoch": 0.804032181894679, + "grad_norm": 0.005799205508083105, + "learning_rate": 1.9681708820917018e-05, + "loss": 0.297559928894043, + "step": 187280 + }, + { + "epoch": 0.804075113984699, + "grad_norm": 2.34651517868042, + "learning_rate": 1.9677397100799395e-05, + "loss": 0.18867183923721315, + "step": 187290 + }, + { + "epoch": 0.804118046074719, + "grad_norm": 0.002682226477190852, + "learning_rate": 1.967308538068177e-05, + "loss": 0.18364322185516357, + "step": 187300 + }, + { + "epoch": 0.804160978164739, + "grad_norm": 0.08684167265892029, + "learning_rate": 1.9668773660564146e-05, + "loss": 0.21005053520202638, + "step": 187310 + }, + { + "epoch": 0.804203910254759, + "grad_norm": 0.00973292626440525, + "learning_rate": 1.966446194044652e-05, + "loss": 0.2818763256072998, + "step": 187320 + }, + { + "epoch": 0.8042468423447791, + "grad_norm": 0.16216319799423218, + "learning_rate": 1.9660150220328897e-05, + "loss": 0.3723745346069336, + "step": 187330 + }, + { + "epoch": 0.804289774434799, + "grad_norm": 4.925543308258057, + "learning_rate": 1.9655838500211278e-05, + "loss": 0.2016061782836914, + "step": 187340 + }, + { + "epoch": 0.804332706524819, + "grad_norm": 1.2502108812332153, + "learning_rate": 1.9651526780093652e-05, + "loss": 0.12285642623901367, + "step": 187350 + }, + { + "epoch": 0.8043756386148391, + "grad_norm": 1.50983726978302, + "learning_rate": 1.964721505997603e-05, + "loss": 0.32901337146759035, + "step": 187360 + }, + { + "epoch": 0.804418570704859, + "grad_norm": 0.02697085589170456, + "learning_rate": 1.9642903339858403e-05, + "loss": 0.0628973662853241, + "step": 187370 + }, + { + "epoch": 0.804461502794879, + "grad_norm": 0.12497185170650482, + "learning_rate": 1.963859161974078e-05, + "loss": 0.022782574594020843, + "step": 187380 + }, + { + "epoch": 0.8045044348848991, + "grad_norm": 2.2022886276245117, + "learning_rate": 1.9634279899623155e-05, + "loss": 0.3713336229324341, + "step": 187390 + }, + { + "epoch": 0.804547366974919, + "grad_norm": 1.3997927904129028, + "learning_rate": 1.9629968179505535e-05, + "loss": 0.18089509010314941, + "step": 187400 + }, + { + "epoch": 0.8045902990649391, + "grad_norm": 0.012205325998365879, + "learning_rate": 1.962565645938791e-05, + "loss": 0.17987641096115112, + "step": 187410 + }, + { + "epoch": 0.8046332311549591, + "grad_norm": 0.6075366735458374, + "learning_rate": 1.9621344739270287e-05, + "loss": 0.3212277889251709, + "step": 187420 + }, + { + "epoch": 0.804676163244979, + "grad_norm": 7.286690711975098, + "learning_rate": 1.961703301915266e-05, + "loss": 0.29967246055603025, + "step": 187430 + }, + { + "epoch": 0.8047190953349991, + "grad_norm": 0.0046836817637085915, + "learning_rate": 1.9612721299035038e-05, + "loss": 0.05317643880844116, + "step": 187440 + }, + { + "epoch": 0.8047620274250191, + "grad_norm": 0.4176381528377533, + "learning_rate": 1.9608409578917415e-05, + "loss": 0.23051493167877196, + "step": 187450 + }, + { + "epoch": 0.8048049595150392, + "grad_norm": 0.037199754267930984, + "learning_rate": 1.9604097858799793e-05, + "loss": 0.2971806526184082, + "step": 187460 + }, + { + "epoch": 0.8048478916050591, + "grad_norm": 0.19382435083389282, + "learning_rate": 1.9599786138682167e-05, + "loss": 0.1703810214996338, + "step": 187470 + }, + { + "epoch": 0.8048908236950791, + "grad_norm": 0.015945645049214363, + "learning_rate": 1.9595474418564544e-05, + "loss": 0.18769537210464476, + "step": 187480 + }, + { + "epoch": 0.8049337557850992, + "grad_norm": 0.009000780060887337, + "learning_rate": 1.9591162698446918e-05, + "loss": 0.11943295001983642, + "step": 187490 + }, + { + "epoch": 0.8049766878751191, + "grad_norm": 1.8981505632400513, + "learning_rate": 1.9586850978329295e-05, + "loss": 0.27698822021484376, + "step": 187500 + }, + { + "epoch": 0.8050196199651392, + "grad_norm": 0.04114925488829613, + "learning_rate": 1.9582539258211673e-05, + "loss": 0.16432760953903197, + "step": 187510 + }, + { + "epoch": 0.8050625520551592, + "grad_norm": 0.04115109518170357, + "learning_rate": 1.957822753809405e-05, + "loss": 0.14959651231765747, + "step": 187520 + }, + { + "epoch": 0.8051054841451791, + "grad_norm": 1.73521888256073, + "learning_rate": 1.9573915817976424e-05, + "loss": 0.11037262678146362, + "step": 187530 + }, + { + "epoch": 0.8051484162351992, + "grad_norm": 0.05841144919395447, + "learning_rate": 1.95696040978588e-05, + "loss": 0.19754847288131713, + "step": 187540 + }, + { + "epoch": 0.8051913483252192, + "grad_norm": 0.0010160811943933368, + "learning_rate": 1.9565292377741175e-05, + "loss": 0.04744173586368561, + "step": 187550 + }, + { + "epoch": 0.8052342804152391, + "grad_norm": 0.029853256419301033, + "learning_rate": 1.9560980657623553e-05, + "loss": 0.19616028070449829, + "step": 187560 + }, + { + "epoch": 0.8052772125052592, + "grad_norm": 1.63491690158844, + "learning_rate": 1.955666893750593e-05, + "loss": 0.34844026565551756, + "step": 187570 + }, + { + "epoch": 0.8053201445952792, + "grad_norm": 0.16325415670871735, + "learning_rate": 1.9552357217388307e-05, + "loss": 0.09422051906585693, + "step": 187580 + }, + { + "epoch": 0.8053630766852992, + "grad_norm": 1.076112985610962, + "learning_rate": 1.954804549727068e-05, + "loss": 0.1710420608520508, + "step": 187590 + }, + { + "epoch": 0.8054060087753192, + "grad_norm": 1.230614423751831, + "learning_rate": 1.954373377715306e-05, + "loss": 0.36779584884643557, + "step": 187600 + }, + { + "epoch": 0.8054489408653392, + "grad_norm": 1.0910813808441162, + "learning_rate": 1.9539422057035433e-05, + "loss": 0.15935710668563843, + "step": 187610 + }, + { + "epoch": 0.8054918729553592, + "grad_norm": 0.0033349471632391214, + "learning_rate": 1.953511033691781e-05, + "loss": 0.2521167755126953, + "step": 187620 + }, + { + "epoch": 0.8055348050453792, + "grad_norm": 0.13731007277965546, + "learning_rate": 1.9530798616800187e-05, + "loss": 0.3199592590332031, + "step": 187630 + }, + { + "epoch": 0.8055777371353993, + "grad_norm": 0.5971689820289612, + "learning_rate": 1.9526486896682565e-05, + "loss": 0.2866949558258057, + "step": 187640 + }, + { + "epoch": 0.8056206692254192, + "grad_norm": 0.01081762369722128, + "learning_rate": 1.952217517656494e-05, + "loss": 0.1536438822746277, + "step": 187650 + }, + { + "epoch": 0.8056636013154392, + "grad_norm": 0.5095075368881226, + "learning_rate": 1.9517863456447316e-05, + "loss": 0.14425853490829468, + "step": 187660 + }, + { + "epoch": 0.8057065334054593, + "grad_norm": 0.009507289156317711, + "learning_rate": 1.951355173632969e-05, + "loss": 0.028446447849273682, + "step": 187670 + }, + { + "epoch": 0.8057494654954792, + "grad_norm": 0.30559566617012024, + "learning_rate": 1.9509240016212067e-05, + "loss": 0.2078967809677124, + "step": 187680 + }, + { + "epoch": 0.8057923975854993, + "grad_norm": 15.7970609664917, + "learning_rate": 1.9504928296094445e-05, + "loss": 0.18506253957748414, + "step": 187690 + }, + { + "epoch": 0.8058353296755193, + "grad_norm": 0.49444466829299927, + "learning_rate": 1.9500616575976822e-05, + "loss": 0.10527991056442261, + "step": 187700 + }, + { + "epoch": 0.8058782617655392, + "grad_norm": 1.964054822921753, + "learning_rate": 1.94963048558592e-05, + "loss": 0.3385364770889282, + "step": 187710 + }, + { + "epoch": 0.8059211938555593, + "grad_norm": 0.007375969551503658, + "learning_rate": 1.9491993135741573e-05, + "loss": 0.35996806621551514, + "step": 187720 + }, + { + "epoch": 0.8059641259455793, + "grad_norm": 0.014366790652275085, + "learning_rate": 1.948768141562395e-05, + "loss": 0.1376686930656433, + "step": 187730 + }, + { + "epoch": 0.8060070580355992, + "grad_norm": 0.9235551953315735, + "learning_rate": 1.9483369695506324e-05, + "loss": 0.08212153911590576, + "step": 187740 + }, + { + "epoch": 0.8060499901256193, + "grad_norm": 1.7573333978652954, + "learning_rate": 1.9479057975388705e-05, + "loss": 0.15899146795272828, + "step": 187750 + }, + { + "epoch": 0.8060929222156393, + "grad_norm": 1.1374258995056152, + "learning_rate": 1.947474625527108e-05, + "loss": 0.3773771047592163, + "step": 187760 + }, + { + "epoch": 0.8061358543056593, + "grad_norm": 0.06261592358350754, + "learning_rate": 1.9470434535153457e-05, + "loss": 0.07383977174758911, + "step": 187770 + }, + { + "epoch": 0.8061787863956793, + "grad_norm": 1.3004719018936157, + "learning_rate": 1.946612281503583e-05, + "loss": 0.18768155574798584, + "step": 187780 + }, + { + "epoch": 0.8062217184856993, + "grad_norm": 0.0006582170026376843, + "learning_rate": 1.9461811094918208e-05, + "loss": 0.16706053018569947, + "step": 187790 + }, + { + "epoch": 0.8062646505757193, + "grad_norm": 0.011071824468672276, + "learning_rate": 1.9457499374800585e-05, + "loss": 0.15217964649200438, + "step": 187800 + }, + { + "epoch": 0.8063075826657393, + "grad_norm": 0.02180718258023262, + "learning_rate": 1.9453187654682963e-05, + "loss": 0.011281723529100418, + "step": 187810 + }, + { + "epoch": 0.8063505147557594, + "grad_norm": 0.1587686687707901, + "learning_rate": 1.9448875934565336e-05, + "loss": 0.04129020571708679, + "step": 187820 + }, + { + "epoch": 0.8063934468457793, + "grad_norm": 0.01058216579258442, + "learning_rate": 1.9444564214447714e-05, + "loss": 0.060017770528793334, + "step": 187830 + }, + { + "epoch": 0.8064363789357993, + "grad_norm": 0.00888375099748373, + "learning_rate": 1.9440252494330088e-05, + "loss": 0.1673647165298462, + "step": 187840 + }, + { + "epoch": 0.8064793110258194, + "grad_norm": 0.00042585397022776306, + "learning_rate": 1.9435940774212465e-05, + "loss": 0.11510329246520996, + "step": 187850 + }, + { + "epoch": 0.8065222431158393, + "grad_norm": 0.004985545761883259, + "learning_rate": 1.9431629054094842e-05, + "loss": 0.20480480194091796, + "step": 187860 + }, + { + "epoch": 0.8065651752058594, + "grad_norm": 0.010661100968718529, + "learning_rate": 1.942731733397722e-05, + "loss": 0.11720261573791504, + "step": 187870 + }, + { + "epoch": 0.8066081072958794, + "grad_norm": 0.09563350677490234, + "learning_rate": 1.9423005613859594e-05, + "loss": 0.23833897113800048, + "step": 187880 + }, + { + "epoch": 0.8066510393858994, + "grad_norm": 9.614541053771973, + "learning_rate": 1.941869389374197e-05, + "loss": 0.15743416547775269, + "step": 187890 + }, + { + "epoch": 0.8066939714759194, + "grad_norm": 0.007036667782813311, + "learning_rate": 1.9414382173624345e-05, + "loss": 0.19440302848815919, + "step": 187900 + }, + { + "epoch": 0.8067369035659394, + "grad_norm": 0.11198631674051285, + "learning_rate": 1.9410070453506722e-05, + "loss": 0.29474101066589353, + "step": 187910 + }, + { + "epoch": 0.8067798356559595, + "grad_norm": 1.7913647890090942, + "learning_rate": 1.94057587333891e-05, + "loss": 0.19286303520202636, + "step": 187920 + }, + { + "epoch": 0.8068227677459794, + "grad_norm": 0.38198572397232056, + "learning_rate": 1.9401447013271477e-05, + "loss": 0.114966881275177, + "step": 187930 + }, + { + "epoch": 0.8068656998359994, + "grad_norm": 1.1205809116363525, + "learning_rate": 1.939713529315385e-05, + "loss": 0.2618088960647583, + "step": 187940 + }, + { + "epoch": 0.8069086319260195, + "grad_norm": 0.11646323651075363, + "learning_rate": 1.939282357303623e-05, + "loss": 0.1016423225402832, + "step": 187950 + }, + { + "epoch": 0.8069515640160394, + "grad_norm": 0.042885858565568924, + "learning_rate": 1.9388511852918602e-05, + "loss": 0.22202568054199218, + "step": 187960 + }, + { + "epoch": 0.8069944961060594, + "grad_norm": 0.2142648547887802, + "learning_rate": 1.938420013280098e-05, + "loss": 0.09741895794868469, + "step": 187970 + }, + { + "epoch": 0.8070374281960795, + "grad_norm": 0.0022309094201773405, + "learning_rate": 1.9379888412683357e-05, + "loss": 0.22670445442199708, + "step": 187980 + }, + { + "epoch": 0.8070803602860994, + "grad_norm": 0.004053886979818344, + "learning_rate": 1.9375576692565734e-05, + "loss": 0.3911715269088745, + "step": 187990 + }, + { + "epoch": 0.8071232923761195, + "grad_norm": 0.003097526729106903, + "learning_rate": 1.9371264972448108e-05, + "loss": 0.11114219427108765, + "step": 188000 + }, + { + "epoch": 0.8071232923761195, + "eval_loss": 0.3755682110786438, + "eval_runtime": 27.5879, + "eval_samples_per_second": 3.625, + "eval_steps_per_second": 3.625, + "step": 188000 + }, + { + "epoch": 0.8071662244661395, + "grad_norm": 1.1802542209625244, + "learning_rate": 1.9366953252330486e-05, + "loss": 0.1982245683670044, + "step": 188010 + }, + { + "epoch": 0.8072091565561594, + "grad_norm": 3.8327369689941406, + "learning_rate": 1.936264153221286e-05, + "loss": 0.18961336612701415, + "step": 188020 + }, + { + "epoch": 0.8072520886461795, + "grad_norm": 0.017420997843146324, + "learning_rate": 1.9358329812095237e-05, + "loss": 0.1820637345314026, + "step": 188030 + }, + { + "epoch": 0.8072950207361995, + "grad_norm": 2.7999138832092285, + "learning_rate": 1.9354018091977614e-05, + "loss": 0.36874754428863527, + "step": 188040 + }, + { + "epoch": 0.8073379528262195, + "grad_norm": 8.329748153686523, + "learning_rate": 1.934970637185999e-05, + "loss": 0.1601884126663208, + "step": 188050 + }, + { + "epoch": 0.8073808849162395, + "grad_norm": 0.3798222243785858, + "learning_rate": 1.9345394651742366e-05, + "loss": 0.24167709350585936, + "step": 188060 + }, + { + "epoch": 0.8074238170062595, + "grad_norm": 2.208631753921509, + "learning_rate": 1.9341082931624743e-05, + "loss": 0.12863118648529054, + "step": 188070 + }, + { + "epoch": 0.8074667490962795, + "grad_norm": 0.00045348392450250685, + "learning_rate": 1.933677121150712e-05, + "loss": 0.3917485952377319, + "step": 188080 + }, + { + "epoch": 0.8075096811862995, + "grad_norm": 0.968796968460083, + "learning_rate": 1.9332459491389494e-05, + "loss": 0.11934627294540405, + "step": 188090 + }, + { + "epoch": 0.8075526132763196, + "grad_norm": 0.0765795037150383, + "learning_rate": 1.9328147771271875e-05, + "loss": 0.5342819690704346, + "step": 188100 + }, + { + "epoch": 0.8075955453663395, + "grad_norm": 1.340894341468811, + "learning_rate": 1.932383605115425e-05, + "loss": 0.33205289840698243, + "step": 188110 + }, + { + "epoch": 0.8076384774563595, + "grad_norm": 0.06477386504411697, + "learning_rate": 1.9319524331036626e-05, + "loss": 0.20810203552246093, + "step": 188120 + }, + { + "epoch": 0.8076814095463796, + "grad_norm": 0.41469255089759827, + "learning_rate": 1.9315212610919e-05, + "loss": 0.12332984209060668, + "step": 188130 + }, + { + "epoch": 0.8077243416363995, + "grad_norm": 0.010204517282545567, + "learning_rate": 1.9310900890801378e-05, + "loss": 0.17447993755340577, + "step": 188140 + }, + { + "epoch": 0.8077672737264195, + "grad_norm": 3.1295204162597656, + "learning_rate": 1.930658917068375e-05, + "loss": 0.06377485990524293, + "step": 188150 + }, + { + "epoch": 0.8078102058164396, + "grad_norm": 10.702325820922852, + "learning_rate": 1.9302277450566132e-05, + "loss": 0.3309658050537109, + "step": 188160 + }, + { + "epoch": 0.8078531379064595, + "grad_norm": 0.003194740740582347, + "learning_rate": 1.9297965730448506e-05, + "loss": 0.22385261058807374, + "step": 188170 + }, + { + "epoch": 0.8078960699964796, + "grad_norm": 0.018159620463848114, + "learning_rate": 1.9293654010330884e-05, + "loss": 0.00289649311453104, + "step": 188180 + }, + { + "epoch": 0.8079390020864996, + "grad_norm": 0.029929209500551224, + "learning_rate": 1.9289342290213257e-05, + "loss": 0.11803474426269531, + "step": 188190 + }, + { + "epoch": 0.8079819341765195, + "grad_norm": 0.000580483756493777, + "learning_rate": 1.9285030570095635e-05, + "loss": 0.13644762039184571, + "step": 188200 + }, + { + "epoch": 0.8080248662665396, + "grad_norm": 1.9198402166366577, + "learning_rate": 1.9280718849978012e-05, + "loss": 0.10175681114196777, + "step": 188210 + }, + { + "epoch": 0.8080677983565596, + "grad_norm": 5.698566913604736, + "learning_rate": 1.927640712986039e-05, + "loss": 0.15870132446289062, + "step": 188220 + }, + { + "epoch": 0.8081107304465796, + "grad_norm": 183.16192626953125, + "learning_rate": 1.9272095409742763e-05, + "loss": 0.22849838733673095, + "step": 188230 + }, + { + "epoch": 0.8081536625365996, + "grad_norm": 0.007106813136488199, + "learning_rate": 1.926778368962514e-05, + "loss": 0.21907241344451905, + "step": 188240 + }, + { + "epoch": 0.8081965946266196, + "grad_norm": 0.0013693399960175157, + "learning_rate": 1.9263471969507515e-05, + "loss": 0.1918390989303589, + "step": 188250 + }, + { + "epoch": 0.8082395267166396, + "grad_norm": 0.0004091776499990374, + "learning_rate": 1.9259160249389892e-05, + "loss": 0.13080551624298095, + "step": 188260 + }, + { + "epoch": 0.8082824588066596, + "grad_norm": 2.989788055419922, + "learning_rate": 1.925484852927227e-05, + "loss": 0.20496292114257814, + "step": 188270 + }, + { + "epoch": 0.8083253908966797, + "grad_norm": 0.13609616458415985, + "learning_rate": 1.9250536809154647e-05, + "loss": 0.04478162825107575, + "step": 188280 + }, + { + "epoch": 0.8083683229866996, + "grad_norm": 0.8474799394607544, + "learning_rate": 1.924622508903702e-05, + "loss": 0.11303468942642211, + "step": 188290 + }, + { + "epoch": 0.8084112550767196, + "grad_norm": 0.9259027242660522, + "learning_rate": 1.9241913368919398e-05, + "loss": 0.10569697618484497, + "step": 188300 + }, + { + "epoch": 0.8084541871667397, + "grad_norm": 0.005927898921072483, + "learning_rate": 1.9237601648801772e-05, + "loss": 0.16273220777511596, + "step": 188310 + }, + { + "epoch": 0.8084971192567597, + "grad_norm": 0.05544566735625267, + "learning_rate": 1.923328992868415e-05, + "loss": 0.23240218162536622, + "step": 188320 + }, + { + "epoch": 0.8085400513467796, + "grad_norm": 1.6250989437103271, + "learning_rate": 1.9228978208566527e-05, + "loss": 0.13480372428894044, + "step": 188330 + }, + { + "epoch": 0.8085829834367997, + "grad_norm": 1.868417739868164, + "learning_rate": 1.9224666488448904e-05, + "loss": 0.21295087337493895, + "step": 188340 + }, + { + "epoch": 0.8086259155268197, + "grad_norm": 0.011199853383004665, + "learning_rate": 1.9220354768331278e-05, + "loss": 0.314319372177124, + "step": 188350 + }, + { + "epoch": 0.8086688476168397, + "grad_norm": 2.2199318408966064, + "learning_rate": 1.9216043048213655e-05, + "loss": 0.2382221221923828, + "step": 188360 + }, + { + "epoch": 0.8087117797068597, + "grad_norm": 4.999961853027344, + "learning_rate": 1.921173132809603e-05, + "loss": 0.1867772102355957, + "step": 188370 + }, + { + "epoch": 0.8087547117968797, + "grad_norm": 2.920680046081543, + "learning_rate": 1.9207419607978407e-05, + "loss": 0.20712087154388428, + "step": 188380 + }, + { + "epoch": 0.8087976438868997, + "grad_norm": 0.06474506855010986, + "learning_rate": 1.9203107887860784e-05, + "loss": 0.16149909496307374, + "step": 188390 + }, + { + "epoch": 0.8088405759769197, + "grad_norm": 1.951294183731079, + "learning_rate": 1.919879616774316e-05, + "loss": 0.2100539207458496, + "step": 188400 + }, + { + "epoch": 0.8088835080669398, + "grad_norm": 1.694394826889038, + "learning_rate": 1.9194484447625535e-05, + "loss": 0.3848715782165527, + "step": 188410 + }, + { + "epoch": 0.8089264401569597, + "grad_norm": 0.07759755849838257, + "learning_rate": 1.9190172727507913e-05, + "loss": 0.10030233860015869, + "step": 188420 + }, + { + "epoch": 0.8089693722469797, + "grad_norm": 1.2198158502578735, + "learning_rate": 1.918586100739029e-05, + "loss": 0.20859754085540771, + "step": 188430 + }, + { + "epoch": 0.8090123043369998, + "grad_norm": 6.012630462646484, + "learning_rate": 1.9181549287272664e-05, + "loss": 0.16353113651275636, + "step": 188440 + }, + { + "epoch": 0.8090552364270197, + "grad_norm": 0.10439054667949677, + "learning_rate": 1.9177237567155045e-05, + "loss": 0.24715774059295653, + "step": 188450 + }, + { + "epoch": 0.8090981685170398, + "grad_norm": 16.584360122680664, + "learning_rate": 1.917292584703742e-05, + "loss": 0.26418266296386717, + "step": 188460 + }, + { + "epoch": 0.8091411006070598, + "grad_norm": 0.0019134439062327147, + "learning_rate": 1.9168614126919796e-05, + "loss": 0.10769253969192505, + "step": 188470 + }, + { + "epoch": 0.8091840326970797, + "grad_norm": 0.37607043981552124, + "learning_rate": 1.916430240680217e-05, + "loss": 0.11277778148651123, + "step": 188480 + }, + { + "epoch": 0.8092269647870998, + "grad_norm": 0.0044091567397117615, + "learning_rate": 1.9159990686684547e-05, + "loss": 0.12072253227233887, + "step": 188490 + }, + { + "epoch": 0.8092698968771198, + "grad_norm": 0.018574392423033714, + "learning_rate": 1.915567896656692e-05, + "loss": 0.2124023199081421, + "step": 188500 + }, + { + "epoch": 0.8093128289671397, + "grad_norm": 0.0848187655210495, + "learning_rate": 1.9151367246449302e-05, + "loss": 0.22679450511932372, + "step": 188510 + }, + { + "epoch": 0.8093557610571598, + "grad_norm": 0.003953431732952595, + "learning_rate": 1.9147055526331676e-05, + "loss": 0.03668028116226196, + "step": 188520 + }, + { + "epoch": 0.8093986931471798, + "grad_norm": 0.130598783493042, + "learning_rate": 1.9142743806214053e-05, + "loss": 0.3036438226699829, + "step": 188530 + }, + { + "epoch": 0.8094416252371998, + "grad_norm": 0.002816180931404233, + "learning_rate": 1.9138432086096427e-05, + "loss": 0.1459151029586792, + "step": 188540 + }, + { + "epoch": 0.8094845573272198, + "grad_norm": 0.04123365506529808, + "learning_rate": 1.9134120365978805e-05, + "loss": 0.12174561023712158, + "step": 188550 + }, + { + "epoch": 0.8095274894172398, + "grad_norm": 3.4493002891540527, + "learning_rate": 1.9129808645861182e-05, + "loss": 0.2582594156265259, + "step": 188560 + }, + { + "epoch": 0.8095704215072598, + "grad_norm": 3.131918430328369, + "learning_rate": 1.912549692574356e-05, + "loss": 0.19951019287109376, + "step": 188570 + }, + { + "epoch": 0.8096133535972798, + "grad_norm": 0.002401916077360511, + "learning_rate": 1.9121185205625933e-05, + "loss": 0.1220403790473938, + "step": 188580 + }, + { + "epoch": 0.8096562856872999, + "grad_norm": 0.11982887238264084, + "learning_rate": 1.911687348550831e-05, + "loss": 0.10341262817382812, + "step": 188590 + }, + { + "epoch": 0.8096992177773198, + "grad_norm": 4.427649974822998, + "learning_rate": 1.9112561765390685e-05, + "loss": 0.267809534072876, + "step": 188600 + }, + { + "epoch": 0.8097421498673398, + "grad_norm": 0.36299610137939453, + "learning_rate": 1.9108250045273062e-05, + "loss": 0.07970675826072693, + "step": 188610 + }, + { + "epoch": 0.8097850819573599, + "grad_norm": 3.0310583114624023, + "learning_rate": 1.910393832515544e-05, + "loss": 0.28852736949920654, + "step": 188620 + }, + { + "epoch": 0.8098280140473798, + "grad_norm": 0.010732367634773254, + "learning_rate": 1.9099626605037817e-05, + "loss": 0.14984482526779175, + "step": 188630 + }, + { + "epoch": 0.8098709461373998, + "grad_norm": 0.003036505077034235, + "learning_rate": 1.909531488492019e-05, + "loss": 0.1123344898223877, + "step": 188640 + }, + { + "epoch": 0.8099138782274199, + "grad_norm": 0.003406435251235962, + "learning_rate": 1.9091003164802568e-05, + "loss": 0.17016907930374145, + "step": 188650 + }, + { + "epoch": 0.8099568103174398, + "grad_norm": 0.07481379061937332, + "learning_rate": 1.9086691444684942e-05, + "loss": 0.0838213324546814, + "step": 188660 + }, + { + "epoch": 0.8099997424074599, + "grad_norm": 0.09323481470346451, + "learning_rate": 1.908237972456732e-05, + "loss": 0.019658631086349486, + "step": 188670 + }, + { + "epoch": 0.8100426744974799, + "grad_norm": 0.0037671183235943317, + "learning_rate": 1.9078068004449696e-05, + "loss": 0.047585475444793704, + "step": 188680 + }, + { + "epoch": 0.8100856065874998, + "grad_norm": 0.024669989943504333, + "learning_rate": 1.9073756284332074e-05, + "loss": 0.170916748046875, + "step": 188690 + }, + { + "epoch": 0.8101285386775199, + "grad_norm": 7.089773654937744, + "learning_rate": 1.9069444564214448e-05, + "loss": 0.1541322350502014, + "step": 188700 + }, + { + "epoch": 0.8101714707675399, + "grad_norm": 1.925876259803772, + "learning_rate": 1.9065132844096825e-05, + "loss": 0.3852232933044434, + "step": 188710 + }, + { + "epoch": 0.8102144028575599, + "grad_norm": 0.03101220354437828, + "learning_rate": 1.90608211239792e-05, + "loss": 0.056364941596984866, + "step": 188720 + }, + { + "epoch": 0.8102573349475799, + "grad_norm": 1.4782829284667969, + "learning_rate": 1.9056509403861576e-05, + "loss": 0.11494901180267333, + "step": 188730 + }, + { + "epoch": 0.8103002670375999, + "grad_norm": 0.12165243923664093, + "learning_rate": 1.9052197683743954e-05, + "loss": 0.12389819622039795, + "step": 188740 + }, + { + "epoch": 0.81034319912762, + "grad_norm": 4.626972675323486, + "learning_rate": 1.904788596362633e-05, + "loss": 0.124202561378479, + "step": 188750 + }, + { + "epoch": 0.8103861312176399, + "grad_norm": 1.9608211517333984, + "learning_rate": 1.9043574243508705e-05, + "loss": 0.28276519775390624, + "step": 188760 + }, + { + "epoch": 0.81042906330766, + "grad_norm": 0.014439301565289497, + "learning_rate": 1.9039262523391082e-05, + "loss": 0.3626107931137085, + "step": 188770 + }, + { + "epoch": 0.81047199539768, + "grad_norm": 0.04803458973765373, + "learning_rate": 1.9034950803273456e-05, + "loss": 0.16060223579406738, + "step": 188780 + }, + { + "epoch": 0.8105149274876999, + "grad_norm": 0.6107069849967957, + "learning_rate": 1.9030639083155834e-05, + "loss": 0.10360321998596192, + "step": 188790 + }, + { + "epoch": 0.81055785957772, + "grad_norm": 0.05219374597072601, + "learning_rate": 1.902632736303821e-05, + "loss": 0.1694638252258301, + "step": 188800 + }, + { + "epoch": 0.81060079166774, + "grad_norm": 0.4738638997077942, + "learning_rate": 1.902201564292059e-05, + "loss": 0.21596572399139405, + "step": 188810 + }, + { + "epoch": 0.81064372375776, + "grad_norm": 0.43081095814704895, + "learning_rate": 1.9017703922802966e-05, + "loss": 0.1942771077156067, + "step": 188820 + }, + { + "epoch": 0.81068665584778, + "grad_norm": 0.016578922048211098, + "learning_rate": 1.901339220268534e-05, + "loss": 0.1609804630279541, + "step": 188830 + }, + { + "epoch": 0.8107295879378, + "grad_norm": 3.316399574279785, + "learning_rate": 1.9009080482567717e-05, + "loss": 0.16462740898132325, + "step": 188840 + }, + { + "epoch": 0.81077252002782, + "grad_norm": 3.450151205062866, + "learning_rate": 1.900476876245009e-05, + "loss": 0.34598455429077146, + "step": 188850 + }, + { + "epoch": 0.81081545211784, + "grad_norm": 0.0006724316626787186, + "learning_rate": 1.9000457042332472e-05, + "loss": 0.1317456841468811, + "step": 188860 + }, + { + "epoch": 0.81085838420786, + "grad_norm": 0.0004210352199152112, + "learning_rate": 1.8996145322214846e-05, + "loss": 0.0758798897266388, + "step": 188870 + }, + { + "epoch": 0.81090131629788, + "grad_norm": 0.001141284592449665, + "learning_rate": 1.8991833602097223e-05, + "loss": 0.1845982313156128, + "step": 188880 + }, + { + "epoch": 0.8109442483879, + "grad_norm": 0.002732800552621484, + "learning_rate": 1.8987521881979597e-05, + "loss": 0.2241126298904419, + "step": 188890 + }, + { + "epoch": 0.8109871804779201, + "grad_norm": 1.1904443502426147, + "learning_rate": 1.8983210161861974e-05, + "loss": 0.4464689254760742, + "step": 188900 + }, + { + "epoch": 0.81103011256794, + "grad_norm": 0.06258974969387054, + "learning_rate": 1.8978898441744348e-05, + "loss": 0.06665477752685547, + "step": 188910 + }, + { + "epoch": 0.81107304465796, + "grad_norm": 1.2643054723739624, + "learning_rate": 1.897458672162673e-05, + "loss": 0.18965506553649902, + "step": 188920 + }, + { + "epoch": 0.8111159767479801, + "grad_norm": 0.11003031581640244, + "learning_rate": 1.8970275001509103e-05, + "loss": 0.0030824728310108185, + "step": 188930 + }, + { + "epoch": 0.811158908838, + "grad_norm": 0.01650061085820198, + "learning_rate": 1.896596328139148e-05, + "loss": 0.16442786455154418, + "step": 188940 + }, + { + "epoch": 0.8112018409280201, + "grad_norm": 0.006386617664247751, + "learning_rate": 1.8961651561273854e-05, + "loss": 0.18398728370666503, + "step": 188950 + }, + { + "epoch": 0.8112447730180401, + "grad_norm": 2.380122661590576, + "learning_rate": 1.895733984115623e-05, + "loss": 0.14523251056671144, + "step": 188960 + }, + { + "epoch": 0.81128770510806, + "grad_norm": 0.021122131496667862, + "learning_rate": 1.895302812103861e-05, + "loss": 0.22241947650909424, + "step": 188970 + }, + { + "epoch": 0.8113306371980801, + "grad_norm": 4.08834171295166, + "learning_rate": 1.8948716400920986e-05, + "loss": 0.44264769554138184, + "step": 188980 + }, + { + "epoch": 0.8113735692881001, + "grad_norm": 0.006722049321979284, + "learning_rate": 1.894440468080336e-05, + "loss": 0.298035717010498, + "step": 188990 + }, + { + "epoch": 0.81141650137812, + "grad_norm": 1.6981254816055298, + "learning_rate": 1.8940092960685738e-05, + "loss": 0.14734795093536376, + "step": 189000 + }, + { + "epoch": 0.81141650137812, + "eval_loss": 0.38009634613990784, + "eval_runtime": 27.5037, + "eval_samples_per_second": 3.636, + "eval_steps_per_second": 3.636, + "step": 189000 + }, + { + "epoch": 0.8114594334681401, + "grad_norm": 0.4784759283065796, + "learning_rate": 1.893578124056811e-05, + "loss": 0.2854416608810425, + "step": 189010 + }, + { + "epoch": 0.8115023655581601, + "grad_norm": 1.5554957389831543, + "learning_rate": 1.893146952045049e-05, + "loss": 0.32027492523193357, + "step": 189020 + }, + { + "epoch": 0.8115452976481801, + "grad_norm": 1.664131999015808, + "learning_rate": 1.8927157800332866e-05, + "loss": 0.16618586778640748, + "step": 189030 + }, + { + "epoch": 0.8115882297382001, + "grad_norm": 0.34115976095199585, + "learning_rate": 1.8922846080215244e-05, + "loss": 0.15978493690490722, + "step": 189040 + }, + { + "epoch": 0.8116311618282201, + "grad_norm": 1.0413432121276855, + "learning_rate": 1.8918534360097618e-05, + "loss": 0.26266343593597413, + "step": 189050 + }, + { + "epoch": 0.8116740939182401, + "grad_norm": 0.5915712714195251, + "learning_rate": 1.8914222639979995e-05, + "loss": 0.0370695948600769, + "step": 189060 + }, + { + "epoch": 0.8117170260082601, + "grad_norm": 0.01841799169778824, + "learning_rate": 1.890991091986237e-05, + "loss": 0.11278891563415527, + "step": 189070 + }, + { + "epoch": 0.8117599580982802, + "grad_norm": 6.7392964363098145, + "learning_rate": 1.8905599199744746e-05, + "loss": 0.45865182876586913, + "step": 189080 + }, + { + "epoch": 0.8118028901883001, + "grad_norm": 1.3006116151809692, + "learning_rate": 1.8901287479627123e-05, + "loss": 0.14712069034576417, + "step": 189090 + }, + { + "epoch": 0.8118458222783201, + "grad_norm": 0.007229759357869625, + "learning_rate": 1.88969757595095e-05, + "loss": 0.07043436765670777, + "step": 189100 + }, + { + "epoch": 0.8118887543683402, + "grad_norm": 0.012855586595833302, + "learning_rate": 1.8892664039391875e-05, + "loss": 0.13676822185516357, + "step": 189110 + }, + { + "epoch": 0.8119316864583601, + "grad_norm": 8.256521224975586, + "learning_rate": 1.8888352319274252e-05, + "loss": 0.151786732673645, + "step": 189120 + }, + { + "epoch": 0.8119746185483802, + "grad_norm": 0.0016009919345378876, + "learning_rate": 1.8884040599156626e-05, + "loss": 0.22647907733917236, + "step": 189130 + }, + { + "epoch": 0.8120175506384002, + "grad_norm": 0.039503589272499084, + "learning_rate": 1.8879728879039003e-05, + "loss": 0.2703474283218384, + "step": 189140 + }, + { + "epoch": 0.8120604827284201, + "grad_norm": 0.0002003997069550678, + "learning_rate": 1.887541715892138e-05, + "loss": 0.2675285577774048, + "step": 189150 + }, + { + "epoch": 0.8121034148184402, + "grad_norm": 0.0024399063549935818, + "learning_rate": 1.8871105438803758e-05, + "loss": 0.10987521409988403, + "step": 189160 + }, + { + "epoch": 0.8121463469084602, + "grad_norm": 9.465962409973145, + "learning_rate": 1.8866793718686135e-05, + "loss": 0.11177196502685546, + "step": 189170 + }, + { + "epoch": 0.8121892789984803, + "grad_norm": 0.016127515584230423, + "learning_rate": 1.886248199856851e-05, + "loss": 0.1671124815940857, + "step": 189180 + }, + { + "epoch": 0.8122322110885002, + "grad_norm": 0.6212892532348633, + "learning_rate": 1.8858170278450887e-05, + "loss": 0.2125856876373291, + "step": 189190 + }, + { + "epoch": 0.8122751431785202, + "grad_norm": 0.16576534509658813, + "learning_rate": 1.885385855833326e-05, + "loss": 0.19760308265686036, + "step": 189200 + }, + { + "epoch": 0.8123180752685403, + "grad_norm": 0.008632770739495754, + "learning_rate": 1.884954683821564e-05, + "loss": 0.17998934984207154, + "step": 189210 + }, + { + "epoch": 0.8123610073585602, + "grad_norm": 0.004760857205837965, + "learning_rate": 1.8845235118098015e-05, + "loss": 0.10525168180465698, + "step": 189220 + }, + { + "epoch": 0.8124039394485802, + "grad_norm": 7.089337348937988, + "learning_rate": 1.8840923397980393e-05, + "loss": 0.0909703254699707, + "step": 189230 + }, + { + "epoch": 0.8124468715386003, + "grad_norm": 0.01754256896674633, + "learning_rate": 1.8836611677862767e-05, + "loss": 0.21712646484375, + "step": 189240 + }, + { + "epoch": 0.8124898036286202, + "grad_norm": 1.2798113822937012, + "learning_rate": 1.8832299957745144e-05, + "loss": 0.2140897274017334, + "step": 189250 + }, + { + "epoch": 0.8125327357186403, + "grad_norm": 25.59235191345215, + "learning_rate": 1.8827988237627518e-05, + "loss": 0.2592212915420532, + "step": 189260 + }, + { + "epoch": 0.8125756678086603, + "grad_norm": 1.0722297430038452, + "learning_rate": 1.88236765175099e-05, + "loss": 0.20234456062316894, + "step": 189270 + }, + { + "epoch": 0.8126185998986802, + "grad_norm": 0.06800615787506104, + "learning_rate": 1.8819364797392273e-05, + "loss": 0.17045010328292848, + "step": 189280 + }, + { + "epoch": 0.8126615319887003, + "grad_norm": 1.14657723903656, + "learning_rate": 1.881505307727465e-05, + "loss": 0.167192280292511, + "step": 189290 + }, + { + "epoch": 0.8127044640787203, + "grad_norm": 31.580127716064453, + "learning_rate": 1.8810741357157024e-05, + "loss": 0.2977290153503418, + "step": 189300 + }, + { + "epoch": 0.8127473961687403, + "grad_norm": 2.1737892627716064, + "learning_rate": 1.88064296370394e-05, + "loss": 0.2161936044692993, + "step": 189310 + }, + { + "epoch": 0.8127903282587603, + "grad_norm": 0.0015936638228595257, + "learning_rate": 1.880211791692178e-05, + "loss": 0.08559054732322693, + "step": 189320 + }, + { + "epoch": 0.8128332603487803, + "grad_norm": 0.04431832209229469, + "learning_rate": 1.8797806196804156e-05, + "loss": 0.2089853048324585, + "step": 189330 + }, + { + "epoch": 0.8128761924388003, + "grad_norm": 0.013524886220693588, + "learning_rate": 1.879349447668653e-05, + "loss": 0.10106925964355469, + "step": 189340 + }, + { + "epoch": 0.8129191245288203, + "grad_norm": 0.21533319354057312, + "learning_rate": 1.8789182756568907e-05, + "loss": 0.1647628664970398, + "step": 189350 + }, + { + "epoch": 0.8129620566188404, + "grad_norm": 0.007309638429433107, + "learning_rate": 1.878487103645128e-05, + "loss": 0.280776047706604, + "step": 189360 + }, + { + "epoch": 0.8130049887088603, + "grad_norm": 0.030792390927672386, + "learning_rate": 1.878055931633366e-05, + "loss": 0.23691213130950928, + "step": 189370 + }, + { + "epoch": 0.8130479207988803, + "grad_norm": 0.0014829770661890507, + "learning_rate": 1.8776247596216036e-05, + "loss": 0.06938903331756592, + "step": 189380 + }, + { + "epoch": 0.8130908528889004, + "grad_norm": 1.4969149827957153, + "learning_rate": 1.8771935876098413e-05, + "loss": 0.23304481506347657, + "step": 189390 + }, + { + "epoch": 0.8131337849789203, + "grad_norm": 5.537669658660889, + "learning_rate": 1.8767624155980787e-05, + "loss": 0.15186458826065063, + "step": 189400 + }, + { + "epoch": 0.8131767170689403, + "grad_norm": 1.8411829471588135, + "learning_rate": 1.8763312435863165e-05, + "loss": 0.14897530078887938, + "step": 189410 + }, + { + "epoch": 0.8132196491589604, + "grad_norm": 0.00035322734038345516, + "learning_rate": 1.875900071574554e-05, + "loss": 0.257629919052124, + "step": 189420 + }, + { + "epoch": 0.8132625812489803, + "grad_norm": 0.004576101899147034, + "learning_rate": 1.8754688995627916e-05, + "loss": 0.1692986011505127, + "step": 189430 + }, + { + "epoch": 0.8133055133390004, + "grad_norm": 1.1908814907073975, + "learning_rate": 1.8750377275510293e-05, + "loss": 0.1332242012023926, + "step": 189440 + }, + { + "epoch": 0.8133484454290204, + "grad_norm": 1.6177889108657837, + "learning_rate": 1.874606555539267e-05, + "loss": 0.23381438255310058, + "step": 189450 + }, + { + "epoch": 0.8133913775190403, + "grad_norm": 0.0006076296558603644, + "learning_rate": 1.8741753835275045e-05, + "loss": 0.35434651374816895, + "step": 189460 + }, + { + "epoch": 0.8134343096090604, + "grad_norm": 0.008793370798230171, + "learning_rate": 1.8737442115157422e-05, + "loss": 0.2348698139190674, + "step": 189470 + }, + { + "epoch": 0.8134772416990804, + "grad_norm": 0.10453282296657562, + "learning_rate": 1.8733130395039796e-05, + "loss": 0.16681669950485228, + "step": 189480 + }, + { + "epoch": 0.8135201737891004, + "grad_norm": 0.01704632118344307, + "learning_rate": 1.8728818674922173e-05, + "loss": 0.23047642707824706, + "step": 189490 + }, + { + "epoch": 0.8135631058791204, + "grad_norm": 0.005977040156722069, + "learning_rate": 1.872450695480455e-05, + "loss": 0.0015470744110643864, + "step": 189500 + }, + { + "epoch": 0.8136060379691404, + "grad_norm": 0.014360605739057064, + "learning_rate": 1.8720195234686928e-05, + "loss": 0.03445011377334595, + "step": 189510 + }, + { + "epoch": 0.8136489700591604, + "grad_norm": 0.002405626932159066, + "learning_rate": 1.8715883514569305e-05, + "loss": 0.06739037036895752, + "step": 189520 + }, + { + "epoch": 0.8136919021491804, + "grad_norm": 0.00946758408099413, + "learning_rate": 1.871157179445168e-05, + "loss": 0.026663467288017273, + "step": 189530 + }, + { + "epoch": 0.8137348342392005, + "grad_norm": 0.0020533869974315166, + "learning_rate": 1.8707260074334056e-05, + "loss": 0.10337278842926026, + "step": 189540 + }, + { + "epoch": 0.8137777663292204, + "grad_norm": 1.2318427562713623, + "learning_rate": 1.870294835421643e-05, + "loss": 0.17301219701766968, + "step": 189550 + }, + { + "epoch": 0.8138206984192404, + "grad_norm": 0.0025602129753679037, + "learning_rate": 1.869863663409881e-05, + "loss": 0.2267094373703003, + "step": 189560 + }, + { + "epoch": 0.8138636305092605, + "grad_norm": 8.636144638061523, + "learning_rate": 1.8694324913981185e-05, + "loss": 0.6510757446289063, + "step": 189570 + }, + { + "epoch": 0.8139065625992804, + "grad_norm": 16.425983428955078, + "learning_rate": 1.8690013193863562e-05, + "loss": 0.23902950286865235, + "step": 189580 + }, + { + "epoch": 0.8139494946893004, + "grad_norm": 0.005090423859655857, + "learning_rate": 1.8685701473745936e-05, + "loss": 0.38922483921051027, + "step": 189590 + }, + { + "epoch": 0.8139924267793205, + "grad_norm": 3.3574795722961426, + "learning_rate": 1.8681389753628314e-05, + "loss": 0.33330717086791994, + "step": 189600 + }, + { + "epoch": 0.8140353588693405, + "grad_norm": 0.035569801926612854, + "learning_rate": 1.8677078033510688e-05, + "loss": 0.2678635835647583, + "step": 189610 + }, + { + "epoch": 0.8140782909593605, + "grad_norm": 0.08557265251874924, + "learning_rate": 1.867276631339307e-05, + "loss": 0.10242055654525757, + "step": 189620 + }, + { + "epoch": 0.8141212230493805, + "grad_norm": 6.003871440887451, + "learning_rate": 1.8668454593275442e-05, + "loss": 0.20186066627502441, + "step": 189630 + }, + { + "epoch": 0.8141641551394005, + "grad_norm": 0.0007279837154783309, + "learning_rate": 1.866414287315782e-05, + "loss": 0.3290987730026245, + "step": 189640 + }, + { + "epoch": 0.8142070872294205, + "grad_norm": 0.07561841607093811, + "learning_rate": 1.8659831153040194e-05, + "loss": 0.2522317409515381, + "step": 189650 + }, + { + "epoch": 0.8142500193194405, + "grad_norm": 1.2994825839996338, + "learning_rate": 1.865551943292257e-05, + "loss": 0.13057563304901124, + "step": 189660 + }, + { + "epoch": 0.8142929514094606, + "grad_norm": 1.358086347579956, + "learning_rate": 1.865120771280495e-05, + "loss": 0.27964789867401124, + "step": 189670 + }, + { + "epoch": 0.8143358834994805, + "grad_norm": 2.5652987957000732, + "learning_rate": 1.8646895992687326e-05, + "loss": 0.4078275203704834, + "step": 189680 + }, + { + "epoch": 0.8143788155895005, + "grad_norm": 1.2036330699920654, + "learning_rate": 1.86425842725697e-05, + "loss": 0.3373491048812866, + "step": 189690 + }, + { + "epoch": 0.8144217476795206, + "grad_norm": 0.638670027256012, + "learning_rate": 1.8638272552452077e-05, + "loss": 0.10838404893875123, + "step": 189700 + }, + { + "epoch": 0.8144646797695405, + "grad_norm": 2.393080472946167, + "learning_rate": 1.863396083233445e-05, + "loss": 0.23193233013153075, + "step": 189710 + }, + { + "epoch": 0.8145076118595606, + "grad_norm": 0.7426817417144775, + "learning_rate": 1.862964911221683e-05, + "loss": 0.148024320602417, + "step": 189720 + }, + { + "epoch": 0.8145505439495806, + "grad_norm": 0.6907213926315308, + "learning_rate": 1.8625337392099206e-05, + "loss": 0.13800256252288817, + "step": 189730 + }, + { + "epoch": 0.8145934760396005, + "grad_norm": 2.134216547012329, + "learning_rate": 1.8621025671981583e-05, + "loss": 0.16400604248046874, + "step": 189740 + }, + { + "epoch": 0.8146364081296206, + "grad_norm": 0.004457848146557808, + "learning_rate": 1.8616713951863957e-05, + "loss": 0.01569632887840271, + "step": 189750 + }, + { + "epoch": 0.8146793402196406, + "grad_norm": 0.025374725461006165, + "learning_rate": 1.8612402231746334e-05, + "loss": 0.11625961065292359, + "step": 189760 + }, + { + "epoch": 0.8147222723096605, + "grad_norm": 6.314393043518066, + "learning_rate": 1.8608090511628708e-05, + "loss": 0.3577938795089722, + "step": 189770 + }, + { + "epoch": 0.8147652043996806, + "grad_norm": 0.018094485625624657, + "learning_rate": 1.8603778791511086e-05, + "loss": 0.19390039443969725, + "step": 189780 + }, + { + "epoch": 0.8148081364897006, + "grad_norm": 0.022799383848905563, + "learning_rate": 1.8599467071393463e-05, + "loss": 0.1557063341140747, + "step": 189790 + }, + { + "epoch": 0.8148510685797206, + "grad_norm": 0.0029004893731325865, + "learning_rate": 1.859515535127584e-05, + "loss": 0.024601130187511443, + "step": 189800 + }, + { + "epoch": 0.8148940006697406, + "grad_norm": 0.009202632121741772, + "learning_rate": 1.8590843631158214e-05, + "loss": 0.27263336181640624, + "step": 189810 + }, + { + "epoch": 0.8149369327597606, + "grad_norm": 0.00577400391921401, + "learning_rate": 1.858653191104059e-05, + "loss": 0.04384286105632782, + "step": 189820 + }, + { + "epoch": 0.8149798648497806, + "grad_norm": 0.00864651519805193, + "learning_rate": 1.8582220190922966e-05, + "loss": 0.16352792978286743, + "step": 189830 + }, + { + "epoch": 0.8150227969398006, + "grad_norm": 0.06453585624694824, + "learning_rate": 1.8577908470805343e-05, + "loss": 0.25619847774505616, + "step": 189840 + }, + { + "epoch": 0.8150657290298207, + "grad_norm": 0.8390404582023621, + "learning_rate": 1.857359675068772e-05, + "loss": 0.1784263014793396, + "step": 189850 + }, + { + "epoch": 0.8151086611198406, + "grad_norm": 1.5671316385269165, + "learning_rate": 1.8569285030570098e-05, + "loss": 0.1716221332550049, + "step": 189860 + }, + { + "epoch": 0.8151515932098606, + "grad_norm": 0.026777099817991257, + "learning_rate": 1.856497331045247e-05, + "loss": 0.14569276571273804, + "step": 189870 + }, + { + "epoch": 0.8151945252998807, + "grad_norm": 0.06430796533823013, + "learning_rate": 1.856066159033485e-05, + "loss": 0.15251225233078003, + "step": 189880 + }, + { + "epoch": 0.8152374573899006, + "grad_norm": 2.121525764465332, + "learning_rate": 1.8556349870217226e-05, + "loss": 0.27808551788330077, + "step": 189890 + }, + { + "epoch": 0.8152803894799207, + "grad_norm": 0.003509877948090434, + "learning_rate": 1.85520381500996e-05, + "loss": 0.18043378591537476, + "step": 189900 + }, + { + "epoch": 0.8153233215699407, + "grad_norm": 1.6782100200653076, + "learning_rate": 1.8547726429981978e-05, + "loss": 0.2832813262939453, + "step": 189910 + }, + { + "epoch": 0.8153662536599606, + "grad_norm": 0.006824735086411238, + "learning_rate": 1.8543414709864355e-05, + "loss": 0.35706157684326173, + "step": 189920 + }, + { + "epoch": 0.8154091857499807, + "grad_norm": 0.0013534731697291136, + "learning_rate": 1.8539102989746732e-05, + "loss": 0.08758707642555237, + "step": 189930 + }, + { + "epoch": 0.8154521178400007, + "grad_norm": 1.16303551197052, + "learning_rate": 1.8534791269629106e-05, + "loss": 0.41260361671447754, + "step": 189940 + }, + { + "epoch": 0.8154950499300206, + "grad_norm": 0.008056594990193844, + "learning_rate": 1.8530479549511483e-05, + "loss": 0.2539925813674927, + "step": 189950 + }, + { + "epoch": 0.8155379820200407, + "grad_norm": 2.1018102169036865, + "learning_rate": 1.8526167829393857e-05, + "loss": 0.25430138111114503, + "step": 189960 + }, + { + "epoch": 0.8155809141100607, + "grad_norm": 2.4627227783203125, + "learning_rate": 1.8521856109276238e-05, + "loss": 0.29794859886169434, + "step": 189970 + }, + { + "epoch": 0.8156238462000807, + "grad_norm": 0.7906169295310974, + "learning_rate": 1.8517544389158612e-05, + "loss": 0.17653955221176149, + "step": 189980 + }, + { + "epoch": 0.8156667782901007, + "grad_norm": 0.1337890774011612, + "learning_rate": 1.851323266904099e-05, + "loss": 0.17861900329589844, + "step": 189990 + }, + { + "epoch": 0.8157097103801207, + "grad_norm": 0.7123793363571167, + "learning_rate": 1.8508920948923363e-05, + "loss": 0.22133033275604247, + "step": 190000 + }, + { + "epoch": 0.8157097103801207, + "eval_loss": 0.3748077154159546, + "eval_runtime": 27.4732, + "eval_samples_per_second": 3.64, + "eval_steps_per_second": 3.64, + "step": 190000 + }, + { + "epoch": 0.8157526424701407, + "grad_norm": 0.008327401243150234, + "learning_rate": 1.850460922880574e-05, + "loss": 0.14534246921539307, + "step": 190010 + }, + { + "epoch": 0.8157955745601607, + "grad_norm": 0.032320182770490646, + "learning_rate": 1.8500297508688115e-05, + "loss": 0.28674242496490476, + "step": 190020 + }, + { + "epoch": 0.8158385066501808, + "grad_norm": 0.01738484762609005, + "learning_rate": 1.8495985788570495e-05, + "loss": 0.03833106756210327, + "step": 190030 + }, + { + "epoch": 0.8158814387402008, + "grad_norm": 0.005880672950297594, + "learning_rate": 1.849167406845287e-05, + "loss": 0.04911441206932068, + "step": 190040 + }, + { + "epoch": 0.8159243708302207, + "grad_norm": 0.037115778774023056, + "learning_rate": 1.8487362348335247e-05, + "loss": 0.2223417043685913, + "step": 190050 + }, + { + "epoch": 0.8159673029202408, + "grad_norm": 0.9087185859680176, + "learning_rate": 1.848305062821762e-05, + "loss": 0.12857097387313843, + "step": 190060 + }, + { + "epoch": 0.8160102350102608, + "grad_norm": 0.000869360170327127, + "learning_rate": 1.8478738908099998e-05, + "loss": 0.22225430011749267, + "step": 190070 + }, + { + "epoch": 0.8160531671002808, + "grad_norm": 0.011036857031285763, + "learning_rate": 1.8474427187982375e-05, + "loss": 0.1313633918762207, + "step": 190080 + }, + { + "epoch": 0.8160960991903008, + "grad_norm": 0.19269496202468872, + "learning_rate": 1.8470115467864753e-05, + "loss": 0.038611036539077756, + "step": 190090 + }, + { + "epoch": 0.8161390312803208, + "grad_norm": 1.4396089315414429, + "learning_rate": 1.8465803747747127e-05, + "loss": 0.14506794214248658, + "step": 190100 + }, + { + "epoch": 0.8161819633703408, + "grad_norm": 0.0028870003297924995, + "learning_rate": 1.8461492027629504e-05, + "loss": 0.13069781064987182, + "step": 190110 + }, + { + "epoch": 0.8162248954603608, + "grad_norm": 1.1406079530715942, + "learning_rate": 1.8457180307511878e-05, + "loss": 0.4691906929016113, + "step": 190120 + }, + { + "epoch": 0.8162678275503809, + "grad_norm": 0.00298871798440814, + "learning_rate": 1.8452868587394255e-05, + "loss": 0.12180900573730469, + "step": 190130 + }, + { + "epoch": 0.8163107596404008, + "grad_norm": 0.05643042176961899, + "learning_rate": 1.8448556867276633e-05, + "loss": 0.26197865009307864, + "step": 190140 + }, + { + "epoch": 0.8163536917304208, + "grad_norm": 0.010706533677875996, + "learning_rate": 1.844424514715901e-05, + "loss": 0.05873526334762573, + "step": 190150 + }, + { + "epoch": 0.8163966238204409, + "grad_norm": 2.701686143875122, + "learning_rate": 1.8439933427041384e-05, + "loss": 0.09920306205749511, + "step": 190160 + }, + { + "epoch": 0.8164395559104608, + "grad_norm": 0.010354182682931423, + "learning_rate": 1.843562170692376e-05, + "loss": 0.23539597988128663, + "step": 190170 + }, + { + "epoch": 0.8164824880004808, + "grad_norm": 3.4639413356781006, + "learning_rate": 1.8431309986806135e-05, + "loss": 0.2738173007965088, + "step": 190180 + }, + { + "epoch": 0.8165254200905009, + "grad_norm": 0.8891342878341675, + "learning_rate": 1.8426998266688513e-05, + "loss": 0.0747622311115265, + "step": 190190 + }, + { + "epoch": 0.8165683521805208, + "grad_norm": 3.6724092960357666, + "learning_rate": 1.842268654657089e-05, + "loss": 0.3093080520629883, + "step": 190200 + }, + { + "epoch": 0.8166112842705409, + "grad_norm": 2.065101146697998, + "learning_rate": 1.8418374826453267e-05, + "loss": 0.24554228782653809, + "step": 190210 + }, + { + "epoch": 0.8166542163605609, + "grad_norm": 0.7292916178703308, + "learning_rate": 1.841406310633564e-05, + "loss": 0.28256423473358155, + "step": 190220 + }, + { + "epoch": 0.8166971484505808, + "grad_norm": 0.032051555812358856, + "learning_rate": 1.840975138621802e-05, + "loss": 0.16433157920837402, + "step": 190230 + }, + { + "epoch": 0.8167400805406009, + "grad_norm": 0.0009634624002501369, + "learning_rate": 1.8405439666100393e-05, + "loss": 0.1607471823692322, + "step": 190240 + }, + { + "epoch": 0.8167830126306209, + "grad_norm": 0.1358780860900879, + "learning_rate": 1.840112794598277e-05, + "loss": 0.19506868124008178, + "step": 190250 + }, + { + "epoch": 0.8168259447206409, + "grad_norm": 0.05440731719136238, + "learning_rate": 1.8396816225865147e-05, + "loss": 0.10299488306045532, + "step": 190260 + }, + { + "epoch": 0.8168688768106609, + "grad_norm": 0.0032140351831912994, + "learning_rate": 1.8392504505747525e-05, + "loss": 0.18394207954406738, + "step": 190270 + }, + { + "epoch": 0.8169118089006809, + "grad_norm": 0.058177292346954346, + "learning_rate": 1.8388192785629902e-05, + "loss": 0.1681948184967041, + "step": 190280 + }, + { + "epoch": 0.8169547409907009, + "grad_norm": 0.0030108222272247076, + "learning_rate": 1.8383881065512276e-05, + "loss": 0.2944432258605957, + "step": 190290 + }, + { + "epoch": 0.8169976730807209, + "grad_norm": 0.32893991470336914, + "learning_rate": 1.8379569345394653e-05, + "loss": 0.10108593702316285, + "step": 190300 + }, + { + "epoch": 0.817040605170741, + "grad_norm": 0.3538655936717987, + "learning_rate": 1.8375257625277027e-05, + "loss": 0.30429155826568605, + "step": 190310 + }, + { + "epoch": 0.8170835372607609, + "grad_norm": 3.212498426437378, + "learning_rate": 1.8370945905159408e-05, + "loss": 0.06719596982002259, + "step": 190320 + }, + { + "epoch": 0.8171264693507809, + "grad_norm": 0.026576699689030647, + "learning_rate": 1.8366634185041782e-05, + "loss": 0.09907512664794922, + "step": 190330 + }, + { + "epoch": 0.817169401440801, + "grad_norm": 0.9718691110610962, + "learning_rate": 1.836232246492416e-05, + "loss": 0.1221091628074646, + "step": 190340 + }, + { + "epoch": 0.8172123335308209, + "grad_norm": 0.8951011300086975, + "learning_rate": 1.8358010744806533e-05, + "loss": 0.13755905628204346, + "step": 190350 + }, + { + "epoch": 0.8172552656208409, + "grad_norm": 0.9757001996040344, + "learning_rate": 1.835369902468891e-05, + "loss": 0.27019762992858887, + "step": 190360 + }, + { + "epoch": 0.817298197710861, + "grad_norm": 0.0059048268012702465, + "learning_rate": 1.8349387304571284e-05, + "loss": 0.24034535884857178, + "step": 190370 + }, + { + "epoch": 0.8173411298008809, + "grad_norm": 0.9905226230621338, + "learning_rate": 1.8345075584453665e-05, + "loss": 0.34694108963012693, + "step": 190380 + }, + { + "epoch": 0.817384061890901, + "grad_norm": 3.6334996223449707, + "learning_rate": 1.834076386433604e-05, + "loss": 0.22657389640808107, + "step": 190390 + }, + { + "epoch": 0.817426993980921, + "grad_norm": 1.3295059204101562, + "learning_rate": 1.8336452144218416e-05, + "loss": 0.12326008081436157, + "step": 190400 + }, + { + "epoch": 0.8174699260709409, + "grad_norm": 0.014637632295489311, + "learning_rate": 1.833214042410079e-05, + "loss": 0.3915814161300659, + "step": 190410 + }, + { + "epoch": 0.817512858160961, + "grad_norm": 0.007784396409988403, + "learning_rate": 1.8327828703983168e-05, + "loss": 0.11680706739425659, + "step": 190420 + }, + { + "epoch": 0.817555790250981, + "grad_norm": 0.010571125894784927, + "learning_rate": 1.8323516983865545e-05, + "loss": 0.08030446767807006, + "step": 190430 + }, + { + "epoch": 0.817598722341001, + "grad_norm": 0.01794949173927307, + "learning_rate": 1.8319205263747922e-05, + "loss": 0.3555665731430054, + "step": 190440 + }, + { + "epoch": 0.817641654431021, + "grad_norm": 0.02172680012881756, + "learning_rate": 1.8314893543630296e-05, + "loss": 0.12549080848693847, + "step": 190450 + }, + { + "epoch": 0.817684586521041, + "grad_norm": 0.057841021567583084, + "learning_rate": 1.8310581823512674e-05, + "loss": 0.27305548191070556, + "step": 190460 + }, + { + "epoch": 0.8177275186110611, + "grad_norm": 0.006068886257708073, + "learning_rate": 1.8306270103395048e-05, + "loss": 0.20729308128356932, + "step": 190470 + }, + { + "epoch": 0.817770450701081, + "grad_norm": 0.02753814309835434, + "learning_rate": 1.8301958383277425e-05, + "loss": 0.2326343536376953, + "step": 190480 + }, + { + "epoch": 0.817813382791101, + "grad_norm": 0.08779291808605194, + "learning_rate": 1.8297646663159802e-05, + "loss": 0.05305823087692261, + "step": 190490 + }, + { + "epoch": 0.8178563148811211, + "grad_norm": 1.9648560285568237, + "learning_rate": 1.829333494304218e-05, + "loss": 0.20017385482788086, + "step": 190500 + }, + { + "epoch": 0.817899246971141, + "grad_norm": 0.003839155426248908, + "learning_rate": 1.8289023222924554e-05, + "loss": 0.2160346508026123, + "step": 190510 + }, + { + "epoch": 0.8179421790611611, + "grad_norm": 1.4220077991485596, + "learning_rate": 1.828471150280693e-05, + "loss": 0.3003256320953369, + "step": 190520 + }, + { + "epoch": 0.8179851111511811, + "grad_norm": 0.06661306321620941, + "learning_rate": 1.8280399782689305e-05, + "loss": 0.16826083660125732, + "step": 190530 + }, + { + "epoch": 0.818028043241201, + "grad_norm": 1.7498247623443604, + "learning_rate": 1.8276088062571682e-05, + "loss": 0.14414217472076415, + "step": 190540 + }, + { + "epoch": 0.8180709753312211, + "grad_norm": 0.0036016765516251326, + "learning_rate": 1.827177634245406e-05, + "loss": 0.2631456136703491, + "step": 190550 + }, + { + "epoch": 0.8181139074212411, + "grad_norm": 0.011093534529209137, + "learning_rate": 1.8267464622336437e-05, + "loss": 0.0830884575843811, + "step": 190560 + }, + { + "epoch": 0.8181568395112611, + "grad_norm": 0.008177113719284534, + "learning_rate": 1.826315290221881e-05, + "loss": 0.16356415748596193, + "step": 190570 + }, + { + "epoch": 0.8181997716012811, + "grad_norm": 1.0167497396469116, + "learning_rate": 1.825884118210119e-05, + "loss": 0.2114093065261841, + "step": 190580 + }, + { + "epoch": 0.8182427036913011, + "grad_norm": 1.0505751371383667, + "learning_rate": 1.8254529461983562e-05, + "loss": 0.09989173412322998, + "step": 190590 + }, + { + "epoch": 0.8182856357813211, + "grad_norm": 0.003051972948014736, + "learning_rate": 1.825021774186594e-05, + "loss": 0.21981940269470215, + "step": 190600 + }, + { + "epoch": 0.8183285678713411, + "grad_norm": 0.012448975816369057, + "learning_rate": 1.8245906021748317e-05, + "loss": 0.04102669060230255, + "step": 190610 + }, + { + "epoch": 0.8183714999613612, + "grad_norm": 0.29564034938812256, + "learning_rate": 1.8241594301630694e-05, + "loss": 0.16563717126846314, + "step": 190620 + }, + { + "epoch": 0.8184144320513811, + "grad_norm": 0.0020702462643384933, + "learning_rate": 1.823728258151307e-05, + "loss": 0.3559636116027832, + "step": 190630 + }, + { + "epoch": 0.8184573641414011, + "grad_norm": 0.8853886723518372, + "learning_rate": 1.8232970861395446e-05, + "loss": 0.3207648754119873, + "step": 190640 + }, + { + "epoch": 0.8185002962314212, + "grad_norm": 0.05311070755124092, + "learning_rate": 1.8228659141277823e-05, + "loss": 0.22375433444976806, + "step": 190650 + }, + { + "epoch": 0.8185432283214411, + "grad_norm": 6.844063758850098, + "learning_rate": 1.8224347421160197e-05, + "loss": 0.32745258808135985, + "step": 190660 + }, + { + "epoch": 0.8185861604114612, + "grad_norm": 0.017273802310228348, + "learning_rate": 1.8220035701042574e-05, + "loss": 0.34038119316101073, + "step": 190670 + }, + { + "epoch": 0.8186290925014812, + "grad_norm": 0.05780120566487312, + "learning_rate": 1.821572398092495e-05, + "loss": 0.16594982147216797, + "step": 190680 + }, + { + "epoch": 0.8186720245915011, + "grad_norm": 0.04340917989611626, + "learning_rate": 1.821141226080733e-05, + "loss": 0.23679041862487793, + "step": 190690 + }, + { + "epoch": 0.8187149566815212, + "grad_norm": 2.4382174015045166, + "learning_rate": 1.8207100540689703e-05, + "loss": 0.21798980236053467, + "step": 190700 + }, + { + "epoch": 0.8187578887715412, + "grad_norm": 0.02230762504041195, + "learning_rate": 1.820278882057208e-05, + "loss": 0.28459992408752444, + "step": 190710 + }, + { + "epoch": 0.8188008208615611, + "grad_norm": 2.178717613220215, + "learning_rate": 1.8198477100454454e-05, + "loss": 0.29400684833526614, + "step": 190720 + }, + { + "epoch": 0.8188437529515812, + "grad_norm": 0.004009547643363476, + "learning_rate": 1.8194165380336835e-05, + "loss": 0.06440883874893188, + "step": 190730 + }, + { + "epoch": 0.8188866850416012, + "grad_norm": 0.9320793747901917, + "learning_rate": 1.818985366021921e-05, + "loss": 0.08484240174293518, + "step": 190740 + }, + { + "epoch": 0.8189296171316212, + "grad_norm": 2.0037152767181396, + "learning_rate": 1.8185541940101586e-05, + "loss": 0.15250011682510375, + "step": 190750 + }, + { + "epoch": 0.8189725492216412, + "grad_norm": 1.7748454809188843, + "learning_rate": 1.818123021998396e-05, + "loss": 0.17670098543167115, + "step": 190760 + }, + { + "epoch": 0.8190154813116612, + "grad_norm": 0.17334459722042084, + "learning_rate": 1.8176918499866338e-05, + "loss": 0.30178070068359375, + "step": 190770 + }, + { + "epoch": 0.8190584134016812, + "grad_norm": 0.11208771914243698, + "learning_rate": 1.817260677974871e-05, + "loss": 0.23756444454193115, + "step": 190780 + }, + { + "epoch": 0.8191013454917012, + "grad_norm": 0.048918429762125015, + "learning_rate": 1.8168295059631092e-05, + "loss": 0.19704781770706176, + "step": 190790 + }, + { + "epoch": 0.8191442775817213, + "grad_norm": 1.5001906156539917, + "learning_rate": 1.8163983339513466e-05, + "loss": 0.11418488025665283, + "step": 190800 + }, + { + "epoch": 0.8191872096717412, + "grad_norm": 1.576877474784851, + "learning_rate": 1.8159671619395844e-05, + "loss": 0.1952579140663147, + "step": 190810 + }, + { + "epoch": 0.8192301417617612, + "grad_norm": 0.02885427139699459, + "learning_rate": 1.8155359899278217e-05, + "loss": 0.21382324695587157, + "step": 190820 + }, + { + "epoch": 0.8192730738517813, + "grad_norm": 4.801934242248535, + "learning_rate": 1.8151048179160595e-05, + "loss": 0.19394272565841675, + "step": 190830 + }, + { + "epoch": 0.8193160059418012, + "grad_norm": 0.022001752629876137, + "learning_rate": 1.8146736459042972e-05, + "loss": 0.11965030431747437, + "step": 190840 + }, + { + "epoch": 0.8193589380318212, + "grad_norm": 0.5711457133293152, + "learning_rate": 1.814242473892535e-05, + "loss": 0.2572173118591309, + "step": 190850 + }, + { + "epoch": 0.8194018701218413, + "grad_norm": 0.10609620809555054, + "learning_rate": 1.8138113018807723e-05, + "loss": 0.19334434270858764, + "step": 190860 + }, + { + "epoch": 0.8194448022118612, + "grad_norm": 0.18748600780963898, + "learning_rate": 1.81338012986901e-05, + "loss": 0.03161357641220093, + "step": 190870 + }, + { + "epoch": 0.8194877343018813, + "grad_norm": 6.887355327606201, + "learning_rate": 1.8129489578572475e-05, + "loss": 0.4630523681640625, + "step": 190880 + }, + { + "epoch": 0.8195306663919013, + "grad_norm": 0.8817146420478821, + "learning_rate": 1.8125177858454852e-05, + "loss": 0.2447819471359253, + "step": 190890 + }, + { + "epoch": 0.8195735984819214, + "grad_norm": 1.8151564598083496, + "learning_rate": 1.812086613833723e-05, + "loss": 0.20715341567993165, + "step": 190900 + }, + { + "epoch": 0.8196165305719413, + "grad_norm": 0.008164191618561745, + "learning_rate": 1.8116554418219607e-05, + "loss": 0.14329957962036133, + "step": 190910 + }, + { + "epoch": 0.8196594626619613, + "grad_norm": 0.007723241113126278, + "learning_rate": 1.811224269810198e-05, + "loss": 0.2750748872756958, + "step": 190920 + }, + { + "epoch": 0.8197023947519814, + "grad_norm": 0.2240218222141266, + "learning_rate": 1.8107930977984358e-05, + "loss": 0.11921135187149048, + "step": 190930 + }, + { + "epoch": 0.8197453268420013, + "grad_norm": 0.17240864038467407, + "learning_rate": 1.8103619257866732e-05, + "loss": 0.34458763599395753, + "step": 190940 + }, + { + "epoch": 0.8197882589320213, + "grad_norm": 0.014422965236008167, + "learning_rate": 1.809930753774911e-05, + "loss": 0.024203298985958098, + "step": 190950 + }, + { + "epoch": 0.8198311910220414, + "grad_norm": 0.1409555822610855, + "learning_rate": 1.8094995817631487e-05, + "loss": 0.21380703449249266, + "step": 190960 + }, + { + "epoch": 0.8198741231120613, + "grad_norm": 0.059442151337862015, + "learning_rate": 1.8090684097513864e-05, + "loss": 0.24995136260986328, + "step": 190970 + }, + { + "epoch": 0.8199170552020814, + "grad_norm": 0.0010440107434988022, + "learning_rate": 1.808637237739624e-05, + "loss": 0.07371382713317871, + "step": 190980 + }, + { + "epoch": 0.8199599872921014, + "grad_norm": 0.0026696210261434317, + "learning_rate": 1.8082060657278615e-05, + "loss": 0.09124609231948852, + "step": 190990 + }, + { + "epoch": 0.8200029193821213, + "grad_norm": 1.8029237985610962, + "learning_rate": 1.8077748937160993e-05, + "loss": 0.11664470434188842, + "step": 191000 + }, + { + "epoch": 0.8200029193821213, + "eval_loss": 0.38155049085617065, + "eval_runtime": 27.5094, + "eval_samples_per_second": 3.635, + "eval_steps_per_second": 3.635, + "step": 191000 + }, + { + "epoch": 0.8200458514721414, + "grad_norm": 0.22230297327041626, + "learning_rate": 1.8073437217043367e-05, + "loss": 0.24210076332092284, + "step": 191010 + }, + { + "epoch": 0.8200887835621614, + "grad_norm": 0.12822383642196655, + "learning_rate": 1.8069125496925744e-05, + "loss": 0.22140724658966066, + "step": 191020 + }, + { + "epoch": 0.8201317156521813, + "grad_norm": 0.13210682570934296, + "learning_rate": 1.806481377680812e-05, + "loss": 0.11168990135192872, + "step": 191030 + }, + { + "epoch": 0.8201746477422014, + "grad_norm": 0.1504775583744049, + "learning_rate": 1.80605020566905e-05, + "loss": 0.26407132148742674, + "step": 191040 + }, + { + "epoch": 0.8202175798322214, + "grad_norm": 1.0668368339538574, + "learning_rate": 1.8056190336572873e-05, + "loss": 0.27573869228363035, + "step": 191050 + }, + { + "epoch": 0.8202605119222414, + "grad_norm": 1.6537013053894043, + "learning_rate": 1.805187861645525e-05, + "loss": 0.23031814098358155, + "step": 191060 + }, + { + "epoch": 0.8203034440122614, + "grad_norm": 0.7904632091522217, + "learning_rate": 1.8047566896337624e-05, + "loss": 0.13049904108047486, + "step": 191070 + }, + { + "epoch": 0.8203463761022814, + "grad_norm": 0.023739825934171677, + "learning_rate": 1.8043255176220005e-05, + "loss": 0.07214347720146179, + "step": 191080 + }, + { + "epoch": 0.8203893081923014, + "grad_norm": 0.03254947438836098, + "learning_rate": 1.803894345610238e-05, + "loss": 0.29176888465881345, + "step": 191090 + }, + { + "epoch": 0.8204322402823214, + "grad_norm": 4.219489574432373, + "learning_rate": 1.8034631735984756e-05, + "loss": 0.2843551397323608, + "step": 191100 + }, + { + "epoch": 0.8204751723723415, + "grad_norm": 0.014289355836808681, + "learning_rate": 1.803032001586713e-05, + "loss": 0.18762102127075195, + "step": 191110 + }, + { + "epoch": 0.8205181044623614, + "grad_norm": 2.8176822662353516, + "learning_rate": 1.8026008295749507e-05, + "loss": 0.15682103633880615, + "step": 191120 + }, + { + "epoch": 0.8205610365523814, + "grad_norm": 41.578189849853516, + "learning_rate": 1.802169657563188e-05, + "loss": 0.29941635131835936, + "step": 191130 + }, + { + "epoch": 0.8206039686424015, + "grad_norm": 0.001329735154286027, + "learning_rate": 1.8017384855514262e-05, + "loss": 0.23812174797058105, + "step": 191140 + }, + { + "epoch": 0.8206469007324214, + "grad_norm": 1.0163871049880981, + "learning_rate": 1.8013073135396636e-05, + "loss": 0.17960180044174195, + "step": 191150 + }, + { + "epoch": 0.8206898328224415, + "grad_norm": 0.07313176989555359, + "learning_rate": 1.8008761415279013e-05, + "loss": 0.19614267349243164, + "step": 191160 + }, + { + "epoch": 0.8207327649124615, + "grad_norm": 0.6908106207847595, + "learning_rate": 1.8004449695161387e-05, + "loss": 0.09613000750541686, + "step": 191170 + }, + { + "epoch": 0.8207756970024814, + "grad_norm": 0.036255963146686554, + "learning_rate": 1.8000137975043765e-05, + "loss": 0.19044407606124877, + "step": 191180 + }, + { + "epoch": 0.8208186290925015, + "grad_norm": 1.8935225009918213, + "learning_rate": 1.7995826254926142e-05, + "loss": 0.06331070661544799, + "step": 191190 + }, + { + "epoch": 0.8208615611825215, + "grad_norm": 0.046459510922431946, + "learning_rate": 1.799151453480852e-05, + "loss": 0.04903110563755035, + "step": 191200 + }, + { + "epoch": 0.8209044932725414, + "grad_norm": 0.1087096557021141, + "learning_rate": 1.7987202814690893e-05, + "loss": 0.280530309677124, + "step": 191210 + }, + { + "epoch": 0.8209474253625615, + "grad_norm": 1.053376317024231, + "learning_rate": 1.798289109457327e-05, + "loss": 0.38856456279754636, + "step": 191220 + }, + { + "epoch": 0.8209903574525815, + "grad_norm": 0.001331243198364973, + "learning_rate": 1.7978579374455644e-05, + "loss": 0.23703274726867676, + "step": 191230 + }, + { + "epoch": 0.8210332895426015, + "grad_norm": 2.703815460205078, + "learning_rate": 1.7974267654338022e-05, + "loss": 0.2822533130645752, + "step": 191240 + }, + { + "epoch": 0.8210762216326215, + "grad_norm": 0.015311984345316887, + "learning_rate": 1.79699559342204e-05, + "loss": 0.4106620788574219, + "step": 191250 + }, + { + "epoch": 0.8211191537226415, + "grad_norm": 50.689842224121094, + "learning_rate": 1.7965644214102777e-05, + "loss": 0.11393247842788697, + "step": 191260 + }, + { + "epoch": 0.8211620858126615, + "grad_norm": 0.03459545969963074, + "learning_rate": 1.796133249398515e-05, + "loss": 0.10530201196670533, + "step": 191270 + }, + { + "epoch": 0.8212050179026815, + "grad_norm": 1.1778295040130615, + "learning_rate": 1.7957020773867528e-05, + "loss": 0.16542747020721435, + "step": 191280 + }, + { + "epoch": 0.8212479499927016, + "grad_norm": 0.05201715975999832, + "learning_rate": 1.7952709053749902e-05, + "loss": 0.12944258451461793, + "step": 191290 + }, + { + "epoch": 0.8212908820827215, + "grad_norm": 0.005311310291290283, + "learning_rate": 1.794839733363228e-05, + "loss": 0.11700173616409301, + "step": 191300 + }, + { + "epoch": 0.8213338141727415, + "grad_norm": 0.009006867185235023, + "learning_rate": 1.7944085613514656e-05, + "loss": 0.054128849506378175, + "step": 191310 + }, + { + "epoch": 0.8213767462627616, + "grad_norm": 0.05511913821101189, + "learning_rate": 1.7939773893397034e-05, + "loss": 0.09677992463111877, + "step": 191320 + }, + { + "epoch": 0.8214196783527816, + "grad_norm": 0.012460976839065552, + "learning_rate": 1.7935462173279408e-05, + "loss": 0.12651125192642212, + "step": 191330 + }, + { + "epoch": 0.8214626104428016, + "grad_norm": 0.05311845242977142, + "learning_rate": 1.7931150453161785e-05, + "loss": 0.2608399152755737, + "step": 191340 + }, + { + "epoch": 0.8215055425328216, + "grad_norm": 0.8644061088562012, + "learning_rate": 1.7926838733044162e-05, + "loss": 0.2154980182647705, + "step": 191350 + }, + { + "epoch": 0.8215484746228416, + "grad_norm": 1.3083879947662354, + "learning_rate": 1.7922527012926536e-05, + "loss": 0.18745582103729247, + "step": 191360 + }, + { + "epoch": 0.8215914067128616, + "grad_norm": 0.7998777627944946, + "learning_rate": 1.7918215292808914e-05, + "loss": 0.37282159328460696, + "step": 191370 + }, + { + "epoch": 0.8216343388028816, + "grad_norm": 1.0937479734420776, + "learning_rate": 1.791390357269129e-05, + "loss": 0.28621206283569334, + "step": 191380 + }, + { + "epoch": 0.8216772708929017, + "grad_norm": 0.003337725531309843, + "learning_rate": 1.790959185257367e-05, + "loss": 0.1956787347793579, + "step": 191390 + }, + { + "epoch": 0.8217202029829216, + "grad_norm": 4.987943172454834, + "learning_rate": 1.7905280132456042e-05, + "loss": 0.2712759256362915, + "step": 191400 + }, + { + "epoch": 0.8217631350729416, + "grad_norm": 0.0192168690264225, + "learning_rate": 1.790096841233842e-05, + "loss": 0.05187936425209046, + "step": 191410 + }, + { + "epoch": 0.8218060671629617, + "grad_norm": 0.8645950555801392, + "learning_rate": 1.7896656692220794e-05, + "loss": 0.23125803470611572, + "step": 191420 + }, + { + "epoch": 0.8218489992529816, + "grad_norm": 1.6332647800445557, + "learning_rate": 1.7892344972103174e-05, + "loss": 0.17718969583511351, + "step": 191430 + }, + { + "epoch": 0.8218919313430016, + "grad_norm": 3.3261234760284424, + "learning_rate": 1.788803325198555e-05, + "loss": 0.24678752422332764, + "step": 191440 + }, + { + "epoch": 0.8219348634330217, + "grad_norm": 3.7975666522979736, + "learning_rate": 1.7883721531867926e-05, + "loss": 0.19560292959213257, + "step": 191450 + }, + { + "epoch": 0.8219777955230416, + "grad_norm": 0.3274807631969452, + "learning_rate": 1.78794098117503e-05, + "loss": 0.351909875869751, + "step": 191460 + }, + { + "epoch": 0.8220207276130617, + "grad_norm": 0.0043863472528755665, + "learning_rate": 1.7875098091632677e-05, + "loss": 0.16393966674804689, + "step": 191470 + }, + { + "epoch": 0.8220636597030817, + "grad_norm": 0.015406210906803608, + "learning_rate": 1.787078637151505e-05, + "loss": 0.22621474266052247, + "step": 191480 + }, + { + "epoch": 0.8221065917931016, + "grad_norm": 0.07137417048215866, + "learning_rate": 1.786647465139743e-05, + "loss": 0.038229352235794066, + "step": 191490 + }, + { + "epoch": 0.8221495238831217, + "grad_norm": 0.027764789760112762, + "learning_rate": 1.7862162931279806e-05, + "loss": 0.23064649105072021, + "step": 191500 + }, + { + "epoch": 0.8221924559731417, + "grad_norm": 0.019897371530532837, + "learning_rate": 1.7857851211162183e-05, + "loss": 0.35613534450531004, + "step": 191510 + }, + { + "epoch": 0.8222353880631617, + "grad_norm": 0.12119104713201523, + "learning_rate": 1.7853539491044557e-05, + "loss": 0.27148022651672366, + "step": 191520 + }, + { + "epoch": 0.8222783201531817, + "grad_norm": 0.05415143072605133, + "learning_rate": 1.7849227770926934e-05, + "loss": 0.1662237048149109, + "step": 191530 + }, + { + "epoch": 0.8223212522432017, + "grad_norm": 0.06889358162879944, + "learning_rate": 1.784491605080931e-05, + "loss": 0.05766690969467163, + "step": 191540 + }, + { + "epoch": 0.8223641843332217, + "grad_norm": 12.351526260375977, + "learning_rate": 1.784060433069169e-05, + "loss": 0.15811610221862793, + "step": 191550 + }, + { + "epoch": 0.8224071164232417, + "grad_norm": 0.13275538384914398, + "learning_rate": 1.7836292610574063e-05, + "loss": 0.27862823009490967, + "step": 191560 + }, + { + "epoch": 0.8224500485132618, + "grad_norm": 0.003320979652926326, + "learning_rate": 1.783198089045644e-05, + "loss": 0.15656894445419312, + "step": 191570 + }, + { + "epoch": 0.8224929806032817, + "grad_norm": 5.074235916137695, + "learning_rate": 1.7827669170338814e-05, + "loss": 0.307766056060791, + "step": 191580 + }, + { + "epoch": 0.8225359126933017, + "grad_norm": 0.04263024777173996, + "learning_rate": 1.782335745022119e-05, + "loss": 0.14165412187576293, + "step": 191590 + }, + { + "epoch": 0.8225788447833218, + "grad_norm": 0.048086926341056824, + "learning_rate": 1.781904573010357e-05, + "loss": 0.35209856033325193, + "step": 191600 + }, + { + "epoch": 0.8226217768733417, + "grad_norm": 0.006095598451793194, + "learning_rate": 1.7814734009985946e-05, + "loss": 0.08997969627380371, + "step": 191610 + }, + { + "epoch": 0.8226647089633617, + "grad_norm": 0.0008162250742316246, + "learning_rate": 1.781042228986832e-05, + "loss": 0.09934253096580506, + "step": 191620 + }, + { + "epoch": 0.8227076410533818, + "grad_norm": 0.0008933839853852987, + "learning_rate": 1.7806110569750698e-05, + "loss": 0.1271510362625122, + "step": 191630 + }, + { + "epoch": 0.8227505731434017, + "grad_norm": 0.10590466111898422, + "learning_rate": 1.780179884963307e-05, + "loss": 0.13586642742156982, + "step": 191640 + }, + { + "epoch": 0.8227935052334218, + "grad_norm": 0.20287977159023285, + "learning_rate": 1.779748712951545e-05, + "loss": 0.2331789493560791, + "step": 191650 + }, + { + "epoch": 0.8228364373234418, + "grad_norm": 0.017177434638142586, + "learning_rate": 1.7793175409397826e-05, + "loss": 0.24761927127838135, + "step": 191660 + }, + { + "epoch": 0.8228793694134617, + "grad_norm": 1.3558788299560547, + "learning_rate": 1.7788863689280204e-05, + "loss": 0.19192906618118286, + "step": 191670 + }, + { + "epoch": 0.8229223015034818, + "grad_norm": 0.019712621346116066, + "learning_rate": 1.7784551969162577e-05, + "loss": 0.22504265308380128, + "step": 191680 + }, + { + "epoch": 0.8229652335935018, + "grad_norm": 1.1804683208465576, + "learning_rate": 1.7780240249044955e-05, + "loss": 0.5104588985443115, + "step": 191690 + }, + { + "epoch": 0.8230081656835218, + "grad_norm": 0.34881842136383057, + "learning_rate": 1.777592852892733e-05, + "loss": 0.27373499870300294, + "step": 191700 + }, + { + "epoch": 0.8230510977735418, + "grad_norm": 0.0014252919936552644, + "learning_rate": 1.7771616808809706e-05, + "loss": 0.12193593978881836, + "step": 191710 + }, + { + "epoch": 0.8230940298635618, + "grad_norm": 0.12936106324195862, + "learning_rate": 1.7767305088692083e-05, + "loss": 0.16922520399093627, + "step": 191720 + }, + { + "epoch": 0.8231369619535818, + "grad_norm": 2.7369914054870605, + "learning_rate": 1.776299336857446e-05, + "loss": 0.25319030284881594, + "step": 191730 + }, + { + "epoch": 0.8231798940436018, + "grad_norm": 0.022860685363411903, + "learning_rate": 1.7758681648456838e-05, + "loss": 0.4412991046905518, + "step": 191740 + }, + { + "epoch": 0.8232228261336219, + "grad_norm": 0.004503470379859209, + "learning_rate": 1.7754369928339212e-05, + "loss": 0.11105915307998657, + "step": 191750 + }, + { + "epoch": 0.8232657582236419, + "grad_norm": 0.3987155854701996, + "learning_rate": 1.775005820822159e-05, + "loss": 0.20699021816253663, + "step": 191760 + }, + { + "epoch": 0.8233086903136618, + "grad_norm": 0.003383493749424815, + "learning_rate": 1.7745746488103963e-05, + "loss": 0.1353330969810486, + "step": 191770 + }, + { + "epoch": 0.8233516224036819, + "grad_norm": 0.006548069417476654, + "learning_rate": 1.774143476798634e-05, + "loss": 0.023429669439792633, + "step": 191780 + }, + { + "epoch": 0.8233945544937019, + "grad_norm": 0.47888049483299255, + "learning_rate": 1.7737123047868718e-05, + "loss": 0.3590864181518555, + "step": 191790 + }, + { + "epoch": 0.8234374865837218, + "grad_norm": 0.012283619493246078, + "learning_rate": 1.7732811327751095e-05, + "loss": 0.036898362636566165, + "step": 191800 + }, + { + "epoch": 0.8234804186737419, + "grad_norm": 0.9226396083831787, + "learning_rate": 1.772849960763347e-05, + "loss": 0.10457895994186402, + "step": 191810 + }, + { + "epoch": 0.8235233507637619, + "grad_norm": 0.03380728140473366, + "learning_rate": 1.7724187887515847e-05, + "loss": 0.49823312759399413, + "step": 191820 + }, + { + "epoch": 0.8235662828537819, + "grad_norm": 0.031724244356155396, + "learning_rate": 1.771987616739822e-05, + "loss": 0.020426127314567565, + "step": 191830 + }, + { + "epoch": 0.8236092149438019, + "grad_norm": 1.9001127481460571, + "learning_rate": 1.77155644472806e-05, + "loss": 0.2856115818023682, + "step": 191840 + }, + { + "epoch": 0.823652147033822, + "grad_norm": 0.3387307822704315, + "learning_rate": 1.7711252727162975e-05, + "loss": 0.1957484722137451, + "step": 191850 + }, + { + "epoch": 0.8236950791238419, + "grad_norm": 5.49625301361084, + "learning_rate": 1.7706941007045353e-05, + "loss": 0.24340589046478273, + "step": 191860 + }, + { + "epoch": 0.8237380112138619, + "grad_norm": 0.005266428925096989, + "learning_rate": 1.7702629286927727e-05, + "loss": 0.25476138591766356, + "step": 191870 + }, + { + "epoch": 0.823780943303882, + "grad_norm": 0.07123085111379623, + "learning_rate": 1.7698317566810104e-05, + "loss": 0.20363154411315917, + "step": 191880 + }, + { + "epoch": 0.8238238753939019, + "grad_norm": 135.7645721435547, + "learning_rate": 1.7694005846692478e-05, + "loss": 0.16444821357727052, + "step": 191890 + }, + { + "epoch": 0.8238668074839219, + "grad_norm": 1.4238746166229248, + "learning_rate": 1.768969412657486e-05, + "loss": 0.20836834907531737, + "step": 191900 + }, + { + "epoch": 0.823909739573942, + "grad_norm": 0.1336083710193634, + "learning_rate": 1.7685382406457233e-05, + "loss": 0.11080396175384521, + "step": 191910 + }, + { + "epoch": 0.8239526716639619, + "grad_norm": 3.034364700317383, + "learning_rate": 1.768107068633961e-05, + "loss": 0.34577069282531736, + "step": 191920 + }, + { + "epoch": 0.823995603753982, + "grad_norm": 0.02131952904164791, + "learning_rate": 1.7676758966221984e-05, + "loss": 0.03955221474170685, + "step": 191930 + }, + { + "epoch": 0.824038535844002, + "grad_norm": 0.005112878978252411, + "learning_rate": 1.767244724610436e-05, + "loss": 0.20619418621063232, + "step": 191940 + }, + { + "epoch": 0.8240814679340219, + "grad_norm": 1.2098677158355713, + "learning_rate": 1.766813552598674e-05, + "loss": 0.15751324892044066, + "step": 191950 + }, + { + "epoch": 0.824124400024042, + "grad_norm": 0.13064929842948914, + "learning_rate": 1.7663823805869116e-05, + "loss": 0.21526129245758058, + "step": 191960 + }, + { + "epoch": 0.824167332114062, + "grad_norm": 0.6280670762062073, + "learning_rate": 1.765951208575149e-05, + "loss": 0.17393140792846679, + "step": 191970 + }, + { + "epoch": 0.8242102642040819, + "grad_norm": 1.3640555143356323, + "learning_rate": 1.7655200365633867e-05, + "loss": 0.171847927570343, + "step": 191980 + }, + { + "epoch": 0.824253196294102, + "grad_norm": 0.012612332589924335, + "learning_rate": 1.765088864551624e-05, + "loss": 0.17563261985778808, + "step": 191990 + }, + { + "epoch": 0.824296128384122, + "grad_norm": 42.14128112792969, + "learning_rate": 1.764657692539862e-05, + "loss": 0.085991770029068, + "step": 192000 + }, + { + "epoch": 0.824296128384122, + "eval_loss": 0.3825879991054535, + "eval_runtime": 27.4797, + "eval_samples_per_second": 3.639, + "eval_steps_per_second": 3.639, + "step": 192000 + }, + { + "epoch": 0.824339060474142, + "grad_norm": 0.01008710078895092, + "learning_rate": 1.7642265205280996e-05, + "loss": 0.37354443073272703, + "step": 192010 + }, + { + "epoch": 0.824381992564162, + "grad_norm": 1.7483117580413818, + "learning_rate": 1.7637953485163373e-05, + "loss": 0.18292040824890138, + "step": 192020 + }, + { + "epoch": 0.824424924654182, + "grad_norm": 0.012306587770581245, + "learning_rate": 1.7633641765045747e-05, + "loss": 0.36442830562591555, + "step": 192030 + }, + { + "epoch": 0.824467856744202, + "grad_norm": 0.04286915063858032, + "learning_rate": 1.7629330044928125e-05, + "loss": 0.126639723777771, + "step": 192040 + }, + { + "epoch": 0.824510788834222, + "grad_norm": 0.0028463948983699083, + "learning_rate": 1.76250183248105e-05, + "loss": 0.09038893580436706, + "step": 192050 + }, + { + "epoch": 0.8245537209242421, + "grad_norm": 1.2837460041046143, + "learning_rate": 1.7620706604692876e-05, + "loss": 0.06621870994567872, + "step": 192060 + }, + { + "epoch": 0.824596653014262, + "grad_norm": 0.050448864698410034, + "learning_rate": 1.7616394884575253e-05, + "loss": 0.1748013138771057, + "step": 192070 + }, + { + "epoch": 0.824639585104282, + "grad_norm": 0.006843621842563152, + "learning_rate": 1.761208316445763e-05, + "loss": 0.021199363470077514, + "step": 192080 + }, + { + "epoch": 0.8246825171943021, + "grad_norm": 0.28706303238868713, + "learning_rate": 1.7607771444340008e-05, + "loss": 0.1016458511352539, + "step": 192090 + }, + { + "epoch": 0.824725449284322, + "grad_norm": 1.928757905960083, + "learning_rate": 1.7603459724222382e-05, + "loss": 0.15487730503082275, + "step": 192100 + }, + { + "epoch": 0.824768381374342, + "grad_norm": 6.034005165100098, + "learning_rate": 1.759914800410476e-05, + "loss": 0.169073486328125, + "step": 192110 + }, + { + "epoch": 0.8248113134643621, + "grad_norm": 0.748520016670227, + "learning_rate": 1.7594836283987133e-05, + "loss": 0.11985911130905151, + "step": 192120 + }, + { + "epoch": 0.824854245554382, + "grad_norm": 0.25992897152900696, + "learning_rate": 1.759052456386951e-05, + "loss": 0.08560782074928283, + "step": 192130 + }, + { + "epoch": 0.8248971776444021, + "grad_norm": 0.0937369093298912, + "learning_rate": 1.7586212843751888e-05, + "loss": 0.19405397176742553, + "step": 192140 + }, + { + "epoch": 0.8249401097344221, + "grad_norm": 0.5325449109077454, + "learning_rate": 1.7581901123634265e-05, + "loss": 0.28080456256866454, + "step": 192150 + }, + { + "epoch": 0.824983041824442, + "grad_norm": 6.978481769561768, + "learning_rate": 1.757758940351664e-05, + "loss": 0.19362866878509521, + "step": 192160 + }, + { + "epoch": 0.8250259739144621, + "grad_norm": 0.017143480479717255, + "learning_rate": 1.7573277683399016e-05, + "loss": 0.2803755521774292, + "step": 192170 + }, + { + "epoch": 0.8250689060044821, + "grad_norm": 0.14805008471012115, + "learning_rate": 1.756896596328139e-05, + "loss": 0.16662766933441162, + "step": 192180 + }, + { + "epoch": 0.8251118380945022, + "grad_norm": 0.0492326021194458, + "learning_rate": 1.756465424316377e-05, + "loss": 0.09783921837806701, + "step": 192190 + }, + { + "epoch": 0.8251547701845221, + "grad_norm": 3.8107030391693115, + "learning_rate": 1.7560342523046145e-05, + "loss": 0.29922664165496826, + "step": 192200 + }, + { + "epoch": 0.8251977022745421, + "grad_norm": 0.06435045599937439, + "learning_rate": 1.7556030802928522e-05, + "loss": 0.07153820395469665, + "step": 192210 + }, + { + "epoch": 0.8252406343645622, + "grad_norm": 0.0006460752920247614, + "learning_rate": 1.7551719082810896e-05, + "loss": 0.04712951183319092, + "step": 192220 + }, + { + "epoch": 0.8252835664545821, + "grad_norm": 0.022561080753803253, + "learning_rate": 1.7547407362693274e-05, + "loss": 0.1923914670944214, + "step": 192230 + }, + { + "epoch": 0.8253264985446022, + "grad_norm": 0.009855869226157665, + "learning_rate": 1.7543095642575648e-05, + "loss": 0.09128308892250062, + "step": 192240 + }, + { + "epoch": 0.8253694306346222, + "grad_norm": 0.555938184261322, + "learning_rate": 1.753878392245803e-05, + "loss": 0.004356810450553894, + "step": 192250 + }, + { + "epoch": 0.8254123627246421, + "grad_norm": 0.0011161682195961475, + "learning_rate": 1.7534472202340402e-05, + "loss": 0.22292764186859132, + "step": 192260 + }, + { + "epoch": 0.8254552948146622, + "grad_norm": 1.2606300115585327, + "learning_rate": 1.753016048222278e-05, + "loss": 0.1448211669921875, + "step": 192270 + }, + { + "epoch": 0.8254982269046822, + "grad_norm": 4.72949743270874, + "learning_rate": 1.7525848762105154e-05, + "loss": 0.29486114978790284, + "step": 192280 + }, + { + "epoch": 0.8255411589947022, + "grad_norm": 5.2792558670043945, + "learning_rate": 1.752153704198753e-05, + "loss": 0.08456004858016967, + "step": 192290 + }, + { + "epoch": 0.8255840910847222, + "grad_norm": 0.022953951731324196, + "learning_rate": 1.751722532186991e-05, + "loss": 0.2897635936737061, + "step": 192300 + }, + { + "epoch": 0.8256270231747422, + "grad_norm": 0.0130073893815279, + "learning_rate": 1.7512913601752286e-05, + "loss": 0.19400113821029663, + "step": 192310 + }, + { + "epoch": 0.8256699552647622, + "grad_norm": 0.009828859008848667, + "learning_rate": 1.750860188163466e-05, + "loss": 0.16359663009643555, + "step": 192320 + }, + { + "epoch": 0.8257128873547822, + "grad_norm": 0.004650391638278961, + "learning_rate": 1.7504290161517037e-05, + "loss": 0.03614757657051086, + "step": 192330 + }, + { + "epoch": 0.8257558194448023, + "grad_norm": 0.030460670590400696, + "learning_rate": 1.749997844139941e-05, + "loss": 0.06386544108390808, + "step": 192340 + }, + { + "epoch": 0.8257987515348222, + "grad_norm": 0.016825038939714432, + "learning_rate": 1.749566672128179e-05, + "loss": 0.17280819416046142, + "step": 192350 + }, + { + "epoch": 0.8258416836248422, + "grad_norm": 28.79928970336914, + "learning_rate": 1.7491355001164166e-05, + "loss": 0.08564205169677734, + "step": 192360 + }, + { + "epoch": 0.8258846157148623, + "grad_norm": 7.042786598205566, + "learning_rate": 1.7487043281046543e-05, + "loss": 0.2968191385269165, + "step": 192370 + }, + { + "epoch": 0.8259275478048822, + "grad_norm": 0.004526129458099604, + "learning_rate": 1.7482731560928917e-05, + "loss": 0.2754892587661743, + "step": 192380 + }, + { + "epoch": 0.8259704798949022, + "grad_norm": 0.002219579881057143, + "learning_rate": 1.7478419840811294e-05, + "loss": 0.2458263635635376, + "step": 192390 + }, + { + "epoch": 0.8260134119849223, + "grad_norm": 0.05032936483621597, + "learning_rate": 1.7474108120693668e-05, + "loss": 0.28162617683410646, + "step": 192400 + }, + { + "epoch": 0.8260563440749422, + "grad_norm": 0.0014833813766017556, + "learning_rate": 1.7469796400576046e-05, + "loss": 0.2364175796508789, + "step": 192410 + }, + { + "epoch": 0.8260992761649623, + "grad_norm": 0.0007317995768971741, + "learning_rate": 1.7465484680458423e-05, + "loss": 0.0849960207939148, + "step": 192420 + }, + { + "epoch": 0.8261422082549823, + "grad_norm": 0.0030191184487193823, + "learning_rate": 1.74611729603408e-05, + "loss": 0.2047954559326172, + "step": 192430 + }, + { + "epoch": 0.8261851403450022, + "grad_norm": 0.014762978069484234, + "learning_rate": 1.7456861240223178e-05, + "loss": 0.38550779819488523, + "step": 192440 + }, + { + "epoch": 0.8262280724350223, + "grad_norm": 0.004658820573240519, + "learning_rate": 1.745254952010555e-05, + "loss": 0.1906131386756897, + "step": 192450 + }, + { + "epoch": 0.8262710045250423, + "grad_norm": 0.12024617195129395, + "learning_rate": 1.744823779998793e-05, + "loss": 0.18920086622238158, + "step": 192460 + }, + { + "epoch": 0.8263139366150623, + "grad_norm": 6.6206793785095215, + "learning_rate": 1.7443926079870303e-05, + "loss": 0.1918006420135498, + "step": 192470 + }, + { + "epoch": 0.8263568687050823, + "grad_norm": 8.5894193649292, + "learning_rate": 1.743961435975268e-05, + "loss": 0.30832791328430176, + "step": 192480 + }, + { + "epoch": 0.8263998007951023, + "grad_norm": 0.08169787377119064, + "learning_rate": 1.7435302639635058e-05, + "loss": 0.05640159249305725, + "step": 192490 + }, + { + "epoch": 0.8264427328851223, + "grad_norm": 0.0032690681982785463, + "learning_rate": 1.7430990919517435e-05, + "loss": 0.07844080328941345, + "step": 192500 + }, + { + "epoch": 0.8264856649751423, + "grad_norm": 1.843385934829712, + "learning_rate": 1.742667919939981e-05, + "loss": 0.20965149402618408, + "step": 192510 + }, + { + "epoch": 0.8265285970651624, + "grad_norm": 0.04977283626794815, + "learning_rate": 1.7422367479282186e-05, + "loss": 0.3092090129852295, + "step": 192520 + }, + { + "epoch": 0.8265715291551823, + "grad_norm": 0.057017020881175995, + "learning_rate": 1.741805575916456e-05, + "loss": 0.2586258888244629, + "step": 192530 + }, + { + "epoch": 0.8266144612452023, + "grad_norm": 0.7330488562583923, + "learning_rate": 1.7413744039046937e-05, + "loss": 0.2805727481842041, + "step": 192540 + }, + { + "epoch": 0.8266573933352224, + "grad_norm": 1.5467420816421509, + "learning_rate": 1.7409432318929315e-05, + "loss": 0.18074100017547606, + "step": 192550 + }, + { + "epoch": 0.8267003254252423, + "grad_norm": 0.05255649983882904, + "learning_rate": 1.7405120598811692e-05, + "loss": 0.12051188945770264, + "step": 192560 + }, + { + "epoch": 0.8267432575152623, + "grad_norm": 0.35828202962875366, + "learning_rate": 1.7400808878694066e-05, + "loss": 0.1667983651161194, + "step": 192570 + }, + { + "epoch": 0.8267861896052824, + "grad_norm": 0.007995634339749813, + "learning_rate": 1.7396497158576443e-05, + "loss": 0.22869086265563965, + "step": 192580 + }, + { + "epoch": 0.8268291216953023, + "grad_norm": 21.580411911010742, + "learning_rate": 1.7392185438458817e-05, + "loss": 0.21399738788604736, + "step": 192590 + }, + { + "epoch": 0.8268720537853224, + "grad_norm": 0.02281232550740242, + "learning_rate": 1.7387873718341198e-05, + "loss": 0.19772342443466187, + "step": 192600 + }, + { + "epoch": 0.8269149858753424, + "grad_norm": 0.05629677325487137, + "learning_rate": 1.7383561998223572e-05, + "loss": 0.23454837799072265, + "step": 192610 + }, + { + "epoch": 0.8269579179653624, + "grad_norm": 2.705537796020508, + "learning_rate": 1.737925027810595e-05, + "loss": 0.21049823760986328, + "step": 192620 + }, + { + "epoch": 0.8270008500553824, + "grad_norm": 0.03227739408612251, + "learning_rate": 1.7374938557988323e-05, + "loss": 0.15504707098007203, + "step": 192630 + }, + { + "epoch": 0.8270437821454024, + "grad_norm": 0.056841857731342316, + "learning_rate": 1.73706268378707e-05, + "loss": 0.11248964071273804, + "step": 192640 + }, + { + "epoch": 0.8270867142354225, + "grad_norm": 2.6003172397613525, + "learning_rate": 1.7366315117753075e-05, + "loss": 0.22536401748657225, + "step": 192650 + }, + { + "epoch": 0.8271296463254424, + "grad_norm": 0.3005393445491791, + "learning_rate": 1.7362003397635455e-05, + "loss": 0.061086273193359374, + "step": 192660 + }, + { + "epoch": 0.8271725784154624, + "grad_norm": 0.004778577946126461, + "learning_rate": 1.735769167751783e-05, + "loss": 0.17962511777877807, + "step": 192670 + }, + { + "epoch": 0.8272155105054825, + "grad_norm": 2.5120809078216553, + "learning_rate": 1.7353379957400207e-05, + "loss": 0.12587801218032837, + "step": 192680 + }, + { + "epoch": 0.8272584425955024, + "grad_norm": 0.020870067179203033, + "learning_rate": 1.734906823728258e-05, + "loss": 0.38633582592010496, + "step": 192690 + }, + { + "epoch": 0.8273013746855225, + "grad_norm": 0.08555847406387329, + "learning_rate": 1.7344756517164958e-05, + "loss": 0.20045323371887208, + "step": 192700 + }, + { + "epoch": 0.8273443067755425, + "grad_norm": 3.7902328968048096, + "learning_rate": 1.7340444797047335e-05, + "loss": 0.1515338897705078, + "step": 192710 + }, + { + "epoch": 0.8273872388655624, + "grad_norm": 0.05745023488998413, + "learning_rate": 1.7336133076929713e-05, + "loss": 0.2779086589813232, + "step": 192720 + }, + { + "epoch": 0.8274301709555825, + "grad_norm": 0.06869952380657196, + "learning_rate": 1.7331821356812087e-05, + "loss": 0.19615509510040283, + "step": 192730 + }, + { + "epoch": 0.8274731030456025, + "grad_norm": 0.0012193419970571995, + "learning_rate": 1.7327509636694464e-05, + "loss": 0.14778521060943603, + "step": 192740 + }, + { + "epoch": 0.8275160351356224, + "grad_norm": 0.010909829288721085, + "learning_rate": 1.7323197916576838e-05, + "loss": 0.1756802201271057, + "step": 192750 + }, + { + "epoch": 0.8275589672256425, + "grad_norm": 6.0416765213012695, + "learning_rate": 1.7318886196459215e-05, + "loss": 0.26812071800231935, + "step": 192760 + }, + { + "epoch": 0.8276018993156625, + "grad_norm": 3.5843048095703125, + "learning_rate": 1.7314574476341593e-05, + "loss": 0.26665773391723635, + "step": 192770 + }, + { + "epoch": 0.8276448314056825, + "grad_norm": 5.423513889312744, + "learning_rate": 1.731026275622397e-05, + "loss": 0.22109003067016603, + "step": 192780 + }, + { + "epoch": 0.8276877634957025, + "grad_norm": 0.18677234649658203, + "learning_rate": 1.7305951036106344e-05, + "loss": 0.1045659065246582, + "step": 192790 + }, + { + "epoch": 0.8277306955857225, + "grad_norm": 0.008483259938657284, + "learning_rate": 1.730163931598872e-05, + "loss": 0.09153875708580017, + "step": 192800 + }, + { + "epoch": 0.8277736276757425, + "grad_norm": 1.6661065816879272, + "learning_rate": 1.72973275958711e-05, + "loss": 0.17102301120758057, + "step": 192810 + }, + { + "epoch": 0.8278165597657625, + "grad_norm": 1.7879948616027832, + "learning_rate": 1.7293015875753473e-05, + "loss": 0.1961473822593689, + "step": 192820 + }, + { + "epoch": 0.8278594918557826, + "grad_norm": 2.1218552589416504, + "learning_rate": 1.728870415563585e-05, + "loss": 0.1817714214324951, + "step": 192830 + }, + { + "epoch": 0.8279024239458025, + "grad_norm": 0.06027497723698616, + "learning_rate": 1.7284392435518227e-05, + "loss": 0.37011630535125734, + "step": 192840 + }, + { + "epoch": 0.8279453560358225, + "grad_norm": 0.13712593913078308, + "learning_rate": 1.7280080715400605e-05, + "loss": 0.20809953212738036, + "step": 192850 + }, + { + "epoch": 0.8279882881258426, + "grad_norm": 2.3398091793060303, + "learning_rate": 1.727576899528298e-05, + "loss": 0.32560951709747316, + "step": 192860 + }, + { + "epoch": 0.8280312202158625, + "grad_norm": 0.34039562940597534, + "learning_rate": 1.7271457275165356e-05, + "loss": 0.16434768438339234, + "step": 192870 + }, + { + "epoch": 0.8280741523058825, + "grad_norm": 0.008567468263208866, + "learning_rate": 1.726714555504773e-05, + "loss": 0.1836724042892456, + "step": 192880 + }, + { + "epoch": 0.8281170843959026, + "grad_norm": 0.003646705998107791, + "learning_rate": 1.7262833834930107e-05, + "loss": 0.34908242225646974, + "step": 192890 + }, + { + "epoch": 0.8281600164859225, + "grad_norm": 0.00493283849209547, + "learning_rate": 1.7258522114812485e-05, + "loss": 0.25686960220336913, + "step": 192900 + }, + { + "epoch": 0.8282029485759426, + "grad_norm": 0.014279542490839958, + "learning_rate": 1.7254210394694862e-05, + "loss": 0.10751999616622925, + "step": 192910 + }, + { + "epoch": 0.8282458806659626, + "grad_norm": 0.10793264210224152, + "learning_rate": 1.7249898674577236e-05, + "loss": 0.21126208305358887, + "step": 192920 + }, + { + "epoch": 0.8282888127559825, + "grad_norm": 0.020320894196629524, + "learning_rate": 1.7245586954459613e-05, + "loss": 0.3169750452041626, + "step": 192930 + }, + { + "epoch": 0.8283317448460026, + "grad_norm": 0.004445925354957581, + "learning_rate": 1.7241275234341987e-05, + "loss": 0.1835729718208313, + "step": 192940 + }, + { + "epoch": 0.8283746769360226, + "grad_norm": 0.0913233682513237, + "learning_rate": 1.7236963514224368e-05, + "loss": 0.3368794918060303, + "step": 192950 + }, + { + "epoch": 0.8284176090260426, + "grad_norm": 0.005395609885454178, + "learning_rate": 1.7232651794106742e-05, + "loss": 0.12072069644927978, + "step": 192960 + }, + { + "epoch": 0.8284605411160626, + "grad_norm": 1.188323974609375, + "learning_rate": 1.722834007398912e-05, + "loss": 0.07599248290061951, + "step": 192970 + }, + { + "epoch": 0.8285034732060826, + "grad_norm": 1.4815592765808105, + "learning_rate": 1.7224028353871493e-05, + "loss": 0.300715970993042, + "step": 192980 + }, + { + "epoch": 0.8285464052961026, + "grad_norm": 0.013542281463742256, + "learning_rate": 1.721971663375387e-05, + "loss": 0.21341800689697266, + "step": 192990 + }, + { + "epoch": 0.8285893373861226, + "grad_norm": 0.02348690666258335, + "learning_rate": 1.7215404913636244e-05, + "loss": 0.20570461750030516, + "step": 193000 + }, + { + "epoch": 0.8285893373861226, + "eval_loss": 0.3693878948688507, + "eval_runtime": 27.4324, + "eval_samples_per_second": 3.645, + "eval_steps_per_second": 3.645, + "step": 193000 + }, + { + "epoch": 0.8286322694761427, + "grad_norm": 0.4562581181526184, + "learning_rate": 1.7211093193518625e-05, + "loss": 0.32135188579559326, + "step": 193010 + }, + { + "epoch": 0.8286752015661626, + "grad_norm": 2.3221492767333984, + "learning_rate": 1.7206781473401e-05, + "loss": 0.19243701696395873, + "step": 193020 + }, + { + "epoch": 0.8287181336561826, + "grad_norm": 1.3994622230529785, + "learning_rate": 1.7202469753283376e-05, + "loss": 0.14126864671707154, + "step": 193030 + }, + { + "epoch": 0.8287610657462027, + "grad_norm": 0.6949800848960876, + "learning_rate": 1.719815803316575e-05, + "loss": 0.09074133634567261, + "step": 193040 + }, + { + "epoch": 0.8288039978362227, + "grad_norm": 5.811275005340576, + "learning_rate": 1.7193846313048128e-05, + "loss": 0.3873776435852051, + "step": 193050 + }, + { + "epoch": 0.8288469299262426, + "grad_norm": 0.003161689732223749, + "learning_rate": 1.7189534592930505e-05, + "loss": 0.2473665952682495, + "step": 193060 + }, + { + "epoch": 0.8288898620162627, + "grad_norm": 0.01264791376888752, + "learning_rate": 1.7185222872812882e-05, + "loss": 0.15263274908065796, + "step": 193070 + }, + { + "epoch": 0.8289327941062827, + "grad_norm": 2.3640644550323486, + "learning_rate": 1.7180911152695256e-05, + "loss": 0.43180265426635744, + "step": 193080 + }, + { + "epoch": 0.8289757261963027, + "grad_norm": 0.015924545004963875, + "learning_rate": 1.7176599432577634e-05, + "loss": 0.2639934062957764, + "step": 193090 + }, + { + "epoch": 0.8290186582863227, + "grad_norm": 2.248006582260132, + "learning_rate": 1.7172287712460008e-05, + "loss": 0.3951799631118774, + "step": 193100 + }, + { + "epoch": 0.8290615903763427, + "grad_norm": 0.01186326239258051, + "learning_rate": 1.7167975992342385e-05, + "loss": 0.1399616003036499, + "step": 193110 + }, + { + "epoch": 0.8291045224663627, + "grad_norm": 0.025430859997868538, + "learning_rate": 1.7163664272224762e-05, + "loss": 0.1432553768157959, + "step": 193120 + }, + { + "epoch": 0.8291474545563827, + "grad_norm": 28.576847076416016, + "learning_rate": 1.715935255210714e-05, + "loss": 0.48826937675476073, + "step": 193130 + }, + { + "epoch": 0.8291903866464028, + "grad_norm": 4.05294942855835, + "learning_rate": 1.7155040831989514e-05, + "loss": 0.41413297653198244, + "step": 193140 + }, + { + "epoch": 0.8292333187364227, + "grad_norm": 0.5668222904205322, + "learning_rate": 1.715072911187189e-05, + "loss": 0.14205758571624755, + "step": 193150 + }, + { + "epoch": 0.8292762508264427, + "grad_norm": 0.2314767837524414, + "learning_rate": 1.714641739175427e-05, + "loss": 0.17212096452713013, + "step": 193160 + }, + { + "epoch": 0.8293191829164628, + "grad_norm": 7.710951805114746, + "learning_rate": 1.7142105671636642e-05, + "loss": 0.2715949773788452, + "step": 193170 + }, + { + "epoch": 0.8293621150064827, + "grad_norm": 0.03295942768454552, + "learning_rate": 1.713779395151902e-05, + "loss": 0.17002564668655396, + "step": 193180 + }, + { + "epoch": 0.8294050470965028, + "grad_norm": 2.842479944229126, + "learning_rate": 1.7133482231401397e-05, + "loss": 0.27584826946258545, + "step": 193190 + }, + { + "epoch": 0.8294479791865228, + "grad_norm": 0.44256696105003357, + "learning_rate": 1.7129170511283774e-05, + "loss": 0.15299785137176514, + "step": 193200 + }, + { + "epoch": 0.8294909112765427, + "grad_norm": 1.9088733196258545, + "learning_rate": 1.712485879116615e-05, + "loss": 0.27400665283203124, + "step": 193210 + }, + { + "epoch": 0.8295338433665628, + "grad_norm": 0.3094813823699951, + "learning_rate": 1.7120547071048526e-05, + "loss": 0.06840046644210815, + "step": 193220 + }, + { + "epoch": 0.8295767754565828, + "grad_norm": 0.250940203666687, + "learning_rate": 1.71162353509309e-05, + "loss": 0.14590828418731688, + "step": 193230 + }, + { + "epoch": 0.8296197075466027, + "grad_norm": 0.02940000407397747, + "learning_rate": 1.7111923630813277e-05, + "loss": 0.0903593122959137, + "step": 193240 + }, + { + "epoch": 0.8296626396366228, + "grad_norm": 0.004827784840017557, + "learning_rate": 1.7107611910695654e-05, + "loss": 0.12426952123641968, + "step": 193250 + }, + { + "epoch": 0.8297055717266428, + "grad_norm": 0.0651189386844635, + "learning_rate": 1.710330019057803e-05, + "loss": 0.1715847373008728, + "step": 193260 + }, + { + "epoch": 0.8297485038166628, + "grad_norm": 0.1649560183286667, + "learning_rate": 1.7098988470460406e-05, + "loss": 0.23831121921539306, + "step": 193270 + }, + { + "epoch": 0.8297914359066828, + "grad_norm": 0.007706853561103344, + "learning_rate": 1.7094676750342783e-05, + "loss": 0.2472676992416382, + "step": 193280 + }, + { + "epoch": 0.8298343679967028, + "grad_norm": 2.9978578090667725, + "learning_rate": 1.7090365030225157e-05, + "loss": 0.2791655778884888, + "step": 193290 + }, + { + "epoch": 0.8298773000867228, + "grad_norm": 0.02437140978872776, + "learning_rate": 1.7086053310107538e-05, + "loss": 0.172445547580719, + "step": 193300 + }, + { + "epoch": 0.8299202321767428, + "grad_norm": 0.02160993218421936, + "learning_rate": 1.708174158998991e-05, + "loss": 0.11930578947067261, + "step": 193310 + }, + { + "epoch": 0.8299631642667629, + "grad_norm": 0.06851465255022049, + "learning_rate": 1.707742986987229e-05, + "loss": 0.17158340215682982, + "step": 193320 + }, + { + "epoch": 0.8300060963567828, + "grad_norm": 0.004634546581655741, + "learning_rate": 1.7073118149754663e-05, + "loss": 0.004219388589262962, + "step": 193330 + }, + { + "epoch": 0.8300490284468028, + "grad_norm": 0.07137849926948547, + "learning_rate": 1.706880642963704e-05, + "loss": 0.08486306667327881, + "step": 193340 + }, + { + "epoch": 0.8300919605368229, + "grad_norm": 0.10204579681158066, + "learning_rate": 1.7064494709519414e-05, + "loss": 0.07411643266677856, + "step": 193350 + }, + { + "epoch": 0.8301348926268428, + "grad_norm": 0.038510385900735855, + "learning_rate": 1.7060182989401795e-05, + "loss": 0.23881709575653076, + "step": 193360 + }, + { + "epoch": 0.8301778247168629, + "grad_norm": 0.030046027153730392, + "learning_rate": 1.705587126928417e-05, + "loss": 0.07068468332290649, + "step": 193370 + }, + { + "epoch": 0.8302207568068829, + "grad_norm": 0.5060645937919617, + "learning_rate": 1.7051559549166546e-05, + "loss": 0.33170912265777586, + "step": 193380 + }, + { + "epoch": 0.8302636888969028, + "grad_norm": 0.07317323982715607, + "learning_rate": 1.704724782904892e-05, + "loss": 0.13146320581436158, + "step": 193390 + }, + { + "epoch": 0.8303066209869229, + "grad_norm": 0.9889684319496155, + "learning_rate": 1.7042936108931298e-05, + "loss": 0.3599538803100586, + "step": 193400 + }, + { + "epoch": 0.8303495530769429, + "grad_norm": 0.009405163116753101, + "learning_rate": 1.7038624388813675e-05, + "loss": 0.09876270294189453, + "step": 193410 + }, + { + "epoch": 0.8303924851669628, + "grad_norm": 0.1360063999891281, + "learning_rate": 1.7034312668696052e-05, + "loss": 0.12787834405899048, + "step": 193420 + }, + { + "epoch": 0.8304354172569829, + "grad_norm": 0.08976658433675766, + "learning_rate": 1.7030000948578426e-05, + "loss": 0.09479953050613403, + "step": 193430 + }, + { + "epoch": 0.8304783493470029, + "grad_norm": 0.015315504744648933, + "learning_rate": 1.7025689228460803e-05, + "loss": 0.20027387142181396, + "step": 193440 + }, + { + "epoch": 0.8305212814370229, + "grad_norm": 0.015854516997933388, + "learning_rate": 1.7021377508343177e-05, + "loss": 0.2696486711502075, + "step": 193450 + }, + { + "epoch": 0.8305642135270429, + "grad_norm": 0.015382854267954826, + "learning_rate": 1.7017065788225555e-05, + "loss": 0.04382437169551849, + "step": 193460 + }, + { + "epoch": 0.830607145617063, + "grad_norm": 1.2499767541885376, + "learning_rate": 1.7012754068107932e-05, + "loss": 0.2440438985824585, + "step": 193470 + }, + { + "epoch": 0.830650077707083, + "grad_norm": 0.0012643365189433098, + "learning_rate": 1.700844234799031e-05, + "loss": 0.29134511947631836, + "step": 193480 + }, + { + "epoch": 0.8306930097971029, + "grad_norm": 2.6301071643829346, + "learning_rate": 1.7004130627872683e-05, + "loss": 0.42733354568481446, + "step": 193490 + }, + { + "epoch": 0.830735941887123, + "grad_norm": 0.018867680802941322, + "learning_rate": 1.699981890775506e-05, + "loss": 0.12103630304336548, + "step": 193500 + }, + { + "epoch": 0.830778873977143, + "grad_norm": 0.5415545105934143, + "learning_rate": 1.6995507187637435e-05, + "loss": 0.18488938808441163, + "step": 193510 + }, + { + "epoch": 0.8308218060671629, + "grad_norm": 10.19937801361084, + "learning_rate": 1.6991195467519812e-05, + "loss": 0.5037790775299072, + "step": 193520 + }, + { + "epoch": 0.830864738157183, + "grad_norm": 0.039546895772218704, + "learning_rate": 1.698688374740219e-05, + "loss": 0.08394562602043151, + "step": 193530 + }, + { + "epoch": 0.830907670247203, + "grad_norm": 0.06411205977201462, + "learning_rate": 1.6982572027284567e-05, + "loss": 0.19905096292495728, + "step": 193540 + }, + { + "epoch": 0.830950602337223, + "grad_norm": 0.005737802479416132, + "learning_rate": 1.6978260307166944e-05, + "loss": 0.1987488627433777, + "step": 193550 + }, + { + "epoch": 0.830993534427243, + "grad_norm": 0.017030853778123856, + "learning_rate": 1.6973948587049318e-05, + "loss": 0.21782851219177246, + "step": 193560 + }, + { + "epoch": 0.831036466517263, + "grad_norm": 0.2145611047744751, + "learning_rate": 1.6969636866931695e-05, + "loss": 0.23176918029785157, + "step": 193570 + }, + { + "epoch": 0.831079398607283, + "grad_norm": 0.0035232685040682554, + "learning_rate": 1.696532514681407e-05, + "loss": 0.004082301631569862, + "step": 193580 + }, + { + "epoch": 0.831122330697303, + "grad_norm": 1.625364899635315, + "learning_rate": 1.6961013426696447e-05, + "loss": 0.23201777935028076, + "step": 193590 + }, + { + "epoch": 0.8311652627873231, + "grad_norm": 0.777439534664154, + "learning_rate": 1.6956701706578824e-05, + "loss": 0.0941228985786438, + "step": 193600 + }, + { + "epoch": 0.831208194877343, + "grad_norm": 0.0933847650885582, + "learning_rate": 1.69523899864612e-05, + "loss": 0.027557867765426635, + "step": 193610 + }, + { + "epoch": 0.831251126967363, + "grad_norm": 0.3298414945602417, + "learning_rate": 1.6948078266343575e-05, + "loss": 0.3442185878753662, + "step": 193620 + }, + { + "epoch": 0.8312940590573831, + "grad_norm": 0.032382167875766754, + "learning_rate": 1.6943766546225953e-05, + "loss": 0.14262547492980956, + "step": 193630 + }, + { + "epoch": 0.831336991147403, + "grad_norm": 0.09118391573429108, + "learning_rate": 1.6939454826108327e-05, + "loss": 0.1025362491607666, + "step": 193640 + }, + { + "epoch": 0.831379923237423, + "grad_norm": 0.010061063803732395, + "learning_rate": 1.6935143105990704e-05, + "loss": 0.08400711417198181, + "step": 193650 + }, + { + "epoch": 0.8314228553274431, + "grad_norm": 1.2069307565689087, + "learning_rate": 1.693083138587308e-05, + "loss": 0.2209791660308838, + "step": 193660 + }, + { + "epoch": 0.831465787417463, + "grad_norm": 2.28737735748291, + "learning_rate": 1.692651966575546e-05, + "loss": 0.1758479356765747, + "step": 193670 + }, + { + "epoch": 0.8315087195074831, + "grad_norm": 1.1508358716964722, + "learning_rate": 1.6922207945637833e-05, + "loss": 0.13885608911514283, + "step": 193680 + }, + { + "epoch": 0.8315516515975031, + "grad_norm": 0.05167385935783386, + "learning_rate": 1.691789622552021e-05, + "loss": 0.2271339178085327, + "step": 193690 + }, + { + "epoch": 0.831594583687523, + "grad_norm": 0.10447119921445847, + "learning_rate": 1.6913584505402584e-05, + "loss": 0.08349364995956421, + "step": 193700 + }, + { + "epoch": 0.8316375157775431, + "grad_norm": 0.01147378422319889, + "learning_rate": 1.6909272785284965e-05, + "loss": 0.09823180437088012, + "step": 193710 + }, + { + "epoch": 0.8316804478675631, + "grad_norm": 1.3284432888031006, + "learning_rate": 1.690496106516734e-05, + "loss": 0.1961848258972168, + "step": 193720 + }, + { + "epoch": 0.831723379957583, + "grad_norm": 5.696112155914307, + "learning_rate": 1.6900649345049716e-05, + "loss": 0.2415191650390625, + "step": 193730 + }, + { + "epoch": 0.8317663120476031, + "grad_norm": 12.25600528717041, + "learning_rate": 1.689633762493209e-05, + "loss": 0.21733403205871582, + "step": 193740 + }, + { + "epoch": 0.8318092441376231, + "grad_norm": 0.0029865370597690344, + "learning_rate": 1.6892025904814467e-05, + "loss": 0.2730604887008667, + "step": 193750 + }, + { + "epoch": 0.8318521762276431, + "grad_norm": 0.19473698735237122, + "learning_rate": 1.688771418469684e-05, + "loss": 0.0037568826228380203, + "step": 193760 + }, + { + "epoch": 0.8318951083176631, + "grad_norm": 0.04992964491248131, + "learning_rate": 1.6883402464579222e-05, + "loss": 0.19734233617782593, + "step": 193770 + }, + { + "epoch": 0.8319380404076832, + "grad_norm": 0.1691906601190567, + "learning_rate": 1.6879090744461596e-05, + "loss": 0.10528583526611328, + "step": 193780 + }, + { + "epoch": 0.8319809724977031, + "grad_norm": 0.0038565825670957565, + "learning_rate": 1.6874779024343973e-05, + "loss": 0.4071638107299805, + "step": 193790 + }, + { + "epoch": 0.8320239045877231, + "grad_norm": 0.030594274401664734, + "learning_rate": 1.6870467304226347e-05, + "loss": 0.16759810447692872, + "step": 193800 + }, + { + "epoch": 0.8320668366777432, + "grad_norm": 1.1685280799865723, + "learning_rate": 1.6866155584108725e-05, + "loss": 0.1373188853263855, + "step": 193810 + }, + { + "epoch": 0.8321097687677631, + "grad_norm": 2.992621660232544, + "learning_rate": 1.6861843863991102e-05, + "loss": 0.22395699024200438, + "step": 193820 + }, + { + "epoch": 0.8321527008577831, + "grad_norm": 0.08463391661643982, + "learning_rate": 1.685753214387348e-05, + "loss": 0.25692574977874755, + "step": 193830 + }, + { + "epoch": 0.8321956329478032, + "grad_norm": 0.002346848836168647, + "learning_rate": 1.6853220423755853e-05, + "loss": 0.3371147394180298, + "step": 193840 + }, + { + "epoch": 0.8322385650378231, + "grad_norm": 0.02107856795191765, + "learning_rate": 1.684890870363823e-05, + "loss": 0.0994668424129486, + "step": 193850 + }, + { + "epoch": 0.8322814971278432, + "grad_norm": 0.007043390069156885, + "learning_rate": 1.6844596983520604e-05, + "loss": 0.26696507930755614, + "step": 193860 + }, + { + "epoch": 0.8323244292178632, + "grad_norm": 2.924276351928711, + "learning_rate": 1.6840285263402982e-05, + "loss": 0.24864647388458253, + "step": 193870 + }, + { + "epoch": 0.8323673613078831, + "grad_norm": 1.4473538398742676, + "learning_rate": 1.683597354328536e-05, + "loss": 0.18037652969360352, + "step": 193880 + }, + { + "epoch": 0.8324102933979032, + "grad_norm": 2.4899935722351074, + "learning_rate": 1.6831661823167736e-05, + "loss": 0.14435986280441285, + "step": 193890 + }, + { + "epoch": 0.8324532254879232, + "grad_norm": 0.00426316587254405, + "learning_rate": 1.6827350103050114e-05, + "loss": 0.39287576675415037, + "step": 193900 + }, + { + "epoch": 0.8324961575779433, + "grad_norm": 0.003730037249624729, + "learning_rate": 1.6823038382932488e-05, + "loss": 0.06763601303100586, + "step": 193910 + }, + { + "epoch": 0.8325390896679632, + "grad_norm": 0.04255102202296257, + "learning_rate": 1.6818726662814865e-05, + "loss": 0.20267326831817628, + "step": 193920 + }, + { + "epoch": 0.8325820217579832, + "grad_norm": 0.03916976973414421, + "learning_rate": 1.681441494269724e-05, + "loss": 0.1570077896118164, + "step": 193930 + }, + { + "epoch": 0.8326249538480033, + "grad_norm": 0.331269770860672, + "learning_rate": 1.6810103222579616e-05, + "loss": 0.21268806457519532, + "step": 193940 + }, + { + "epoch": 0.8326678859380232, + "grad_norm": 0.013200412504374981, + "learning_rate": 1.6805791502461994e-05, + "loss": 0.24857487678527831, + "step": 193950 + }, + { + "epoch": 0.8327108180280433, + "grad_norm": 0.13833187520503998, + "learning_rate": 1.680147978234437e-05, + "loss": 0.00927966982126236, + "step": 193960 + }, + { + "epoch": 0.8327537501180633, + "grad_norm": 0.020017297938466072, + "learning_rate": 1.6797168062226745e-05, + "loss": 0.046907836198806764, + "step": 193970 + }, + { + "epoch": 0.8327966822080832, + "grad_norm": 0.0073755038902163506, + "learning_rate": 1.6792856342109122e-05, + "loss": 0.27490532398223877, + "step": 193980 + }, + { + "epoch": 0.8328396142981033, + "grad_norm": 0.07011353224515915, + "learning_rate": 1.6788544621991496e-05, + "loss": 0.30378525257110595, + "step": 193990 + }, + { + "epoch": 0.8328825463881233, + "grad_norm": 0.0438222661614418, + "learning_rate": 1.6784232901873874e-05, + "loss": 0.0050745390355587, + "step": 194000 + }, + { + "epoch": 0.8328825463881233, + "eval_loss": 0.3831603527069092, + "eval_runtime": 27.471, + "eval_samples_per_second": 3.64, + "eval_steps_per_second": 3.64, + "step": 194000 + }, + { + "epoch": 0.8329254784781432, + "grad_norm": 1.8510911464691162, + "learning_rate": 1.677992118175625e-05, + "loss": 0.2678192138671875, + "step": 194010 + }, + { + "epoch": 0.8329684105681633, + "grad_norm": 0.7535163164138794, + "learning_rate": 1.677560946163863e-05, + "loss": 0.3478584289550781, + "step": 194020 + }, + { + "epoch": 0.8330113426581833, + "grad_norm": 0.014061828143894672, + "learning_rate": 1.6771297741521002e-05, + "loss": 0.09568552970886231, + "step": 194030 + }, + { + "epoch": 0.8330542747482033, + "grad_norm": 2.01088285446167, + "learning_rate": 1.676698602140338e-05, + "loss": 0.41285243034362795, + "step": 194040 + }, + { + "epoch": 0.8330972068382233, + "grad_norm": 0.00961384829133749, + "learning_rate": 1.6762674301285754e-05, + "loss": 0.2360004425048828, + "step": 194050 + }, + { + "epoch": 0.8331401389282433, + "grad_norm": 166.18780517578125, + "learning_rate": 1.6758362581168134e-05, + "loss": 0.2200385332107544, + "step": 194060 + }, + { + "epoch": 0.8331830710182633, + "grad_norm": 0.002402801066637039, + "learning_rate": 1.675405086105051e-05, + "loss": 0.3726295709609985, + "step": 194070 + }, + { + "epoch": 0.8332260031082833, + "grad_norm": 0.007039608899503946, + "learning_rate": 1.6749739140932886e-05, + "loss": 0.06041044592857361, + "step": 194080 + }, + { + "epoch": 0.8332689351983034, + "grad_norm": 0.04404909536242485, + "learning_rate": 1.674542742081526e-05, + "loss": 0.10072480440139771, + "step": 194090 + }, + { + "epoch": 0.8333118672883233, + "grad_norm": 0.13793088495731354, + "learning_rate": 1.6741115700697637e-05, + "loss": 0.053259152173995974, + "step": 194100 + }, + { + "epoch": 0.8333547993783433, + "grad_norm": 0.02142341621220112, + "learning_rate": 1.673680398058001e-05, + "loss": 0.10477865934371948, + "step": 194110 + }, + { + "epoch": 0.8333977314683634, + "grad_norm": 0.015102100558578968, + "learning_rate": 1.673249226046239e-05, + "loss": 0.21803498268127441, + "step": 194120 + }, + { + "epoch": 0.8334406635583833, + "grad_norm": 0.39184778928756714, + "learning_rate": 1.6728180540344766e-05, + "loss": 0.15625743865966796, + "step": 194130 + }, + { + "epoch": 0.8334835956484034, + "grad_norm": 0.16577816009521484, + "learning_rate": 1.6723868820227143e-05, + "loss": 0.1322509765625, + "step": 194140 + }, + { + "epoch": 0.8335265277384234, + "grad_norm": 0.35797154903411865, + "learning_rate": 1.6719557100109517e-05, + "loss": 0.20927348136901855, + "step": 194150 + }, + { + "epoch": 0.8335694598284433, + "grad_norm": 0.7359702587127686, + "learning_rate": 1.6715245379991894e-05, + "loss": 0.059522110223770144, + "step": 194160 + }, + { + "epoch": 0.8336123919184634, + "grad_norm": 0.015525028109550476, + "learning_rate": 1.671093365987427e-05, + "loss": 0.23056964874267577, + "step": 194170 + }, + { + "epoch": 0.8336553240084834, + "grad_norm": 0.009239768609404564, + "learning_rate": 1.670662193975665e-05, + "loss": 0.0353708952665329, + "step": 194180 + }, + { + "epoch": 0.8336982560985033, + "grad_norm": 0.00043988885590806603, + "learning_rate": 1.6702310219639023e-05, + "loss": 0.2876828670501709, + "step": 194190 + }, + { + "epoch": 0.8337411881885234, + "grad_norm": 0.0003478638536762446, + "learning_rate": 1.66979984995214e-05, + "loss": 0.12474907636642456, + "step": 194200 + }, + { + "epoch": 0.8337841202785434, + "grad_norm": 1.505427360534668, + "learning_rate": 1.6693686779403774e-05, + "loss": 0.16469504833221435, + "step": 194210 + }, + { + "epoch": 0.8338270523685634, + "grad_norm": 1.9175589084625244, + "learning_rate": 1.668937505928615e-05, + "loss": 0.3978353261947632, + "step": 194220 + }, + { + "epoch": 0.8338699844585834, + "grad_norm": 0.22217345237731934, + "learning_rate": 1.668506333916853e-05, + "loss": 0.16778547763824464, + "step": 194230 + }, + { + "epoch": 0.8339129165486034, + "grad_norm": 2.0455987453460693, + "learning_rate": 1.6680751619050906e-05, + "loss": 0.2525254011154175, + "step": 194240 + }, + { + "epoch": 0.8339558486386234, + "grad_norm": 0.008584201335906982, + "learning_rate": 1.6676439898933284e-05, + "loss": 0.22424793243408203, + "step": 194250 + }, + { + "epoch": 0.8339987807286434, + "grad_norm": 1.2639451026916504, + "learning_rate": 1.6672128178815658e-05, + "loss": 0.2725630283355713, + "step": 194260 + }, + { + "epoch": 0.8340417128186635, + "grad_norm": 0.06333382427692413, + "learning_rate": 1.6667816458698035e-05, + "loss": 0.20261473655700685, + "step": 194270 + }, + { + "epoch": 0.8340846449086834, + "grad_norm": 0.17632277309894562, + "learning_rate": 1.666350473858041e-05, + "loss": 0.06435710787773133, + "step": 194280 + }, + { + "epoch": 0.8341275769987034, + "grad_norm": 0.01805727928876877, + "learning_rate": 1.6659193018462786e-05, + "loss": 0.1538480281829834, + "step": 194290 + }, + { + "epoch": 0.8341705090887235, + "grad_norm": 0.0022346056066453457, + "learning_rate": 1.6654881298345164e-05, + "loss": 0.2566747426986694, + "step": 194300 + }, + { + "epoch": 0.8342134411787434, + "grad_norm": 1.2149933576583862, + "learning_rate": 1.665056957822754e-05, + "loss": 0.15987224578857423, + "step": 194310 + }, + { + "epoch": 0.8342563732687635, + "grad_norm": 0.003662853967398405, + "learning_rate": 1.6646257858109915e-05, + "loss": 0.2221593141555786, + "step": 194320 + }, + { + "epoch": 0.8342993053587835, + "grad_norm": 1.8059449195861816, + "learning_rate": 1.6641946137992292e-05, + "loss": 0.10572590827941894, + "step": 194330 + }, + { + "epoch": 0.8343422374488035, + "grad_norm": 0.08389879763126373, + "learning_rate": 1.6637634417874666e-05, + "loss": 0.24218251705169677, + "step": 194340 + }, + { + "epoch": 0.8343851695388235, + "grad_norm": 0.18546342849731445, + "learning_rate": 1.6633322697757043e-05, + "loss": 0.15501357316970826, + "step": 194350 + }, + { + "epoch": 0.8344281016288435, + "grad_norm": 0.1114467978477478, + "learning_rate": 1.662901097763942e-05, + "loss": 0.20605828762054443, + "step": 194360 + }, + { + "epoch": 0.8344710337188636, + "grad_norm": 0.008445865474641323, + "learning_rate": 1.6624699257521798e-05, + "loss": 0.14372949600219725, + "step": 194370 + }, + { + "epoch": 0.8345139658088835, + "grad_norm": 1.4896812438964844, + "learning_rate": 1.6620387537404172e-05, + "loss": 0.2524362564086914, + "step": 194380 + }, + { + "epoch": 0.8345568978989035, + "grad_norm": 0.013691961765289307, + "learning_rate": 1.661607581728655e-05, + "loss": 0.14471871852874757, + "step": 194390 + }, + { + "epoch": 0.8345998299889236, + "grad_norm": 4.92308235168457, + "learning_rate": 1.6611764097168923e-05, + "loss": 0.2559764862060547, + "step": 194400 + }, + { + "epoch": 0.8346427620789435, + "grad_norm": 2.5394248962402344, + "learning_rate": 1.66074523770513e-05, + "loss": 0.24362993240356445, + "step": 194410 + }, + { + "epoch": 0.8346856941689635, + "grad_norm": 0.277831107378006, + "learning_rate": 1.6603140656933678e-05, + "loss": 0.12817366123199464, + "step": 194420 + }, + { + "epoch": 0.8347286262589836, + "grad_norm": 0.0360804982483387, + "learning_rate": 1.6598828936816055e-05, + "loss": 0.07550318837165833, + "step": 194430 + }, + { + "epoch": 0.8347715583490035, + "grad_norm": 0.03386925160884857, + "learning_rate": 1.659451721669843e-05, + "loss": 0.06955471634864807, + "step": 194440 + }, + { + "epoch": 0.8348144904390236, + "grad_norm": 0.004523274954408407, + "learning_rate": 1.6590205496580807e-05, + "loss": 0.12661986351013182, + "step": 194450 + }, + { + "epoch": 0.8348574225290436, + "grad_norm": 0.06306543201208115, + "learning_rate": 1.658589377646318e-05, + "loss": 0.16889942884445192, + "step": 194460 + }, + { + "epoch": 0.8349003546190635, + "grad_norm": 3.41239595413208, + "learning_rate": 1.658158205634556e-05, + "loss": 0.2694386959075928, + "step": 194470 + }, + { + "epoch": 0.8349432867090836, + "grad_norm": 0.033036649227142334, + "learning_rate": 1.6577270336227935e-05, + "loss": 0.1256989598274231, + "step": 194480 + }, + { + "epoch": 0.8349862187991036, + "grad_norm": 0.1092417761683464, + "learning_rate": 1.6572958616110313e-05, + "loss": 0.08331429362297058, + "step": 194490 + }, + { + "epoch": 0.8350291508891236, + "grad_norm": 1.1124284267425537, + "learning_rate": 1.6568646895992687e-05, + "loss": 0.2619120359420776, + "step": 194500 + }, + { + "epoch": 0.8350720829791436, + "grad_norm": 0.020524216815829277, + "learning_rate": 1.6564335175875064e-05, + "loss": 0.12931500673294066, + "step": 194510 + }, + { + "epoch": 0.8351150150691636, + "grad_norm": 5.772107124328613, + "learning_rate": 1.6560023455757438e-05, + "loss": 0.1707077741622925, + "step": 194520 + }, + { + "epoch": 0.8351579471591836, + "grad_norm": 1.7792444229125977, + "learning_rate": 1.655571173563982e-05, + "loss": 0.21941332817077636, + "step": 194530 + }, + { + "epoch": 0.8352008792492036, + "grad_norm": 0.0069997734390199184, + "learning_rate": 1.6551400015522193e-05, + "loss": 0.29697065353393554, + "step": 194540 + }, + { + "epoch": 0.8352438113392237, + "grad_norm": 0.35176554322242737, + "learning_rate": 1.654708829540457e-05, + "loss": 0.37339417934417723, + "step": 194550 + }, + { + "epoch": 0.8352867434292436, + "grad_norm": 0.1942061185836792, + "learning_rate": 1.6542776575286944e-05, + "loss": 0.276427698135376, + "step": 194560 + }, + { + "epoch": 0.8353296755192636, + "grad_norm": 1.3757528066635132, + "learning_rate": 1.653846485516932e-05, + "loss": 0.19914722442626953, + "step": 194570 + }, + { + "epoch": 0.8353726076092837, + "grad_norm": 0.005043280776590109, + "learning_rate": 1.65341531350517e-05, + "loss": 0.24934778213500977, + "step": 194580 + }, + { + "epoch": 0.8354155396993036, + "grad_norm": 0.06708847731351852, + "learning_rate": 1.6529841414934076e-05, + "loss": 0.16489659547805785, + "step": 194590 + }, + { + "epoch": 0.8354584717893236, + "grad_norm": 2.7416841983795166, + "learning_rate": 1.652552969481645e-05, + "loss": 0.30127294063568116, + "step": 194600 + }, + { + "epoch": 0.8355014038793437, + "grad_norm": 0.2182602882385254, + "learning_rate": 1.6521217974698827e-05, + "loss": 0.45470247268676756, + "step": 194610 + }, + { + "epoch": 0.8355443359693636, + "grad_norm": 1.337004542350769, + "learning_rate": 1.6516906254581205e-05, + "loss": 0.27136123180389404, + "step": 194620 + }, + { + "epoch": 0.8355872680593837, + "grad_norm": 0.018352841958403587, + "learning_rate": 1.651259453446358e-05, + "loss": 0.3394086122512817, + "step": 194630 + }, + { + "epoch": 0.8356302001494037, + "grad_norm": 0.003774836892262101, + "learning_rate": 1.6508282814345956e-05, + "loss": 0.0814660906791687, + "step": 194640 + }, + { + "epoch": 0.8356731322394236, + "grad_norm": 0.007637556176632643, + "learning_rate": 1.6503971094228333e-05, + "loss": 0.1790636420249939, + "step": 194650 + }, + { + "epoch": 0.8357160643294437, + "grad_norm": 1.6730716228485107, + "learning_rate": 1.649965937411071e-05, + "loss": 0.29316766262054444, + "step": 194660 + }, + { + "epoch": 0.8357589964194637, + "grad_norm": 0.005538949277251959, + "learning_rate": 1.6495347653993085e-05, + "loss": 0.16200257539749147, + "step": 194670 + }, + { + "epoch": 0.8358019285094836, + "grad_norm": 1.8531112670898438, + "learning_rate": 1.6491035933875462e-05, + "loss": 0.3488029718399048, + "step": 194680 + }, + { + "epoch": 0.8358448605995037, + "grad_norm": 1.9031802415847778, + "learning_rate": 1.6486724213757836e-05, + "loss": 0.4225339412689209, + "step": 194690 + }, + { + "epoch": 0.8358877926895237, + "grad_norm": 0.03141864761710167, + "learning_rate": 1.6482412493640213e-05, + "loss": 0.07720988988876343, + "step": 194700 + }, + { + "epoch": 0.8359307247795437, + "grad_norm": 4.636709690093994, + "learning_rate": 1.647810077352259e-05, + "loss": 0.2965707778930664, + "step": 194710 + }, + { + "epoch": 0.8359736568695637, + "grad_norm": 0.011487384326756, + "learning_rate": 1.6473789053404968e-05, + "loss": 0.17508405447006226, + "step": 194720 + }, + { + "epoch": 0.8360165889595838, + "grad_norm": 0.00859050638973713, + "learning_rate": 1.6469477333287342e-05, + "loss": 0.2810189962387085, + "step": 194730 + }, + { + "epoch": 0.8360595210496037, + "grad_norm": 0.008925511501729488, + "learning_rate": 1.646516561316972e-05, + "loss": 0.01954289376735687, + "step": 194740 + }, + { + "epoch": 0.8361024531396237, + "grad_norm": 4.719195365905762, + "learning_rate": 1.6460853893052093e-05, + "loss": 0.33624041080474854, + "step": 194750 + }, + { + "epoch": 0.8361453852296438, + "grad_norm": 129.30929565429688, + "learning_rate": 1.645654217293447e-05, + "loss": 0.23399410247802735, + "step": 194760 + }, + { + "epoch": 0.8361883173196638, + "grad_norm": 0.4405730068683624, + "learning_rate": 1.6452230452816848e-05, + "loss": 0.1191325068473816, + "step": 194770 + }, + { + "epoch": 0.8362312494096837, + "grad_norm": 0.0006891182856634259, + "learning_rate": 1.6447918732699225e-05, + "loss": 0.3420095443725586, + "step": 194780 + }, + { + "epoch": 0.8362741814997038, + "grad_norm": 0.7859971523284912, + "learning_rate": 1.64436070125816e-05, + "loss": 0.3719822645187378, + "step": 194790 + }, + { + "epoch": 0.8363171135897238, + "grad_norm": 3.2730681896209717, + "learning_rate": 1.6439295292463976e-05, + "loss": 0.41228413581848145, + "step": 194800 + }, + { + "epoch": 0.8363600456797438, + "grad_norm": 1.8419042825698853, + "learning_rate": 1.643498357234635e-05, + "loss": 0.18629003763198854, + "step": 194810 + }, + { + "epoch": 0.8364029777697638, + "grad_norm": 0.25675979256629944, + "learning_rate": 1.643067185222873e-05, + "loss": 0.06031713485717773, + "step": 194820 + }, + { + "epoch": 0.8364459098597838, + "grad_norm": 0.029397347941994667, + "learning_rate": 1.6426360132111105e-05, + "loss": 0.22633352279663085, + "step": 194830 + }, + { + "epoch": 0.8364888419498038, + "grad_norm": 0.1383998543024063, + "learning_rate": 1.6422048411993482e-05, + "loss": 0.1639503836631775, + "step": 194840 + }, + { + "epoch": 0.8365317740398238, + "grad_norm": 1.5284881591796875, + "learning_rate": 1.6417736691875856e-05, + "loss": 0.16716455221176146, + "step": 194850 + }, + { + "epoch": 0.8365747061298439, + "grad_norm": 0.769423246383667, + "learning_rate": 1.6413424971758234e-05, + "loss": 0.2911246299743652, + "step": 194860 + }, + { + "epoch": 0.8366176382198638, + "grad_norm": 0.06036985665559769, + "learning_rate": 1.6409113251640608e-05, + "loss": 0.16936222314834595, + "step": 194870 + }, + { + "epoch": 0.8366605703098838, + "grad_norm": 4.0209879875183105, + "learning_rate": 1.640480153152299e-05, + "loss": 0.23061556816101075, + "step": 194880 + }, + { + "epoch": 0.8367035023999039, + "grad_norm": 2.87886118888855, + "learning_rate": 1.6400489811405362e-05, + "loss": 0.1437964677810669, + "step": 194890 + }, + { + "epoch": 0.8367464344899238, + "grad_norm": 0.0012091115349903703, + "learning_rate": 1.639617809128774e-05, + "loss": 0.10468262434005737, + "step": 194900 + }, + { + "epoch": 0.8367893665799438, + "grad_norm": 6.849912643432617, + "learning_rate": 1.6391866371170114e-05, + "loss": 0.3797173500061035, + "step": 194910 + }, + { + "epoch": 0.8368322986699639, + "grad_norm": 0.00793464481830597, + "learning_rate": 1.638755465105249e-05, + "loss": 0.31881911754608155, + "step": 194920 + }, + { + "epoch": 0.8368752307599838, + "grad_norm": 0.00296783191151917, + "learning_rate": 1.638324293093487e-05, + "loss": 0.06018228530883789, + "step": 194930 + }, + { + "epoch": 0.8369181628500039, + "grad_norm": 0.8054410219192505, + "learning_rate": 1.6378931210817246e-05, + "loss": 0.2937172412872314, + "step": 194940 + }, + { + "epoch": 0.8369610949400239, + "grad_norm": 0.054855868220329285, + "learning_rate": 1.637461949069962e-05, + "loss": 0.28021395206451416, + "step": 194950 + }, + { + "epoch": 0.8370040270300438, + "grad_norm": 0.05339659005403519, + "learning_rate": 1.6370307770581997e-05, + "loss": 0.2437211275100708, + "step": 194960 + }, + { + "epoch": 0.8370469591200639, + "grad_norm": 0.013190588913857937, + "learning_rate": 1.636599605046437e-05, + "loss": 0.21684250831604004, + "step": 194970 + }, + { + "epoch": 0.8370898912100839, + "grad_norm": 1.0553587675094604, + "learning_rate": 1.6361684330346748e-05, + "loss": 0.26926403045654296, + "step": 194980 + }, + { + "epoch": 0.8371328233001039, + "grad_norm": 0.05917760357260704, + "learning_rate": 1.6357372610229126e-05, + "loss": 0.1761362671852112, + "step": 194990 + }, + { + "epoch": 0.8371757553901239, + "grad_norm": 0.020496509969234467, + "learning_rate": 1.6353060890111503e-05, + "loss": 0.17508223056793212, + "step": 195000 + }, + { + "epoch": 0.8371757553901239, + "eval_loss": 0.3825985789299011, + "eval_runtime": 27.4396, + "eval_samples_per_second": 3.644, + "eval_steps_per_second": 3.644, + "step": 195000 + }, + { + "epoch": 0.8372186874801439, + "grad_norm": 0.007341762073338032, + "learning_rate": 1.634874916999388e-05, + "loss": 0.17335011959075927, + "step": 195010 + }, + { + "epoch": 0.8372616195701639, + "grad_norm": 2.7988274097442627, + "learning_rate": 1.6344437449876254e-05, + "loss": 0.09647920131683349, + "step": 195020 + }, + { + "epoch": 0.8373045516601839, + "grad_norm": 0.16739942133426666, + "learning_rate": 1.634012572975863e-05, + "loss": 0.2648929595947266, + "step": 195030 + }, + { + "epoch": 0.837347483750204, + "grad_norm": 0.8594076633453369, + "learning_rate": 1.6335814009641006e-05, + "loss": 0.121567702293396, + "step": 195040 + }, + { + "epoch": 0.8373904158402239, + "grad_norm": 0.008997962810099125, + "learning_rate": 1.6331502289523383e-05, + "loss": 0.3291927814483643, + "step": 195050 + }, + { + "epoch": 0.8374333479302439, + "grad_norm": 1.8808585405349731, + "learning_rate": 1.632719056940576e-05, + "loss": 0.2479541301727295, + "step": 195060 + }, + { + "epoch": 0.837476280020264, + "grad_norm": 0.26992496848106384, + "learning_rate": 1.6322878849288138e-05, + "loss": 0.31995177268981934, + "step": 195070 + }, + { + "epoch": 0.8375192121102839, + "grad_norm": 0.7552814483642578, + "learning_rate": 1.631856712917051e-05, + "loss": 0.37452163696289065, + "step": 195080 + }, + { + "epoch": 0.837562144200304, + "grad_norm": 0.01972191222012043, + "learning_rate": 1.631425540905289e-05, + "loss": 0.27466640472412107, + "step": 195090 + }, + { + "epoch": 0.837605076290324, + "grad_norm": 0.03341887891292572, + "learning_rate": 1.6309943688935263e-05, + "loss": 0.020095197856426238, + "step": 195100 + }, + { + "epoch": 0.8376480083803439, + "grad_norm": 0.01714405231177807, + "learning_rate": 1.630563196881764e-05, + "loss": 0.2320852279663086, + "step": 195110 + }, + { + "epoch": 0.837690940470364, + "grad_norm": 6.817076683044434, + "learning_rate": 1.6301320248700018e-05, + "loss": 0.33026838302612305, + "step": 195120 + }, + { + "epoch": 0.837733872560384, + "grad_norm": 0.2995063364505768, + "learning_rate": 1.6297008528582395e-05, + "loss": 0.06860451102256775, + "step": 195130 + }, + { + "epoch": 0.8377768046504039, + "grad_norm": 0.29282864928245544, + "learning_rate": 1.629269680846477e-05, + "loss": 0.08025183081626892, + "step": 195140 + }, + { + "epoch": 0.837819736740424, + "grad_norm": 0.6161121726036072, + "learning_rate": 1.6288385088347146e-05, + "loss": 0.14255157709121705, + "step": 195150 + }, + { + "epoch": 0.837862668830444, + "grad_norm": 0.008154317736625671, + "learning_rate": 1.628407336822952e-05, + "loss": 0.19751204252243043, + "step": 195160 + }, + { + "epoch": 0.837905600920464, + "grad_norm": 0.030738957226276398, + "learning_rate": 1.62797616481119e-05, + "loss": 0.07990161180496216, + "step": 195170 + }, + { + "epoch": 0.837948533010484, + "grad_norm": 0.06262635439634323, + "learning_rate": 1.6275449927994275e-05, + "loss": 0.10323004722595215, + "step": 195180 + }, + { + "epoch": 0.837991465100504, + "grad_norm": 0.16967739164829254, + "learning_rate": 1.6271138207876652e-05, + "loss": 0.13956884145736695, + "step": 195190 + }, + { + "epoch": 0.8380343971905241, + "grad_norm": 0.008536574430763721, + "learning_rate": 1.6266826487759026e-05, + "loss": 0.06692397594451904, + "step": 195200 + }, + { + "epoch": 0.838077329280544, + "grad_norm": 0.1352798193693161, + "learning_rate": 1.6262514767641403e-05, + "loss": 0.40900468826293945, + "step": 195210 + }, + { + "epoch": 0.8381202613705641, + "grad_norm": 3.675179958343506, + "learning_rate": 1.6258203047523777e-05, + "loss": 0.3779770374298096, + "step": 195220 + }, + { + "epoch": 0.8381631934605841, + "grad_norm": 0.16516053676605225, + "learning_rate": 1.6253891327406158e-05, + "loss": 0.12357484102249146, + "step": 195230 + }, + { + "epoch": 0.838206125550604, + "grad_norm": 0.1563853621482849, + "learning_rate": 1.6249579607288532e-05, + "loss": 0.013473853468894958, + "step": 195240 + }, + { + "epoch": 0.8382490576406241, + "grad_norm": 0.005513192620128393, + "learning_rate": 1.624526788717091e-05, + "loss": 0.0922442376613617, + "step": 195250 + }, + { + "epoch": 0.8382919897306441, + "grad_norm": 0.019567089155316353, + "learning_rate": 1.6240956167053283e-05, + "loss": 0.13599594831466674, + "step": 195260 + }, + { + "epoch": 0.838334921820664, + "grad_norm": 0.07949855923652649, + "learning_rate": 1.623664444693566e-05, + "loss": 0.12350625991821289, + "step": 195270 + }, + { + "epoch": 0.8383778539106841, + "grad_norm": 0.03716310113668442, + "learning_rate": 1.6232332726818038e-05, + "loss": 0.18347951173782348, + "step": 195280 + }, + { + "epoch": 0.8384207860007041, + "grad_norm": 0.0044286614283919334, + "learning_rate": 1.6228021006700415e-05, + "loss": 0.08632364869117737, + "step": 195290 + }, + { + "epoch": 0.8384637180907241, + "grad_norm": 1.718677043914795, + "learning_rate": 1.622370928658279e-05, + "loss": 0.31393024921417234, + "step": 195300 + }, + { + "epoch": 0.8385066501807441, + "grad_norm": 0.00345993647351861, + "learning_rate": 1.6219397566465167e-05, + "loss": 0.2210599184036255, + "step": 195310 + }, + { + "epoch": 0.8385495822707641, + "grad_norm": 0.14155049622058868, + "learning_rate": 1.621508584634754e-05, + "loss": 0.2827406644821167, + "step": 195320 + }, + { + "epoch": 0.8385925143607841, + "grad_norm": 0.009309999644756317, + "learning_rate": 1.6210774126229918e-05, + "loss": 0.38733735084533694, + "step": 195330 + }, + { + "epoch": 0.8386354464508041, + "grad_norm": 0.2578687071800232, + "learning_rate": 1.6206462406112295e-05, + "loss": 0.13358743190765382, + "step": 195340 + }, + { + "epoch": 0.8386783785408242, + "grad_norm": 0.024788912385702133, + "learning_rate": 1.6202150685994673e-05, + "loss": 0.22722995281219482, + "step": 195350 + }, + { + "epoch": 0.8387213106308441, + "grad_norm": 1.2188549041748047, + "learning_rate": 1.619783896587705e-05, + "loss": 0.18080921173095704, + "step": 195360 + }, + { + "epoch": 0.8387642427208641, + "grad_norm": 0.03402552753686905, + "learning_rate": 1.6193527245759424e-05, + "loss": 0.16962271928787231, + "step": 195370 + }, + { + "epoch": 0.8388071748108842, + "grad_norm": 1.3098325729370117, + "learning_rate": 1.61892155256418e-05, + "loss": 0.1280285358428955, + "step": 195380 + }, + { + "epoch": 0.8388501069009041, + "grad_norm": 0.08495664596557617, + "learning_rate": 1.6184903805524175e-05, + "loss": 0.10631186962127685, + "step": 195390 + }, + { + "epoch": 0.8388930389909242, + "grad_norm": 0.005879923235625029, + "learning_rate": 1.6180592085406553e-05, + "loss": 0.3474308967590332, + "step": 195400 + }, + { + "epoch": 0.8389359710809442, + "grad_norm": 0.004986526444554329, + "learning_rate": 1.617628036528893e-05, + "loss": 0.02645334303379059, + "step": 195410 + }, + { + "epoch": 0.8389789031709641, + "grad_norm": 0.0748465284705162, + "learning_rate": 1.6171968645171307e-05, + "loss": 0.09948440790176391, + "step": 195420 + }, + { + "epoch": 0.8390218352609842, + "grad_norm": 0.00232899421826005, + "learning_rate": 1.616765692505368e-05, + "loss": 0.1367961049079895, + "step": 195430 + }, + { + "epoch": 0.8390647673510042, + "grad_norm": 0.0858355164527893, + "learning_rate": 1.616334520493606e-05, + "loss": 0.1316709876060486, + "step": 195440 + }, + { + "epoch": 0.8391076994410241, + "grad_norm": 0.024373479187488556, + "learning_rate": 1.6159033484818433e-05, + "loss": 0.35381669998168946, + "step": 195450 + }, + { + "epoch": 0.8391506315310442, + "grad_norm": 0.007658019196242094, + "learning_rate": 1.615472176470081e-05, + "loss": 0.1145735502243042, + "step": 195460 + }, + { + "epoch": 0.8391935636210642, + "grad_norm": 4.139742374420166, + "learning_rate": 1.6150410044583187e-05, + "loss": 0.20845775604248046, + "step": 195470 + }, + { + "epoch": 0.8392364957110842, + "grad_norm": 0.07298199832439423, + "learning_rate": 1.6146098324465565e-05, + "loss": 0.14519520998001098, + "step": 195480 + }, + { + "epoch": 0.8392794278011042, + "grad_norm": 0.04505956918001175, + "learning_rate": 1.614178660434794e-05, + "loss": 0.19180811643600465, + "step": 195490 + }, + { + "epoch": 0.8393223598911242, + "grad_norm": 0.5236793160438538, + "learning_rate": 1.6137474884230316e-05, + "loss": 0.26501855850219724, + "step": 195500 + }, + { + "epoch": 0.8393652919811442, + "grad_norm": 1.8800487518310547, + "learning_rate": 1.613316316411269e-05, + "loss": 0.19736210107803345, + "step": 195510 + }, + { + "epoch": 0.8394082240711642, + "grad_norm": 0.018839063122868538, + "learning_rate": 1.6128851443995067e-05, + "loss": 0.20923397541046143, + "step": 195520 + }, + { + "epoch": 0.8394511561611843, + "grad_norm": 0.009148648008704185, + "learning_rate": 1.6124539723877445e-05, + "loss": 0.21486268043518067, + "step": 195530 + }, + { + "epoch": 0.8394940882512042, + "grad_norm": 0.010658112354576588, + "learning_rate": 1.6120228003759822e-05, + "loss": 0.24198806285858154, + "step": 195540 + }, + { + "epoch": 0.8395370203412242, + "grad_norm": 2.198223352432251, + "learning_rate": 1.6115916283642196e-05, + "loss": 0.17943172454833983, + "step": 195550 + }, + { + "epoch": 0.8395799524312443, + "grad_norm": 0.008235502988100052, + "learning_rate": 1.6111604563524573e-05, + "loss": 0.1194198489189148, + "step": 195560 + }, + { + "epoch": 0.8396228845212642, + "grad_norm": 8.622540473937988, + "learning_rate": 1.6107292843406947e-05, + "loss": 0.2828612565994263, + "step": 195570 + }, + { + "epoch": 0.8396658166112843, + "grad_norm": 0.01411815918982029, + "learning_rate": 1.6102981123289328e-05, + "loss": 0.20698959827423097, + "step": 195580 + }, + { + "epoch": 0.8397087487013043, + "grad_norm": 0.015219231136143208, + "learning_rate": 1.6098669403171702e-05, + "loss": 0.04689061641693115, + "step": 195590 + }, + { + "epoch": 0.8397516807913242, + "grad_norm": 7.192612171173096, + "learning_rate": 1.609435768305408e-05, + "loss": 0.4910551071166992, + "step": 195600 + }, + { + "epoch": 0.8397946128813443, + "grad_norm": 0.047113098204135895, + "learning_rate": 1.6090045962936453e-05, + "loss": 0.0928109884262085, + "step": 195610 + }, + { + "epoch": 0.8398375449713643, + "grad_norm": 2.3072288036346436, + "learning_rate": 1.608573424281883e-05, + "loss": 0.25140502452850344, + "step": 195620 + }, + { + "epoch": 0.8398804770613844, + "grad_norm": 0.09699340909719467, + "learning_rate": 1.6081422522701204e-05, + "loss": 0.02875358462333679, + "step": 195630 + }, + { + "epoch": 0.8399234091514043, + "grad_norm": 1.5584121942520142, + "learning_rate": 1.6077110802583585e-05, + "loss": 0.1507789373397827, + "step": 195640 + }, + { + "epoch": 0.8399663412414243, + "grad_norm": 0.008640035055577755, + "learning_rate": 1.607279908246596e-05, + "loss": 0.31208076477050783, + "step": 195650 + }, + { + "epoch": 0.8400092733314444, + "grad_norm": 0.27219441533088684, + "learning_rate": 1.6068487362348336e-05, + "loss": 0.15312207937240602, + "step": 195660 + }, + { + "epoch": 0.8400522054214643, + "grad_norm": 0.002600613981485367, + "learning_rate": 1.606417564223071e-05, + "loss": 0.03642718493938446, + "step": 195670 + }, + { + "epoch": 0.8400951375114843, + "grad_norm": 0.12953244149684906, + "learning_rate": 1.6059863922113088e-05, + "loss": 0.12930947542190552, + "step": 195680 + }, + { + "epoch": 0.8401380696015044, + "grad_norm": 0.0030175873544067144, + "learning_rate": 1.6055552201995465e-05, + "loss": 0.459410572052002, + "step": 195690 + }, + { + "epoch": 0.8401810016915243, + "grad_norm": 0.0005476188962347806, + "learning_rate": 1.6051240481877842e-05, + "loss": 0.2390958547592163, + "step": 195700 + }, + { + "epoch": 0.8402239337815444, + "grad_norm": 1.5651413202285767, + "learning_rate": 1.604692876176022e-05, + "loss": 0.08409489393234253, + "step": 195710 + }, + { + "epoch": 0.8402668658715644, + "grad_norm": 3.797452211380005, + "learning_rate": 1.6042617041642594e-05, + "loss": 0.22493295669555663, + "step": 195720 + }, + { + "epoch": 0.8403097979615843, + "grad_norm": 5.24019718170166, + "learning_rate": 1.603830532152497e-05, + "loss": 0.2507028579711914, + "step": 195730 + }, + { + "epoch": 0.8403527300516044, + "grad_norm": 0.00809667818248272, + "learning_rate": 1.6033993601407345e-05, + "loss": 0.20862414836883544, + "step": 195740 + }, + { + "epoch": 0.8403956621416244, + "grad_norm": 0.08349917829036713, + "learning_rate": 1.6029681881289722e-05, + "loss": 0.0961777925491333, + "step": 195750 + }, + { + "epoch": 0.8404385942316444, + "grad_norm": 0.03538598120212555, + "learning_rate": 1.60253701611721e-05, + "loss": 0.0851999580860138, + "step": 195760 + }, + { + "epoch": 0.8404815263216644, + "grad_norm": 3.1396093368530273, + "learning_rate": 1.6021058441054477e-05, + "loss": 0.2021549701690674, + "step": 195770 + }, + { + "epoch": 0.8405244584116844, + "grad_norm": 2.161872386932373, + "learning_rate": 1.601674672093685e-05, + "loss": 0.23405060768127442, + "step": 195780 + }, + { + "epoch": 0.8405673905017044, + "grad_norm": 0.05371826887130737, + "learning_rate": 1.601243500081923e-05, + "loss": 0.1200188398361206, + "step": 195790 + }, + { + "epoch": 0.8406103225917244, + "grad_norm": 0.006190289277583361, + "learning_rate": 1.6008123280701602e-05, + "loss": 0.30011117458343506, + "step": 195800 + }, + { + "epoch": 0.8406532546817445, + "grad_norm": 0.002181611256673932, + "learning_rate": 1.600381156058398e-05, + "loss": 0.33170261383056643, + "step": 195810 + }, + { + "epoch": 0.8406961867717644, + "grad_norm": 0.00914520863443613, + "learning_rate": 1.5999499840466357e-05, + "loss": 0.008389408886432647, + "step": 195820 + }, + { + "epoch": 0.8407391188617844, + "grad_norm": 0.03632459416985512, + "learning_rate": 1.5995188120348734e-05, + "loss": 0.287453818321228, + "step": 195830 + }, + { + "epoch": 0.8407820509518045, + "grad_norm": 0.5841567516326904, + "learning_rate": 1.5990876400231108e-05, + "loss": 0.12442935705184936, + "step": 195840 + }, + { + "epoch": 0.8408249830418244, + "grad_norm": 0.05322708189487457, + "learning_rate": 1.5986564680113486e-05, + "loss": 0.1795598268508911, + "step": 195850 + }, + { + "epoch": 0.8408679151318444, + "grad_norm": 0.018948446959257126, + "learning_rate": 1.598225295999586e-05, + "loss": 0.13776202201843263, + "step": 195860 + }, + { + "epoch": 0.8409108472218645, + "grad_norm": 2.212151050567627, + "learning_rate": 1.5977941239878237e-05, + "loss": 0.1870142936706543, + "step": 195870 + }, + { + "epoch": 0.8409537793118844, + "grad_norm": 1.0908949375152588, + "learning_rate": 1.5973629519760614e-05, + "loss": 0.12300001382827759, + "step": 195880 + }, + { + "epoch": 0.8409967114019045, + "grad_norm": 2.6546292304992676, + "learning_rate": 1.596931779964299e-05, + "loss": 0.0752610445022583, + "step": 195890 + }, + { + "epoch": 0.8410396434919245, + "grad_norm": 0.012228342704474926, + "learning_rate": 1.5965006079525366e-05, + "loss": 0.22464075088500976, + "step": 195900 + }, + { + "epoch": 0.8410825755819444, + "grad_norm": 0.1338385045528412, + "learning_rate": 1.5960694359407743e-05, + "loss": 0.36805989742279055, + "step": 195910 + }, + { + "epoch": 0.8411255076719645, + "grad_norm": 0.05296871438622475, + "learning_rate": 1.5956382639290117e-05, + "loss": 0.18148987293243407, + "step": 195920 + }, + { + "epoch": 0.8411684397619845, + "grad_norm": 0.025313997641205788, + "learning_rate": 1.5952070919172498e-05, + "loss": 0.13553870916366578, + "step": 195930 + }, + { + "epoch": 0.8412113718520045, + "grad_norm": 0.0009583273204043508, + "learning_rate": 1.594775919905487e-05, + "loss": 0.3775708436965942, + "step": 195940 + }, + { + "epoch": 0.8412543039420245, + "grad_norm": 0.012733984738588333, + "learning_rate": 1.594344747893725e-05, + "loss": 0.10729289054870605, + "step": 195950 + }, + { + "epoch": 0.8412972360320445, + "grad_norm": 0.0589638277888298, + "learning_rate": 1.5939135758819623e-05, + "loss": 0.29300096035003664, + "step": 195960 + }, + { + "epoch": 0.8413401681220645, + "grad_norm": 47.52212905883789, + "learning_rate": 1.5934824038702e-05, + "loss": 0.11700329780578614, + "step": 195970 + }, + { + "epoch": 0.8413831002120845, + "grad_norm": 0.08651348203420639, + "learning_rate": 1.5930512318584374e-05, + "loss": 0.119724440574646, + "step": 195980 + }, + { + "epoch": 0.8414260323021046, + "grad_norm": 0.017822356894612312, + "learning_rate": 1.5926200598466755e-05, + "loss": 0.12555168867111205, + "step": 195990 + }, + { + "epoch": 0.8414689643921245, + "grad_norm": 0.001900406088680029, + "learning_rate": 1.592188887834913e-05, + "loss": 0.24136440753936766, + "step": 196000 + }, + { + "epoch": 0.8414689643921245, + "eval_loss": 0.3772696256637573, + "eval_runtime": 27.5097, + "eval_samples_per_second": 3.635, + "eval_steps_per_second": 3.635, + "step": 196000 + }, + { + "epoch": 0.8415118964821445, + "grad_norm": 0.20596669614315033, + "learning_rate": 1.5917577158231506e-05, + "loss": 0.12156057357788086, + "step": 196010 + }, + { + "epoch": 0.8415548285721646, + "grad_norm": 0.022826924920082092, + "learning_rate": 1.591326543811388e-05, + "loss": 0.09349904656410217, + "step": 196020 + }, + { + "epoch": 0.8415977606621845, + "grad_norm": 4.478949546813965, + "learning_rate": 1.5908953717996257e-05, + "loss": 0.16890153884887696, + "step": 196030 + }, + { + "epoch": 0.8416406927522045, + "grad_norm": 0.10414303839206696, + "learning_rate": 1.5904641997878635e-05, + "loss": 0.25435285568237304, + "step": 196040 + }, + { + "epoch": 0.8416836248422246, + "grad_norm": 18.744157791137695, + "learning_rate": 1.5900330277761012e-05, + "loss": 0.23109426498413085, + "step": 196050 + }, + { + "epoch": 0.8417265569322446, + "grad_norm": 0.00422808388248086, + "learning_rate": 1.5896018557643386e-05, + "loss": 0.09207351803779602, + "step": 196060 + }, + { + "epoch": 0.8417694890222646, + "grad_norm": 0.0252239890396595, + "learning_rate": 1.5891706837525763e-05, + "loss": 0.13819221258163453, + "step": 196070 + }, + { + "epoch": 0.8418124211122846, + "grad_norm": 0.0008331200224347413, + "learning_rate": 1.588739511740814e-05, + "loss": 0.24390954971313478, + "step": 196080 + }, + { + "epoch": 0.8418553532023046, + "grad_norm": 0.004637080244719982, + "learning_rate": 1.5883083397290515e-05, + "loss": 0.13876519203186036, + "step": 196090 + }, + { + "epoch": 0.8418982852923246, + "grad_norm": 0.0025159171782433987, + "learning_rate": 1.5878771677172892e-05, + "loss": 0.3237619400024414, + "step": 196100 + }, + { + "epoch": 0.8419412173823446, + "grad_norm": 1.4971562623977661, + "learning_rate": 1.587445995705527e-05, + "loss": 0.08215521574020386, + "step": 196110 + }, + { + "epoch": 0.8419841494723647, + "grad_norm": 3.8147895336151123, + "learning_rate": 1.5870148236937647e-05, + "loss": 0.21302051544189454, + "step": 196120 + }, + { + "epoch": 0.8420270815623846, + "grad_norm": 4.908078193664551, + "learning_rate": 1.586583651682002e-05, + "loss": 0.5922693252563477, + "step": 196130 + }, + { + "epoch": 0.8420700136524046, + "grad_norm": 0.05550285801291466, + "learning_rate": 1.5861524796702398e-05, + "loss": 0.07058622241020203, + "step": 196140 + }, + { + "epoch": 0.8421129457424247, + "grad_norm": 0.3030736744403839, + "learning_rate": 1.5857213076584772e-05, + "loss": 0.36227562427520754, + "step": 196150 + }, + { + "epoch": 0.8421558778324446, + "grad_norm": 0.9248490929603577, + "learning_rate": 1.585290135646715e-05, + "loss": 0.18082973957061768, + "step": 196160 + }, + { + "epoch": 0.8421988099224647, + "grad_norm": 0.6012828946113586, + "learning_rate": 1.5848589636349527e-05, + "loss": 0.18957573175430298, + "step": 196170 + }, + { + "epoch": 0.8422417420124847, + "grad_norm": 0.40026846528053284, + "learning_rate": 1.5844277916231904e-05, + "loss": 0.1907409906387329, + "step": 196180 + }, + { + "epoch": 0.8422846741025046, + "grad_norm": 4.090051651000977, + "learning_rate": 1.5839966196114278e-05, + "loss": 0.063755863904953, + "step": 196190 + }, + { + "epoch": 0.8423276061925247, + "grad_norm": 0.009709478355944157, + "learning_rate": 1.5835654475996655e-05, + "loss": 0.3304955244064331, + "step": 196200 + }, + { + "epoch": 0.8423705382825447, + "grad_norm": 0.017408445477485657, + "learning_rate": 1.583134275587903e-05, + "loss": 0.4167612552642822, + "step": 196210 + }, + { + "epoch": 0.8424134703725646, + "grad_norm": 0.0060608345083892345, + "learning_rate": 1.5827031035761407e-05, + "loss": 0.1044541597366333, + "step": 196220 + }, + { + "epoch": 0.8424564024625847, + "grad_norm": 0.011498616077005863, + "learning_rate": 1.5822719315643784e-05, + "loss": 0.20374600887298583, + "step": 196230 + }, + { + "epoch": 0.8424993345526047, + "grad_norm": 0.00787246972322464, + "learning_rate": 1.581840759552616e-05, + "loss": 0.22661259174346923, + "step": 196240 + }, + { + "epoch": 0.8425422666426247, + "grad_norm": 4.431751728057861, + "learning_rate": 1.5814095875408535e-05, + "loss": 0.37505383491516114, + "step": 196250 + }, + { + "epoch": 0.8425851987326447, + "grad_norm": 0.006028730887919664, + "learning_rate": 1.5809784155290913e-05, + "loss": 0.19709838628768922, + "step": 196260 + }, + { + "epoch": 0.8426281308226647, + "grad_norm": 0.3250470757484436, + "learning_rate": 1.5805472435173287e-05, + "loss": 0.2291292667388916, + "step": 196270 + }, + { + "epoch": 0.8426710629126847, + "grad_norm": 0.46007418632507324, + "learning_rate": 1.5801160715055664e-05, + "loss": 0.19784101247787475, + "step": 196280 + }, + { + "epoch": 0.8427139950027047, + "grad_norm": 0.928049623966217, + "learning_rate": 1.579684899493804e-05, + "loss": 0.10087544918060302, + "step": 196290 + }, + { + "epoch": 0.8427569270927248, + "grad_norm": 0.02121254988014698, + "learning_rate": 1.579253727482042e-05, + "loss": 0.20783622264862062, + "step": 196300 + }, + { + "epoch": 0.8427998591827447, + "grad_norm": 0.1715366542339325, + "learning_rate": 1.5788225554702793e-05, + "loss": 0.1536109209060669, + "step": 196310 + }, + { + "epoch": 0.8428427912727647, + "grad_norm": 0.6590801477432251, + "learning_rate": 1.578391383458517e-05, + "loss": 0.1001272439956665, + "step": 196320 + }, + { + "epoch": 0.8428857233627848, + "grad_norm": 7.680621147155762, + "learning_rate": 1.5779602114467544e-05, + "loss": 0.15085554122924805, + "step": 196330 + }, + { + "epoch": 0.8429286554528047, + "grad_norm": 4.486323833465576, + "learning_rate": 1.5775290394349925e-05, + "loss": 0.27280516624450685, + "step": 196340 + }, + { + "epoch": 0.8429715875428248, + "grad_norm": 1.990494728088379, + "learning_rate": 1.57709786742323e-05, + "loss": 0.11515017747879028, + "step": 196350 + }, + { + "epoch": 0.8430145196328448, + "grad_norm": 1.1661758422851562, + "learning_rate": 1.5766666954114676e-05, + "loss": 0.23308405876159669, + "step": 196360 + }, + { + "epoch": 0.8430574517228647, + "grad_norm": 1.5656630992889404, + "learning_rate": 1.576235523399705e-05, + "loss": 0.18918917179107667, + "step": 196370 + }, + { + "epoch": 0.8431003838128848, + "grad_norm": 6.898838520050049, + "learning_rate": 1.5758043513879427e-05, + "loss": 0.1752528190612793, + "step": 196380 + }, + { + "epoch": 0.8431433159029048, + "grad_norm": 0.010424407199025154, + "learning_rate": 1.57537317937618e-05, + "loss": 0.3139214277267456, + "step": 196390 + }, + { + "epoch": 0.8431862479929247, + "grad_norm": 0.010615244507789612, + "learning_rate": 1.5749420073644182e-05, + "loss": 0.21709723472595216, + "step": 196400 + }, + { + "epoch": 0.8432291800829448, + "grad_norm": 1.2221513986587524, + "learning_rate": 1.5745108353526556e-05, + "loss": 0.08752541542053223, + "step": 196410 + }, + { + "epoch": 0.8432721121729648, + "grad_norm": 0.0036838948726654053, + "learning_rate": 1.5740796633408933e-05, + "loss": 0.15428515672683715, + "step": 196420 + }, + { + "epoch": 0.8433150442629848, + "grad_norm": 1.6186450719833374, + "learning_rate": 1.5736484913291307e-05, + "loss": 0.019905810058116914, + "step": 196430 + }, + { + "epoch": 0.8433579763530048, + "grad_norm": 0.002743236254900694, + "learning_rate": 1.5732173193173684e-05, + "loss": 0.03064911365509033, + "step": 196440 + }, + { + "epoch": 0.8434009084430248, + "grad_norm": 0.009198924526572227, + "learning_rate": 1.5727861473056062e-05, + "loss": 0.030152544379234314, + "step": 196450 + }, + { + "epoch": 0.8434438405330448, + "grad_norm": 0.9507923722267151, + "learning_rate": 1.572354975293844e-05, + "loss": 0.42665905952453614, + "step": 196460 + }, + { + "epoch": 0.8434867726230648, + "grad_norm": 0.0016342259477823973, + "learning_rate": 1.5719238032820817e-05, + "loss": 0.21152596473693847, + "step": 196470 + }, + { + "epoch": 0.8435297047130849, + "grad_norm": 0.17750652134418488, + "learning_rate": 1.571492631270319e-05, + "loss": 0.14839037656784057, + "step": 196480 + }, + { + "epoch": 0.8435726368031049, + "grad_norm": 3.569044589996338, + "learning_rate": 1.5710614592585568e-05, + "loss": 0.19094862937927246, + "step": 196490 + }, + { + "epoch": 0.8436155688931248, + "grad_norm": 0.27529528737068176, + "learning_rate": 1.5706302872467942e-05, + "loss": 0.2316378116607666, + "step": 196500 + }, + { + "epoch": 0.8436585009831449, + "grad_norm": 0.0013704081065952778, + "learning_rate": 1.570199115235032e-05, + "loss": 0.03895947635173798, + "step": 196510 + }, + { + "epoch": 0.8437014330731649, + "grad_norm": 0.020330656319856644, + "learning_rate": 1.5697679432232696e-05, + "loss": 0.04956190586090088, + "step": 196520 + }, + { + "epoch": 0.8437443651631849, + "grad_norm": 5.464777946472168, + "learning_rate": 1.5693367712115074e-05, + "loss": 0.30693516731262205, + "step": 196530 + }, + { + "epoch": 0.8437872972532049, + "grad_norm": 0.006584883667528629, + "learning_rate": 1.5689055991997448e-05, + "loss": 0.157961905002594, + "step": 196540 + }, + { + "epoch": 0.8438302293432249, + "grad_norm": 0.004959666635841131, + "learning_rate": 1.5684744271879825e-05, + "loss": 0.1256941080093384, + "step": 196550 + }, + { + "epoch": 0.8438731614332449, + "grad_norm": 0.0028481590561568737, + "learning_rate": 1.56804325517622e-05, + "loss": 0.011361487209796906, + "step": 196560 + }, + { + "epoch": 0.8439160935232649, + "grad_norm": 0.01640136167407036, + "learning_rate": 1.5676120831644576e-05, + "loss": 0.04784930944442749, + "step": 196570 + }, + { + "epoch": 0.843959025613285, + "grad_norm": 2.794304847717285, + "learning_rate": 1.5671809111526954e-05, + "loss": 0.22906594276428222, + "step": 196580 + }, + { + "epoch": 0.8440019577033049, + "grad_norm": 2.3926756381988525, + "learning_rate": 1.566749739140933e-05, + "loss": 0.15934289693832399, + "step": 196590 + }, + { + "epoch": 0.8440448897933249, + "grad_norm": 0.094965860247612, + "learning_rate": 1.5663185671291705e-05, + "loss": 0.13832385540008546, + "step": 196600 + }, + { + "epoch": 0.844087821883345, + "grad_norm": 1.1906780004501343, + "learning_rate": 1.5658873951174082e-05, + "loss": 0.2520163536071777, + "step": 196610 + }, + { + "epoch": 0.8441307539733649, + "grad_norm": 0.026045776903629303, + "learning_rate": 1.5654562231056456e-05, + "loss": 0.22062888145446777, + "step": 196620 + }, + { + "epoch": 0.8441736860633849, + "grad_norm": 0.009660093113780022, + "learning_rate": 1.5650250510938834e-05, + "loss": 0.03899048864841461, + "step": 196630 + }, + { + "epoch": 0.844216618153405, + "grad_norm": 0.9208100438117981, + "learning_rate": 1.564593879082121e-05, + "loss": 0.33248515129089357, + "step": 196640 + }, + { + "epoch": 0.8442595502434249, + "grad_norm": 0.24865712225437164, + "learning_rate": 1.564162707070359e-05, + "loss": 0.16582545042037963, + "step": 196650 + }, + { + "epoch": 0.844302482333445, + "grad_norm": 0.021726198494434357, + "learning_rate": 1.5637315350585962e-05, + "loss": 0.15123635530471802, + "step": 196660 + }, + { + "epoch": 0.844345414423465, + "grad_norm": 0.06716220825910568, + "learning_rate": 1.563300363046834e-05, + "loss": 0.20972583293914795, + "step": 196670 + }, + { + "epoch": 0.8443883465134849, + "grad_norm": 3.435544729232788, + "learning_rate": 1.5628691910350714e-05, + "loss": 0.07267772555351257, + "step": 196680 + }, + { + "epoch": 0.844431278603505, + "grad_norm": 0.06330103427171707, + "learning_rate": 1.5624380190233094e-05, + "loss": 0.10410884618759156, + "step": 196690 + }, + { + "epoch": 0.844474210693525, + "grad_norm": 0.06830105185508728, + "learning_rate": 1.562006847011547e-05, + "loss": 0.15068842172622682, + "step": 196700 + }, + { + "epoch": 0.844517142783545, + "grad_norm": 7.13935661315918, + "learning_rate": 1.5615756749997846e-05, + "loss": 0.34758007526397705, + "step": 196710 + }, + { + "epoch": 0.844560074873565, + "grad_norm": 0.0010626811999827623, + "learning_rate": 1.561144502988022e-05, + "loss": 0.22543506622314452, + "step": 196720 + }, + { + "epoch": 0.844603006963585, + "grad_norm": 0.6307433843612671, + "learning_rate": 1.5607133309762597e-05, + "loss": 0.014995664358139038, + "step": 196730 + }, + { + "epoch": 0.844645939053605, + "grad_norm": 14.586715698242188, + "learning_rate": 1.560282158964497e-05, + "loss": 0.27491576671600343, + "step": 196740 + }, + { + "epoch": 0.844688871143625, + "grad_norm": 0.30766063928604126, + "learning_rate": 1.559850986952735e-05, + "loss": 0.27586333751678466, + "step": 196750 + }, + { + "epoch": 0.844731803233645, + "grad_norm": 0.016385281458497047, + "learning_rate": 1.5594198149409726e-05, + "loss": 0.27432804107666015, + "step": 196760 + }, + { + "epoch": 0.844774735323665, + "grad_norm": 0.1537727415561676, + "learning_rate": 1.5589886429292103e-05, + "loss": 0.1805056095123291, + "step": 196770 + }, + { + "epoch": 0.844817667413685, + "grad_norm": 5.115199089050293, + "learning_rate": 1.5585574709174477e-05, + "loss": 0.2694607973098755, + "step": 196780 + }, + { + "epoch": 0.8448605995037051, + "grad_norm": 0.0011792482109740376, + "learning_rate": 1.5581262989056854e-05, + "loss": 0.21872477531433104, + "step": 196790 + }, + { + "epoch": 0.844903531593725, + "grad_norm": 0.26486822962760925, + "learning_rate": 1.557695126893923e-05, + "loss": 0.15273507833480834, + "step": 196800 + }, + { + "epoch": 0.844946463683745, + "grad_norm": 0.014987935312092304, + "learning_rate": 1.557263954882161e-05, + "loss": 0.1194075584411621, + "step": 196810 + }, + { + "epoch": 0.8449893957737651, + "grad_norm": 0.001741683459840715, + "learning_rate": 1.5568327828703986e-05, + "loss": 0.1621376872062683, + "step": 196820 + }, + { + "epoch": 0.845032327863785, + "grad_norm": 0.01641698181629181, + "learning_rate": 1.556401610858636e-05, + "loss": 0.08461439609527588, + "step": 196830 + }, + { + "epoch": 0.8450752599538051, + "grad_norm": 0.2613827884197235, + "learning_rate": 1.5559704388468738e-05, + "loss": 0.02929074168205261, + "step": 196840 + }, + { + "epoch": 0.8451181920438251, + "grad_norm": 3.5483486652374268, + "learning_rate": 1.555539266835111e-05, + "loss": 0.24586338996887208, + "step": 196850 + }, + { + "epoch": 0.845161124133845, + "grad_norm": 0.2233700454235077, + "learning_rate": 1.555108094823349e-05, + "loss": 0.5050179958343506, + "step": 196860 + }, + { + "epoch": 0.8452040562238651, + "grad_norm": 0.036731135100126266, + "learning_rate": 1.5546769228115866e-05, + "loss": 0.17513363361358641, + "step": 196870 + }, + { + "epoch": 0.8452469883138851, + "grad_norm": 0.8537135124206543, + "learning_rate": 1.5542457507998244e-05, + "loss": 0.10908613204956055, + "step": 196880 + }, + { + "epoch": 0.845289920403905, + "grad_norm": 0.018275853246450424, + "learning_rate": 1.5538145787880617e-05, + "loss": 0.19336570501327516, + "step": 196890 + }, + { + "epoch": 0.8453328524939251, + "grad_norm": 0.08329416811466217, + "learning_rate": 1.5533834067762995e-05, + "loss": 0.14187638759613036, + "step": 196900 + }, + { + "epoch": 0.8453757845839451, + "grad_norm": 0.01114792563021183, + "learning_rate": 1.552952234764537e-05, + "loss": 0.10525352954864502, + "step": 196910 + }, + { + "epoch": 0.8454187166739652, + "grad_norm": 0.3175894320011139, + "learning_rate": 1.5525210627527746e-05, + "loss": 0.05137323141098023, + "step": 196920 + }, + { + "epoch": 0.8454616487639851, + "grad_norm": 0.008936109021306038, + "learning_rate": 1.5520898907410123e-05, + "loss": 0.08222769498825074, + "step": 196930 + }, + { + "epoch": 0.8455045808540051, + "grad_norm": 7.562148094177246, + "learning_rate": 1.55165871872925e-05, + "loss": 0.30330629348754884, + "step": 196940 + }, + { + "epoch": 0.8455475129440252, + "grad_norm": 0.37392285466194153, + "learning_rate": 1.5512275467174875e-05, + "loss": 0.12386249303817749, + "step": 196950 + }, + { + "epoch": 0.8455904450340451, + "grad_norm": 7.58154296875, + "learning_rate": 1.5507963747057252e-05, + "loss": 0.15494595766067504, + "step": 196960 + }, + { + "epoch": 0.8456333771240652, + "grad_norm": 3.248518466949463, + "learning_rate": 1.5503652026939626e-05, + "loss": 0.39366850852966306, + "step": 196970 + }, + { + "epoch": 0.8456763092140852, + "grad_norm": 4.586677074432373, + "learning_rate": 1.5499340306822003e-05, + "loss": 0.1424519896507263, + "step": 196980 + }, + { + "epoch": 0.8457192413041051, + "grad_norm": 3.7495882511138916, + "learning_rate": 1.549502858670438e-05, + "loss": 0.027593034505844116, + "step": 196990 + }, + { + "epoch": 0.8457621733941252, + "grad_norm": 0.5563340783119202, + "learning_rate": 1.5490716866586758e-05, + "loss": 0.19862442016601561, + "step": 197000 + }, + { + "epoch": 0.8457621733941252, + "eval_loss": 0.3774981200695038, + "eval_runtime": 27.4864, + "eval_samples_per_second": 3.638, + "eval_steps_per_second": 3.638, + "step": 197000 + }, + { + "epoch": 0.8458051054841452, + "grad_norm": 42.356590270996094, + "learning_rate": 1.5486405146469132e-05, + "loss": 0.38418533802032473, + "step": 197010 + }, + { + "epoch": 0.8458480375741652, + "grad_norm": 0.005917608272284269, + "learning_rate": 1.548209342635151e-05, + "loss": 0.12943546772003173, + "step": 197020 + }, + { + "epoch": 0.8458909696641852, + "grad_norm": 0.477541446685791, + "learning_rate": 1.5477781706233883e-05, + "loss": 0.1763970136642456, + "step": 197030 + }, + { + "epoch": 0.8459339017542052, + "grad_norm": 1.4189867973327637, + "learning_rate": 1.5473469986116264e-05, + "loss": 0.27072913646698, + "step": 197040 + }, + { + "epoch": 0.8459768338442252, + "grad_norm": 1.8773818016052246, + "learning_rate": 1.5469158265998638e-05, + "loss": 0.44829769134521485, + "step": 197050 + }, + { + "epoch": 0.8460197659342452, + "grad_norm": 0.03911508619785309, + "learning_rate": 1.5464846545881015e-05, + "loss": 0.12318435907363892, + "step": 197060 + }, + { + "epoch": 0.8460626980242653, + "grad_norm": 0.0011162409791722894, + "learning_rate": 1.546053482576339e-05, + "loss": 0.19681681394577027, + "step": 197070 + }, + { + "epoch": 0.8461056301142852, + "grad_norm": 0.033100008964538574, + "learning_rate": 1.5456223105645767e-05, + "loss": 0.15288090705871582, + "step": 197080 + }, + { + "epoch": 0.8461485622043052, + "grad_norm": 0.04976103827357292, + "learning_rate": 1.545191138552814e-05, + "loss": 0.14306716918945311, + "step": 197090 + }, + { + "epoch": 0.8461914942943253, + "grad_norm": 1.2336324453353882, + "learning_rate": 1.544759966541052e-05, + "loss": 0.13660180568695068, + "step": 197100 + }, + { + "epoch": 0.8462344263843452, + "grad_norm": 0.0031069640535861254, + "learning_rate": 1.5443287945292895e-05, + "loss": 0.1620272159576416, + "step": 197110 + }, + { + "epoch": 0.8462773584743652, + "grad_norm": 0.04594505578279495, + "learning_rate": 1.5438976225175273e-05, + "loss": 0.0735178291797638, + "step": 197120 + }, + { + "epoch": 0.8463202905643853, + "grad_norm": 3.394017219543457, + "learning_rate": 1.5434664505057647e-05, + "loss": 0.20956742763519287, + "step": 197130 + }, + { + "epoch": 0.8463632226544052, + "grad_norm": 4.642988204956055, + "learning_rate": 1.5430352784940024e-05, + "loss": 0.24042127132415772, + "step": 197140 + }, + { + "epoch": 0.8464061547444253, + "grad_norm": 1.316240668296814, + "learning_rate": 1.54260410648224e-05, + "loss": 0.13471782207489014, + "step": 197150 + }, + { + "epoch": 0.8464490868344453, + "grad_norm": 1.475600004196167, + "learning_rate": 1.542172934470478e-05, + "loss": 0.2492267370223999, + "step": 197160 + }, + { + "epoch": 0.8464920189244652, + "grad_norm": 0.02170061506330967, + "learning_rate": 1.5417417624587156e-05, + "loss": 0.2634519100189209, + "step": 197170 + }, + { + "epoch": 0.8465349510144853, + "grad_norm": 5.417356491088867, + "learning_rate": 1.541310590446953e-05, + "loss": 0.12121155261993408, + "step": 197180 + }, + { + "epoch": 0.8465778831045053, + "grad_norm": 0.09285827726125717, + "learning_rate": 1.5408794184351907e-05, + "loss": 0.12122727632522583, + "step": 197190 + }, + { + "epoch": 0.8466208151945253, + "grad_norm": 0.007813221774995327, + "learning_rate": 1.540448246423428e-05, + "loss": 0.098856121301651, + "step": 197200 + }, + { + "epoch": 0.8466637472845453, + "grad_norm": 0.2953546643257141, + "learning_rate": 1.540017074411666e-05, + "loss": 0.10793371200561523, + "step": 197210 + }, + { + "epoch": 0.8467066793745653, + "grad_norm": 1.193633794784546, + "learning_rate": 1.5395859023999036e-05, + "loss": 0.30551395416259763, + "step": 197220 + }, + { + "epoch": 0.8467496114645853, + "grad_norm": 0.010783948004245758, + "learning_rate": 1.5391547303881413e-05, + "loss": 0.23938562870025634, + "step": 197230 + }, + { + "epoch": 0.8467925435546053, + "grad_norm": 0.0030303297098726034, + "learning_rate": 1.5387235583763787e-05, + "loss": 0.17464864253997803, + "step": 197240 + }, + { + "epoch": 0.8468354756446254, + "grad_norm": 2.3788888454437256, + "learning_rate": 1.5382923863646165e-05, + "loss": 0.23440778255462646, + "step": 197250 + }, + { + "epoch": 0.8468784077346453, + "grad_norm": 0.002660320606082678, + "learning_rate": 1.537861214352854e-05, + "loss": 0.23806724548339844, + "step": 197260 + }, + { + "epoch": 0.8469213398246653, + "grad_norm": 0.014182024635374546, + "learning_rate": 1.5374300423410916e-05, + "loss": 0.10502651929855347, + "step": 197270 + }, + { + "epoch": 0.8469642719146854, + "grad_norm": 0.03121148981153965, + "learning_rate": 1.5369988703293293e-05, + "loss": 0.2541842222213745, + "step": 197280 + }, + { + "epoch": 0.8470072040047053, + "grad_norm": 1.580581545829773, + "learning_rate": 1.536567698317567e-05, + "loss": 0.22803409099578859, + "step": 197290 + }, + { + "epoch": 0.8470501360947253, + "grad_norm": 0.006081053521484137, + "learning_rate": 1.5361365263058045e-05, + "loss": 0.1805238366127014, + "step": 197300 + }, + { + "epoch": 0.8470930681847454, + "grad_norm": 0.1482483446598053, + "learning_rate": 1.5357053542940422e-05, + "loss": 0.24855918884277345, + "step": 197310 + }, + { + "epoch": 0.8471360002747653, + "grad_norm": 0.010404618456959724, + "learning_rate": 1.5352741822822796e-05, + "loss": 0.1581823706626892, + "step": 197320 + }, + { + "epoch": 0.8471789323647854, + "grad_norm": 0.5020700097084045, + "learning_rate": 1.5348430102705173e-05, + "loss": 0.2955003023147583, + "step": 197330 + }, + { + "epoch": 0.8472218644548054, + "grad_norm": 0.04875797778367996, + "learning_rate": 1.534411838258755e-05, + "loss": 0.17217621803283692, + "step": 197340 + }, + { + "epoch": 0.8472647965448254, + "grad_norm": 0.12947072088718414, + "learning_rate": 1.5339806662469928e-05, + "loss": 0.23299505710601806, + "step": 197350 + }, + { + "epoch": 0.8473077286348454, + "grad_norm": 0.00724436342716217, + "learning_rate": 1.5335494942352302e-05, + "loss": 0.16534049510955812, + "step": 197360 + }, + { + "epoch": 0.8473506607248654, + "grad_norm": 0.017508773133158684, + "learning_rate": 1.533118322223468e-05, + "loss": 0.15059289932250977, + "step": 197370 + }, + { + "epoch": 0.8473935928148855, + "grad_norm": 0.17189420759677887, + "learning_rate": 1.5326871502117053e-05, + "loss": 0.2993730306625366, + "step": 197380 + }, + { + "epoch": 0.8474365249049054, + "grad_norm": 0.10093765705823898, + "learning_rate": 1.532255978199943e-05, + "loss": 0.15821362733840943, + "step": 197390 + }, + { + "epoch": 0.8474794569949254, + "grad_norm": 0.009381674230098724, + "learning_rate": 1.5318248061881808e-05, + "loss": 0.11810331344604492, + "step": 197400 + }, + { + "epoch": 0.8475223890849455, + "grad_norm": 2.480515956878662, + "learning_rate": 1.5313936341764185e-05, + "loss": 0.15523053407669068, + "step": 197410 + }, + { + "epoch": 0.8475653211749654, + "grad_norm": 3.614999532699585, + "learning_rate": 1.530962462164656e-05, + "loss": 0.15576525926589965, + "step": 197420 + }, + { + "epoch": 0.8476082532649855, + "grad_norm": 65.64250183105469, + "learning_rate": 1.5305312901528936e-05, + "loss": 0.19414908885955812, + "step": 197430 + }, + { + "epoch": 0.8476511853550055, + "grad_norm": 0.489206999540329, + "learning_rate": 1.530100118141131e-05, + "loss": 0.32179012298583987, + "step": 197440 + }, + { + "epoch": 0.8476941174450254, + "grad_norm": 0.11385396867990494, + "learning_rate": 1.529668946129369e-05, + "loss": 0.20681419372558593, + "step": 197450 + }, + { + "epoch": 0.8477370495350455, + "grad_norm": 0.11933526396751404, + "learning_rate": 1.5292377741176065e-05, + "loss": 0.14109406471252442, + "step": 197460 + }, + { + "epoch": 0.8477799816250655, + "grad_norm": 0.002649921691045165, + "learning_rate": 1.5288066021058442e-05, + "loss": 0.17448320388793945, + "step": 197470 + }, + { + "epoch": 0.8478229137150854, + "grad_norm": 2.262589931488037, + "learning_rate": 1.5283754300940816e-05, + "loss": 0.07583979964256286, + "step": 197480 + }, + { + "epoch": 0.8478658458051055, + "grad_norm": 0.14377081394195557, + "learning_rate": 1.5279442580823194e-05, + "loss": 0.3324470043182373, + "step": 197490 + }, + { + "epoch": 0.8479087778951255, + "grad_norm": 0.023049483075737953, + "learning_rate": 1.5275130860705568e-05, + "loss": 0.09699010848999023, + "step": 197500 + }, + { + "epoch": 0.8479517099851455, + "grad_norm": 0.047775398939847946, + "learning_rate": 1.527081914058795e-05, + "loss": 0.13095303773880004, + "step": 197510 + }, + { + "epoch": 0.8479946420751655, + "grad_norm": 1.631107211112976, + "learning_rate": 1.5266507420470322e-05, + "loss": 0.29105587005615235, + "step": 197520 + }, + { + "epoch": 0.8480375741651855, + "grad_norm": 0.03031880035996437, + "learning_rate": 1.52621957003527e-05, + "loss": 0.2267350435256958, + "step": 197530 + }, + { + "epoch": 0.8480805062552055, + "grad_norm": 0.4916745722293854, + "learning_rate": 1.5257883980235077e-05, + "loss": 0.14272670745849608, + "step": 197540 + }, + { + "epoch": 0.8481234383452255, + "grad_norm": 0.009396664798259735, + "learning_rate": 1.5253572260117453e-05, + "loss": 0.03404759168624878, + "step": 197550 + }, + { + "epoch": 0.8481663704352456, + "grad_norm": 6.043698310852051, + "learning_rate": 1.524926053999983e-05, + "loss": 0.2500048875808716, + "step": 197560 + }, + { + "epoch": 0.8482093025252655, + "grad_norm": 0.02857845462858677, + "learning_rate": 1.5244948819882204e-05, + "loss": 0.08116910457611085, + "step": 197570 + }, + { + "epoch": 0.8482522346152855, + "grad_norm": 4.6327128410339355, + "learning_rate": 1.5240637099764581e-05, + "loss": 0.3209220886230469, + "step": 197580 + }, + { + "epoch": 0.8482951667053056, + "grad_norm": 0.36840423941612244, + "learning_rate": 1.5236325379646957e-05, + "loss": 0.18857355117797853, + "step": 197590 + }, + { + "epoch": 0.8483380987953255, + "grad_norm": 0.011004694737493992, + "learning_rate": 1.5232013659529334e-05, + "loss": 0.0036482542753219606, + "step": 197600 + }, + { + "epoch": 0.8483810308853456, + "grad_norm": 0.09082172065973282, + "learning_rate": 1.522770193941171e-05, + "loss": 0.2099597215652466, + "step": 197610 + }, + { + "epoch": 0.8484239629753656, + "grad_norm": 1.6527332067489624, + "learning_rate": 1.5223390219294087e-05, + "loss": 0.40575323104858396, + "step": 197620 + }, + { + "epoch": 0.8484668950653855, + "grad_norm": 3.2181715965270996, + "learning_rate": 1.5219078499176461e-05, + "loss": 0.11115305423736573, + "step": 197630 + }, + { + "epoch": 0.8485098271554056, + "grad_norm": 0.1894989311695099, + "learning_rate": 1.5214766779058839e-05, + "loss": 0.2039348840713501, + "step": 197640 + }, + { + "epoch": 0.8485527592454256, + "grad_norm": 1.8448779582977295, + "learning_rate": 1.5210455058941214e-05, + "loss": 0.2965463399887085, + "step": 197650 + }, + { + "epoch": 0.8485956913354455, + "grad_norm": 0.028072591871023178, + "learning_rate": 1.5206143338823592e-05, + "loss": 0.144870388507843, + "step": 197660 + }, + { + "epoch": 0.8486386234254656, + "grad_norm": 0.006217554677277803, + "learning_rate": 1.5201831618705967e-05, + "loss": 0.03407878577709198, + "step": 197670 + }, + { + "epoch": 0.8486815555154856, + "grad_norm": 0.029794985428452492, + "learning_rate": 1.5197519898588345e-05, + "loss": 0.1508237600326538, + "step": 197680 + }, + { + "epoch": 0.8487244876055056, + "grad_norm": 1.7843852043151855, + "learning_rate": 1.5193208178470719e-05, + "loss": 0.20659241676330567, + "step": 197690 + }, + { + "epoch": 0.8487674196955256, + "grad_norm": 0.2533365488052368, + "learning_rate": 1.5188896458353098e-05, + "loss": 0.22586512565612793, + "step": 197700 + }, + { + "epoch": 0.8488103517855456, + "grad_norm": 0.00228360784240067, + "learning_rate": 1.5184584738235472e-05, + "loss": 0.3032339572906494, + "step": 197710 + }, + { + "epoch": 0.8488532838755656, + "grad_norm": 0.0023056359495967627, + "learning_rate": 1.5180273018117849e-05, + "loss": 0.15963205099105834, + "step": 197720 + }, + { + "epoch": 0.8488962159655856, + "grad_norm": 26.590858459472656, + "learning_rate": 1.5175961298000225e-05, + "loss": 0.18662004470825194, + "step": 197730 + }, + { + "epoch": 0.8489391480556057, + "grad_norm": 0.003507897723466158, + "learning_rate": 1.5171649577882602e-05, + "loss": 0.13641457557678222, + "step": 197740 + }, + { + "epoch": 0.8489820801456256, + "grad_norm": 0.008930052630603313, + "learning_rate": 1.5167337857764976e-05, + "loss": 0.23413097858428955, + "step": 197750 + }, + { + "epoch": 0.8490250122356456, + "grad_norm": 0.057356588542461395, + "learning_rate": 1.5163026137647355e-05, + "loss": 0.04057014882564545, + "step": 197760 + }, + { + "epoch": 0.8490679443256657, + "grad_norm": 1.1264686584472656, + "learning_rate": 1.5158714417529729e-05, + "loss": 0.11325465440750122, + "step": 197770 + }, + { + "epoch": 0.8491108764156857, + "grad_norm": 0.25341594219207764, + "learning_rate": 1.5154402697412106e-05, + "loss": 0.09531864523887634, + "step": 197780 + }, + { + "epoch": 0.8491538085057057, + "grad_norm": 0.05748600512742996, + "learning_rate": 1.5150090977294482e-05, + "loss": 0.17363308668136596, + "step": 197790 + }, + { + "epoch": 0.8491967405957257, + "grad_norm": 0.07798365503549576, + "learning_rate": 1.5145779257176859e-05, + "loss": 0.1146618366241455, + "step": 197800 + }, + { + "epoch": 0.8492396726857457, + "grad_norm": 0.015463123098015785, + "learning_rate": 1.5141467537059235e-05, + "loss": 0.0030902113765478136, + "step": 197810 + }, + { + "epoch": 0.8492826047757657, + "grad_norm": 0.07422202825546265, + "learning_rate": 1.5137155816941612e-05, + "loss": 0.23043811321258545, + "step": 197820 + }, + { + "epoch": 0.8493255368657857, + "grad_norm": 0.3136723041534424, + "learning_rate": 1.5132844096823986e-05, + "loss": 0.10654685497283936, + "step": 197830 + }, + { + "epoch": 0.8493684689558058, + "grad_norm": 0.011924277059733868, + "learning_rate": 1.5128532376706363e-05, + "loss": 0.32079641819000243, + "step": 197840 + }, + { + "epoch": 0.8494114010458257, + "grad_norm": 0.002160630887374282, + "learning_rate": 1.5124220656588739e-05, + "loss": 0.09690849184989929, + "step": 197850 + }, + { + "epoch": 0.8494543331358457, + "grad_norm": 4.282308101654053, + "learning_rate": 1.5119908936471116e-05, + "loss": 0.3771207809448242, + "step": 197860 + }, + { + "epoch": 0.8494972652258658, + "grad_norm": 0.034167733043432236, + "learning_rate": 1.5115597216353492e-05, + "loss": 0.05973265767097473, + "step": 197870 + }, + { + "epoch": 0.8495401973158857, + "grad_norm": 2.4957375526428223, + "learning_rate": 1.511128549623587e-05, + "loss": 0.18202605247497558, + "step": 197880 + }, + { + "epoch": 0.8495831294059057, + "grad_norm": 0.14000332355499268, + "learning_rate": 1.5106973776118247e-05, + "loss": 0.16483761072158815, + "step": 197890 + }, + { + "epoch": 0.8496260614959258, + "grad_norm": 6.224600791931152, + "learning_rate": 1.5102662056000622e-05, + "loss": 0.4294785499572754, + "step": 197900 + }, + { + "epoch": 0.8496689935859457, + "grad_norm": 0.027004102244973183, + "learning_rate": 1.5098350335883e-05, + "loss": 0.0069320306181907656, + "step": 197910 + }, + { + "epoch": 0.8497119256759658, + "grad_norm": 0.08216096460819244, + "learning_rate": 1.5094038615765374e-05, + "loss": 0.0045673668384552, + "step": 197920 + }, + { + "epoch": 0.8497548577659858, + "grad_norm": 0.07297103106975555, + "learning_rate": 1.5089726895647751e-05, + "loss": 0.26889212131500245, + "step": 197930 + }, + { + "epoch": 0.8497977898560057, + "grad_norm": 0.006628489587455988, + "learning_rate": 1.5085415175530127e-05, + "loss": 0.2143918514251709, + "step": 197940 + }, + { + "epoch": 0.8498407219460258, + "grad_norm": 0.0073399050161242485, + "learning_rate": 1.5081103455412504e-05, + "loss": 0.1867246985435486, + "step": 197950 + }, + { + "epoch": 0.8498836540360458, + "grad_norm": 1.6105217933654785, + "learning_rate": 1.507679173529488e-05, + "loss": 0.10490700006484985, + "step": 197960 + }, + { + "epoch": 0.8499265861260658, + "grad_norm": 1.9887235164642334, + "learning_rate": 1.5072480015177257e-05, + "loss": 0.30768733024597167, + "step": 197970 + }, + { + "epoch": 0.8499695182160858, + "grad_norm": 1.3068478107452393, + "learning_rate": 1.5068168295059631e-05, + "loss": 0.2877668857574463, + "step": 197980 + }, + { + "epoch": 0.8500124503061058, + "grad_norm": 0.037830330431461334, + "learning_rate": 1.5063856574942008e-05, + "loss": 0.40204415321350095, + "step": 197990 + }, + { + "epoch": 0.8500553823961258, + "grad_norm": 0.6198349595069885, + "learning_rate": 1.5059544854824384e-05, + "loss": 0.21968297958374022, + "step": 198000 + }, + { + "epoch": 0.8500553823961258, + "eval_loss": 0.37510281801223755, + "eval_runtime": 27.474, + "eval_samples_per_second": 3.64, + "eval_steps_per_second": 3.64, + "step": 198000 + }, + { + "epoch": 0.8500983144861458, + "grad_norm": 1.1435520648956299, + "learning_rate": 1.5055233134706761e-05, + "loss": 0.0789073646068573, + "step": 198010 + }, + { + "epoch": 0.8501412465761659, + "grad_norm": 0.03565061092376709, + "learning_rate": 1.5050921414589137e-05, + "loss": 0.3191929578781128, + "step": 198020 + }, + { + "epoch": 0.8501841786661858, + "grad_norm": 7.397432804107666, + "learning_rate": 1.5046609694471514e-05, + "loss": 0.2238842487335205, + "step": 198030 + }, + { + "epoch": 0.8502271107562058, + "grad_norm": 0.06873110681772232, + "learning_rate": 1.5042297974353888e-05, + "loss": 0.21624131202697755, + "step": 198040 + }, + { + "epoch": 0.8502700428462259, + "grad_norm": 2.253124952316284, + "learning_rate": 1.5037986254236267e-05, + "loss": 0.28505237102508546, + "step": 198050 + }, + { + "epoch": 0.8503129749362458, + "grad_norm": 4.617161273956299, + "learning_rate": 1.5033674534118641e-05, + "loss": 0.041191473603248596, + "step": 198060 + }, + { + "epoch": 0.8503559070262658, + "grad_norm": 0.4110824167728424, + "learning_rate": 1.5029362814001019e-05, + "loss": 0.1780386209487915, + "step": 198070 + }, + { + "epoch": 0.8503988391162859, + "grad_norm": 0.0656173974275589, + "learning_rate": 1.5025051093883394e-05, + "loss": 0.01941981315612793, + "step": 198080 + }, + { + "epoch": 0.8504417712063058, + "grad_norm": 0.017897628247737885, + "learning_rate": 1.5020739373765772e-05, + "loss": 0.27413904666900635, + "step": 198090 + }, + { + "epoch": 0.8504847032963259, + "grad_norm": 0.0807955265045166, + "learning_rate": 1.5016427653648146e-05, + "loss": 0.46311440467834475, + "step": 198100 + }, + { + "epoch": 0.8505276353863459, + "grad_norm": 0.0031225387938320637, + "learning_rate": 1.5012115933530525e-05, + "loss": 0.28832767009735105, + "step": 198110 + }, + { + "epoch": 0.8505705674763658, + "grad_norm": 0.026935061439871788, + "learning_rate": 1.5007804213412899e-05, + "loss": 0.025250527262687682, + "step": 198120 + }, + { + "epoch": 0.8506134995663859, + "grad_norm": 0.037563394755125046, + "learning_rate": 1.5003492493295276e-05, + "loss": 0.3608541965484619, + "step": 198130 + }, + { + "epoch": 0.8506564316564059, + "grad_norm": 0.007384961470961571, + "learning_rate": 1.4999180773177652e-05, + "loss": 0.16276904344558715, + "step": 198140 + }, + { + "epoch": 0.8506993637464259, + "grad_norm": 1.0036953687667847, + "learning_rate": 1.4994869053060029e-05, + "loss": 0.1697608470916748, + "step": 198150 + }, + { + "epoch": 0.8507422958364459, + "grad_norm": 4.348733425140381, + "learning_rate": 1.4990557332942405e-05, + "loss": 0.33487553596496583, + "step": 198160 + }, + { + "epoch": 0.8507852279264659, + "grad_norm": 0.004956691060215235, + "learning_rate": 1.4986245612824782e-05, + "loss": 0.12451618909835815, + "step": 198170 + }, + { + "epoch": 0.8508281600164859, + "grad_norm": 2.82716703414917, + "learning_rate": 1.4981933892707156e-05, + "loss": 0.16169651746749877, + "step": 198180 + }, + { + "epoch": 0.8508710921065059, + "grad_norm": 0.003215742064639926, + "learning_rate": 1.4977622172589533e-05, + "loss": 0.22183990478515625, + "step": 198190 + }, + { + "epoch": 0.850914024196526, + "grad_norm": 0.011257747188210487, + "learning_rate": 1.4973310452471909e-05, + "loss": 0.14878798723220826, + "step": 198200 + }, + { + "epoch": 0.850956956286546, + "grad_norm": 0.20254521071910858, + "learning_rate": 1.4968998732354286e-05, + "loss": 0.0998440444469452, + "step": 198210 + }, + { + "epoch": 0.8509998883765659, + "grad_norm": 6.641183853149414, + "learning_rate": 1.4964687012236662e-05, + "loss": 0.3686415910720825, + "step": 198220 + }, + { + "epoch": 0.851042820466586, + "grad_norm": 3.061818838119507, + "learning_rate": 1.4960375292119039e-05, + "loss": 0.2695192575454712, + "step": 198230 + }, + { + "epoch": 0.851085752556606, + "grad_norm": 0.0106343450024724, + "learning_rate": 1.4956063572001413e-05, + "loss": 0.10752819776535034, + "step": 198240 + }, + { + "epoch": 0.8511286846466259, + "grad_norm": 1.9127508401870728, + "learning_rate": 1.495175185188379e-05, + "loss": 0.13210554122924806, + "step": 198250 + }, + { + "epoch": 0.851171616736646, + "grad_norm": 4.323821544647217, + "learning_rate": 1.494744013176617e-05, + "loss": 0.21595339775085448, + "step": 198260 + }, + { + "epoch": 0.851214548826666, + "grad_norm": 0.07402490079402924, + "learning_rate": 1.4943128411648543e-05, + "loss": 0.14881352186203003, + "step": 198270 + }, + { + "epoch": 0.851257480916686, + "grad_norm": 0.0036409071180969477, + "learning_rate": 1.493881669153092e-05, + "loss": 0.1895911693572998, + "step": 198280 + }, + { + "epoch": 0.851300413006706, + "grad_norm": 1.7480568885803223, + "learning_rate": 1.4934504971413296e-05, + "loss": 0.10597457885742187, + "step": 198290 + }, + { + "epoch": 0.851343345096726, + "grad_norm": 0.02202913723886013, + "learning_rate": 1.4930193251295674e-05, + "loss": 0.2131192207336426, + "step": 198300 + }, + { + "epoch": 0.851386277186746, + "grad_norm": 0.06853848695755005, + "learning_rate": 1.492588153117805e-05, + "loss": 0.12416889667510986, + "step": 198310 + }, + { + "epoch": 0.851429209276766, + "grad_norm": 0.20226770639419556, + "learning_rate": 1.4921569811060427e-05, + "loss": 0.18529151678085326, + "step": 198320 + }, + { + "epoch": 0.8514721413667861, + "grad_norm": 0.012098453938961029, + "learning_rate": 1.49172580909428e-05, + "loss": 0.18486225605010986, + "step": 198330 + }, + { + "epoch": 0.851515073456806, + "grad_norm": 0.000547365692909807, + "learning_rate": 1.4912946370825178e-05, + "loss": 0.2261587142944336, + "step": 198340 + }, + { + "epoch": 0.851558005546826, + "grad_norm": 52.2000732421875, + "learning_rate": 1.4908634650707554e-05, + "loss": 0.17255141735076904, + "step": 198350 + }, + { + "epoch": 0.8516009376368461, + "grad_norm": 0.004955257289111614, + "learning_rate": 1.4904322930589931e-05, + "loss": 0.14709478616714478, + "step": 198360 + }, + { + "epoch": 0.851643869726866, + "grad_norm": 0.029430586844682693, + "learning_rate": 1.4900011210472307e-05, + "loss": 0.019669802486896516, + "step": 198370 + }, + { + "epoch": 0.851686801816886, + "grad_norm": 0.7717126607894897, + "learning_rate": 1.4895699490354684e-05, + "loss": 0.2101571798324585, + "step": 198380 + }, + { + "epoch": 0.8517297339069061, + "grad_norm": 0.08160284161567688, + "learning_rate": 1.4891387770237058e-05, + "loss": 0.01767835021018982, + "step": 198390 + }, + { + "epoch": 0.851772665996926, + "grad_norm": 3.979954481124878, + "learning_rate": 1.4887076050119435e-05, + "loss": 0.1902117371559143, + "step": 198400 + }, + { + "epoch": 0.8518155980869461, + "grad_norm": 0.8650118708610535, + "learning_rate": 1.4882764330001811e-05, + "loss": 0.1752025842666626, + "step": 198410 + }, + { + "epoch": 0.8518585301769661, + "grad_norm": 0.02981482818722725, + "learning_rate": 1.4878452609884188e-05, + "loss": 0.17029629945755004, + "step": 198420 + }, + { + "epoch": 0.851901462266986, + "grad_norm": 2.2017018795013428, + "learning_rate": 1.4874140889766564e-05, + "loss": 0.14863049983978271, + "step": 198430 + }, + { + "epoch": 0.8519443943570061, + "grad_norm": 0.03577551618218422, + "learning_rate": 1.4869829169648941e-05, + "loss": 0.09861577153205872, + "step": 198440 + }, + { + "epoch": 0.8519873264470261, + "grad_norm": 5.863005638122559, + "learning_rate": 1.4865517449531315e-05, + "loss": 0.32581243515014646, + "step": 198450 + }, + { + "epoch": 0.8520302585370461, + "grad_norm": 0.10977106541395187, + "learning_rate": 1.4861205729413694e-05, + "loss": 0.2411884069442749, + "step": 198460 + }, + { + "epoch": 0.8520731906270661, + "grad_norm": 0.026395481079816818, + "learning_rate": 1.4856894009296068e-05, + "loss": 0.11193904876708985, + "step": 198470 + }, + { + "epoch": 0.8521161227170861, + "grad_norm": 0.003131694160401821, + "learning_rate": 1.4852582289178446e-05, + "loss": 0.09634979963302612, + "step": 198480 + }, + { + "epoch": 0.8521590548071061, + "grad_norm": 0.011490268632769585, + "learning_rate": 1.4848270569060821e-05, + "loss": 0.12840532064437865, + "step": 198490 + }, + { + "epoch": 0.8522019868971261, + "grad_norm": 0.004589975345879793, + "learning_rate": 1.4843958848943199e-05, + "loss": 0.2682207107543945, + "step": 198500 + }, + { + "epoch": 0.8522449189871462, + "grad_norm": 0.00972757488489151, + "learning_rate": 1.4839647128825573e-05, + "loss": 0.24504990577697755, + "step": 198510 + }, + { + "epoch": 0.8522878510771661, + "grad_norm": 0.8272261023521423, + "learning_rate": 1.4835335408707952e-05, + "loss": 0.10439317226409912, + "step": 198520 + }, + { + "epoch": 0.8523307831671861, + "grad_norm": 0.005756685975939035, + "learning_rate": 1.4831023688590326e-05, + "loss": 0.1679734468460083, + "step": 198530 + }, + { + "epoch": 0.8523737152572062, + "grad_norm": 0.1370045691728592, + "learning_rate": 1.4826711968472703e-05, + "loss": 0.06637945771217346, + "step": 198540 + }, + { + "epoch": 0.8524166473472261, + "grad_norm": 1.1877647638320923, + "learning_rate": 1.4822400248355079e-05, + "loss": 0.23604612350463866, + "step": 198550 + }, + { + "epoch": 0.8524595794372462, + "grad_norm": 0.03956817835569382, + "learning_rate": 1.4818088528237456e-05, + "loss": 0.2815361976623535, + "step": 198560 + }, + { + "epoch": 0.8525025115272662, + "grad_norm": 0.0008984607411548495, + "learning_rate": 1.4813776808119832e-05, + "loss": 0.13040565252304076, + "step": 198570 + }, + { + "epoch": 0.8525454436172861, + "grad_norm": 0.013303420506417751, + "learning_rate": 1.4809465088002209e-05, + "loss": 0.07282218933105469, + "step": 198580 + }, + { + "epoch": 0.8525883757073062, + "grad_norm": 0.01912931352853775, + "learning_rate": 1.4805153367884583e-05, + "loss": 0.1441921591758728, + "step": 198590 + }, + { + "epoch": 0.8526313077973262, + "grad_norm": 0.0023365512024611235, + "learning_rate": 1.480084164776696e-05, + "loss": 0.038869994878768924, + "step": 198600 + }, + { + "epoch": 0.8526742398873461, + "grad_norm": 0.0236685611307621, + "learning_rate": 1.4796529927649336e-05, + "loss": 0.14604955911636353, + "step": 198610 + }, + { + "epoch": 0.8527171719773662, + "grad_norm": 0.8849748373031616, + "learning_rate": 1.4792218207531713e-05, + "loss": 0.24399905204772948, + "step": 198620 + }, + { + "epoch": 0.8527601040673862, + "grad_norm": 0.007544425316154957, + "learning_rate": 1.478790648741409e-05, + "loss": 0.1654476284980774, + "step": 198630 + }, + { + "epoch": 0.8528030361574063, + "grad_norm": 0.0008877692162059247, + "learning_rate": 1.4783594767296466e-05, + "loss": 0.2518858671188354, + "step": 198640 + }, + { + "epoch": 0.8528459682474262, + "grad_norm": 5.671640396118164, + "learning_rate": 1.4779283047178844e-05, + "loss": 0.06790667772293091, + "step": 198650 + }, + { + "epoch": 0.8528889003374462, + "grad_norm": 0.04101986810564995, + "learning_rate": 1.477497132706122e-05, + "loss": 0.274362587928772, + "step": 198660 + }, + { + "epoch": 0.8529318324274663, + "grad_norm": 0.4285285174846649, + "learning_rate": 1.4770659606943596e-05, + "loss": 0.3526389837265015, + "step": 198670 + }, + { + "epoch": 0.8529747645174862, + "grad_norm": 4.997021198272705, + "learning_rate": 1.476634788682597e-05, + "loss": 0.2588140249252319, + "step": 198680 + }, + { + "epoch": 0.8530176966075063, + "grad_norm": 0.00464608846232295, + "learning_rate": 1.4762036166708348e-05, + "loss": 0.2015136480331421, + "step": 198690 + }, + { + "epoch": 0.8530606286975263, + "grad_norm": 3.5841684341430664, + "learning_rate": 1.4757724446590723e-05, + "loss": 0.2638310432434082, + "step": 198700 + }, + { + "epoch": 0.8531035607875462, + "grad_norm": 0.02887623757123947, + "learning_rate": 1.47534127264731e-05, + "loss": 0.19619773626327514, + "step": 198710 + }, + { + "epoch": 0.8531464928775663, + "grad_norm": 0.12220575660467148, + "learning_rate": 1.4749101006355476e-05, + "loss": 0.05668666362762451, + "step": 198720 + }, + { + "epoch": 0.8531894249675863, + "grad_norm": 0.0056074149906635284, + "learning_rate": 1.4744789286237854e-05, + "loss": 0.3229382038116455, + "step": 198730 + }, + { + "epoch": 0.8532323570576062, + "grad_norm": 0.0034133633598685265, + "learning_rate": 1.4740477566120228e-05, + "loss": 0.11227246522903442, + "step": 198740 + }, + { + "epoch": 0.8532752891476263, + "grad_norm": 0.008565911091864109, + "learning_rate": 1.4736165846002605e-05, + "loss": 0.30050177574157716, + "step": 198750 + }, + { + "epoch": 0.8533182212376463, + "grad_norm": 0.010548289865255356, + "learning_rate": 1.473185412588498e-05, + "loss": 0.15760529041290283, + "step": 198760 + }, + { + "epoch": 0.8533611533276663, + "grad_norm": 1.4700676202774048, + "learning_rate": 1.4727542405767358e-05, + "loss": 0.2501633644104004, + "step": 198770 + }, + { + "epoch": 0.8534040854176863, + "grad_norm": 1.133919358253479, + "learning_rate": 1.4723230685649734e-05, + "loss": 0.09911720156669616, + "step": 198780 + }, + { + "epoch": 0.8534470175077064, + "grad_norm": 0.002514589112251997, + "learning_rate": 1.4718918965532111e-05, + "loss": 0.11620491743087769, + "step": 198790 + }, + { + "epoch": 0.8534899495977263, + "grad_norm": 0.2662227749824524, + "learning_rate": 1.4714607245414485e-05, + "loss": 0.24606781005859374, + "step": 198800 + }, + { + "epoch": 0.8535328816877463, + "grad_norm": 0.05281221121549606, + "learning_rate": 1.4710295525296864e-05, + "loss": 0.1803385615348816, + "step": 198810 + }, + { + "epoch": 0.8535758137777664, + "grad_norm": 0.2812144160270691, + "learning_rate": 1.4705983805179238e-05, + "loss": 0.06128525733947754, + "step": 198820 + }, + { + "epoch": 0.8536187458677863, + "grad_norm": 3.4682204723358154, + "learning_rate": 1.4701672085061615e-05, + "loss": 0.4475576877593994, + "step": 198830 + }, + { + "epoch": 0.8536616779578063, + "grad_norm": 11.337523460388184, + "learning_rate": 1.4697360364943991e-05, + "loss": 0.5344314575195312, + "step": 198840 + }, + { + "epoch": 0.8537046100478264, + "grad_norm": 1.6709545850753784, + "learning_rate": 1.4693048644826368e-05, + "loss": 0.28226404190063475, + "step": 198850 + }, + { + "epoch": 0.8537475421378463, + "grad_norm": 0.10255688428878784, + "learning_rate": 1.4688736924708742e-05, + "loss": 0.2390998363494873, + "step": 198860 + }, + { + "epoch": 0.8537904742278664, + "grad_norm": 0.021323187276721, + "learning_rate": 1.4684425204591121e-05, + "loss": 0.24051153659820557, + "step": 198870 + }, + { + "epoch": 0.8538334063178864, + "grad_norm": 0.011119546368718147, + "learning_rate": 1.4680113484473495e-05, + "loss": 0.09393274188041686, + "step": 198880 + }, + { + "epoch": 0.8538763384079063, + "grad_norm": 0.03298373520374298, + "learning_rate": 1.4675801764355873e-05, + "loss": 0.135442852973938, + "step": 198890 + }, + { + "epoch": 0.8539192704979264, + "grad_norm": 2.304260730743408, + "learning_rate": 1.4671490044238248e-05, + "loss": 0.1495063066482544, + "step": 198900 + }, + { + "epoch": 0.8539622025879464, + "grad_norm": 0.033370859920978546, + "learning_rate": 1.4667178324120626e-05, + "loss": 0.17575643062591553, + "step": 198910 + }, + { + "epoch": 0.8540051346779663, + "grad_norm": 0.0040658460929989815, + "learning_rate": 1.4662866604003001e-05, + "loss": 0.21572141647338866, + "step": 198920 + }, + { + "epoch": 0.8540480667679864, + "grad_norm": 1.2997663021087646, + "learning_rate": 1.4658554883885379e-05, + "loss": 0.12388513088226319, + "step": 198930 + }, + { + "epoch": 0.8540909988580064, + "grad_norm": 0.35280415415763855, + "learning_rate": 1.4654243163767753e-05, + "loss": 0.11193270683288574, + "step": 198940 + }, + { + "epoch": 0.8541339309480264, + "grad_norm": 1.490394949913025, + "learning_rate": 1.464993144365013e-05, + "loss": 0.309222149848938, + "step": 198950 + }, + { + "epoch": 0.8541768630380464, + "grad_norm": 0.7588453888893127, + "learning_rate": 1.4645619723532506e-05, + "loss": 0.28592557907104493, + "step": 198960 + }, + { + "epoch": 0.8542197951280665, + "grad_norm": 0.002865720773115754, + "learning_rate": 1.4641308003414883e-05, + "loss": 0.11334433555603027, + "step": 198970 + }, + { + "epoch": 0.8542627272180864, + "grad_norm": 17.505897521972656, + "learning_rate": 1.463699628329726e-05, + "loss": 0.2242356061935425, + "step": 198980 + }, + { + "epoch": 0.8543056593081064, + "grad_norm": 0.6268243193626404, + "learning_rate": 1.4632684563179636e-05, + "loss": 0.3863025665283203, + "step": 198990 + }, + { + "epoch": 0.8543485913981265, + "grad_norm": 0.0017697836738079786, + "learning_rate": 1.4628372843062013e-05, + "loss": 0.10508785247802735, + "step": 199000 + }, + { + "epoch": 0.8543485913981265, + "eval_loss": 0.3737829625606537, + "eval_runtime": 27.4522, + "eval_samples_per_second": 3.643, + "eval_steps_per_second": 3.643, + "step": 199000 + }, + { + "epoch": 0.8543915234881464, + "grad_norm": 21.080825805664062, + "learning_rate": 1.4624061122944387e-05, + "loss": 0.2644538640975952, + "step": 199010 + }, + { + "epoch": 0.8544344555781664, + "grad_norm": 0.2038983702659607, + "learning_rate": 1.4619749402826766e-05, + "loss": 0.24054296016693116, + "step": 199020 + }, + { + "epoch": 0.8544773876681865, + "grad_norm": 0.03463335707783699, + "learning_rate": 1.461543768270914e-05, + "loss": 0.2102268934249878, + "step": 199030 + }, + { + "epoch": 0.8545203197582064, + "grad_norm": 0.0042945947498083115, + "learning_rate": 1.4611125962591518e-05, + "loss": 0.09614935517311096, + "step": 199040 + }, + { + "epoch": 0.8545632518482265, + "grad_norm": 0.13123205304145813, + "learning_rate": 1.4606814242473893e-05, + "loss": 0.3031400203704834, + "step": 199050 + }, + { + "epoch": 0.8546061839382465, + "grad_norm": 0.06705895811319351, + "learning_rate": 1.460250252235627e-05, + "loss": 0.1421942949295044, + "step": 199060 + }, + { + "epoch": 0.8546491160282665, + "grad_norm": 4.683314323425293, + "learning_rate": 1.4598190802238646e-05, + "loss": 0.08168305158615112, + "step": 199070 + }, + { + "epoch": 0.8546920481182865, + "grad_norm": 0.0422859787940979, + "learning_rate": 1.4593879082121024e-05, + "loss": 0.16412352323532103, + "step": 199080 + }, + { + "epoch": 0.8547349802083065, + "grad_norm": 0.021981369704008102, + "learning_rate": 1.4589567362003397e-05, + "loss": 0.20852310657501222, + "step": 199090 + }, + { + "epoch": 0.8547779122983266, + "grad_norm": 0.1515505313873291, + "learning_rate": 1.4585255641885775e-05, + "loss": 0.06142481565475464, + "step": 199100 + }, + { + "epoch": 0.8548208443883465, + "grad_norm": 0.13333864510059357, + "learning_rate": 1.458094392176815e-05, + "loss": 0.2553966283798218, + "step": 199110 + }, + { + "epoch": 0.8548637764783665, + "grad_norm": 26.86813735961914, + "learning_rate": 1.4576632201650528e-05, + "loss": 0.36869144439697266, + "step": 199120 + }, + { + "epoch": 0.8549067085683866, + "grad_norm": 5.093358516693115, + "learning_rate": 1.4572320481532903e-05, + "loss": 0.32376537322998045, + "step": 199130 + }, + { + "epoch": 0.8549496406584065, + "grad_norm": 0.03296341374516487, + "learning_rate": 1.456800876141528e-05, + "loss": 0.3048763036727905, + "step": 199140 + }, + { + "epoch": 0.8549925727484265, + "grad_norm": 0.14229708909988403, + "learning_rate": 1.4563697041297655e-05, + "loss": 0.1860854983329773, + "step": 199150 + }, + { + "epoch": 0.8550355048384466, + "grad_norm": 0.0028889018576592207, + "learning_rate": 1.4559385321180034e-05, + "loss": 0.3693838357925415, + "step": 199160 + }, + { + "epoch": 0.8550784369284665, + "grad_norm": 0.8938426971435547, + "learning_rate": 1.4555073601062408e-05, + "loss": 0.3041311979293823, + "step": 199170 + }, + { + "epoch": 0.8551213690184866, + "grad_norm": 0.0019801489543169737, + "learning_rate": 1.4550761880944785e-05, + "loss": 0.2032299041748047, + "step": 199180 + }, + { + "epoch": 0.8551643011085066, + "grad_norm": 5.202205181121826, + "learning_rate": 1.454645016082716e-05, + "loss": 0.09197419881820679, + "step": 199190 + }, + { + "epoch": 0.8552072331985265, + "grad_norm": 3.7978355884552, + "learning_rate": 1.4542138440709538e-05, + "loss": 0.38236684799194337, + "step": 199200 + }, + { + "epoch": 0.8552501652885466, + "grad_norm": 0.003671700833365321, + "learning_rate": 1.4537826720591912e-05, + "loss": 0.17069878578186035, + "step": 199210 + }, + { + "epoch": 0.8552930973785666, + "grad_norm": 0.001484514563344419, + "learning_rate": 1.4533515000474291e-05, + "loss": 0.15370936393737794, + "step": 199220 + }, + { + "epoch": 0.8553360294685866, + "grad_norm": 3.502985715866089, + "learning_rate": 1.4529203280356665e-05, + "loss": 0.3548114776611328, + "step": 199230 + }, + { + "epoch": 0.8553789615586066, + "grad_norm": 0.02675688825547695, + "learning_rate": 1.4524891560239042e-05, + "loss": 0.45171194076538085, + "step": 199240 + }, + { + "epoch": 0.8554218936486266, + "grad_norm": 4.345065593719482, + "learning_rate": 1.4520579840121418e-05, + "loss": 0.3299868583679199, + "step": 199250 + }, + { + "epoch": 0.8554648257386466, + "grad_norm": 0.011832942254841328, + "learning_rate": 1.4516268120003795e-05, + "loss": 0.1040421962738037, + "step": 199260 + }, + { + "epoch": 0.8555077578286666, + "grad_norm": 0.05655861273407936, + "learning_rate": 1.4511956399886171e-05, + "loss": 0.10234776735305787, + "step": 199270 + }, + { + "epoch": 0.8555506899186867, + "grad_norm": 0.03904902935028076, + "learning_rate": 1.4507644679768548e-05, + "loss": 0.20289580821990966, + "step": 199280 + }, + { + "epoch": 0.8555936220087066, + "grad_norm": 1.6829935312271118, + "learning_rate": 1.4503332959650922e-05, + "loss": 0.1187442421913147, + "step": 199290 + }, + { + "epoch": 0.8556365540987266, + "grad_norm": 0.07314160466194153, + "learning_rate": 1.44990212395333e-05, + "loss": 0.23276002407073976, + "step": 199300 + }, + { + "epoch": 0.8556794861887467, + "grad_norm": 0.026286179199814796, + "learning_rate": 1.4494709519415675e-05, + "loss": 0.424951171875, + "step": 199310 + }, + { + "epoch": 0.8557224182787666, + "grad_norm": 0.01814284734427929, + "learning_rate": 1.4490397799298053e-05, + "loss": 0.35801832675933837, + "step": 199320 + }, + { + "epoch": 0.8557653503687866, + "grad_norm": 0.002590248826891184, + "learning_rate": 1.4486086079180428e-05, + "loss": 0.25980291366577146, + "step": 199330 + }, + { + "epoch": 0.8558082824588067, + "grad_norm": 0.009577231481671333, + "learning_rate": 1.4481774359062806e-05, + "loss": 0.09345943331718445, + "step": 199340 + }, + { + "epoch": 0.8558512145488266, + "grad_norm": 0.005946278106421232, + "learning_rate": 1.4477462638945183e-05, + "loss": 0.21709318161010743, + "step": 199350 + }, + { + "epoch": 0.8558941466388467, + "grad_norm": 0.15229612588882446, + "learning_rate": 1.4473150918827557e-05, + "loss": 0.14111752510070802, + "step": 199360 + }, + { + "epoch": 0.8559370787288667, + "grad_norm": 0.004969291388988495, + "learning_rate": 1.4468839198709936e-05, + "loss": 0.14079365730285645, + "step": 199370 + }, + { + "epoch": 0.8559800108188866, + "grad_norm": 0.028083153069019318, + "learning_rate": 1.446452747859231e-05, + "loss": 0.11227288246154785, + "step": 199380 + }, + { + "epoch": 0.8560229429089067, + "grad_norm": 0.001869790954515338, + "learning_rate": 1.4460215758474687e-05, + "loss": 0.054939448833465576, + "step": 199390 + }, + { + "epoch": 0.8560658749989267, + "grad_norm": 0.37315037846565247, + "learning_rate": 1.4455904038357063e-05, + "loss": 0.14655609130859376, + "step": 199400 + }, + { + "epoch": 0.8561088070889467, + "grad_norm": 3.289761543273926, + "learning_rate": 1.445159231823944e-05, + "loss": 0.14338996410369872, + "step": 199410 + }, + { + "epoch": 0.8561517391789667, + "grad_norm": 42.340110778808594, + "learning_rate": 1.4447280598121816e-05, + "loss": 0.036475923657417295, + "step": 199420 + }, + { + "epoch": 0.8561946712689867, + "grad_norm": 0.0018733445322141051, + "learning_rate": 1.4442968878004193e-05, + "loss": 0.039892581105232236, + "step": 199430 + }, + { + "epoch": 0.8562376033590067, + "grad_norm": 0.045775555074214935, + "learning_rate": 1.4438657157886567e-05, + "loss": 0.03090730607509613, + "step": 199440 + }, + { + "epoch": 0.8562805354490267, + "grad_norm": 1.3289633989334106, + "learning_rate": 1.4434345437768945e-05, + "loss": 0.22715282440185547, + "step": 199450 + }, + { + "epoch": 0.8563234675390468, + "grad_norm": 0.033413052558898926, + "learning_rate": 1.443003371765132e-05, + "loss": 0.17703909873962403, + "step": 199460 + }, + { + "epoch": 0.8563663996290667, + "grad_norm": 0.0026612640358507633, + "learning_rate": 1.4425721997533698e-05, + "loss": 0.09658971428871155, + "step": 199470 + }, + { + "epoch": 0.8564093317190867, + "grad_norm": 0.0032949293963611126, + "learning_rate": 1.4421410277416073e-05, + "loss": 0.08444079160690307, + "step": 199480 + }, + { + "epoch": 0.8564522638091068, + "grad_norm": 2.955509662628174, + "learning_rate": 1.441709855729845e-05, + "loss": 0.3362619161605835, + "step": 199490 + }, + { + "epoch": 0.8564951958991268, + "grad_norm": 1.2076606750488281, + "learning_rate": 1.4412786837180824e-05, + "loss": 0.15357297658920288, + "step": 199500 + }, + { + "epoch": 0.8565381279891467, + "grad_norm": 1.189163327217102, + "learning_rate": 1.4408475117063202e-05, + "loss": 0.20367789268493652, + "step": 199510 + }, + { + "epoch": 0.8565810600791668, + "grad_norm": 3.673931360244751, + "learning_rate": 1.4404163396945577e-05, + "loss": 0.3991194486618042, + "step": 199520 + }, + { + "epoch": 0.8566239921691868, + "grad_norm": 0.24287502467632294, + "learning_rate": 1.4399851676827955e-05, + "loss": 0.20523061752319335, + "step": 199530 + }, + { + "epoch": 0.8566669242592068, + "grad_norm": 0.8159131407737732, + "learning_rate": 1.439553995671033e-05, + "loss": 0.16643912792205812, + "step": 199540 + }, + { + "epoch": 0.8567098563492268, + "grad_norm": 0.0028872189577668905, + "learning_rate": 1.4391228236592708e-05, + "loss": 0.22258753776550294, + "step": 199550 + }, + { + "epoch": 0.8567527884392468, + "grad_norm": 1.2108325958251953, + "learning_rate": 1.4386916516475082e-05, + "loss": 0.13343702554702758, + "step": 199560 + }, + { + "epoch": 0.8567957205292668, + "grad_norm": 1.2748582363128662, + "learning_rate": 1.438260479635746e-05, + "loss": 0.1584118366241455, + "step": 199570 + }, + { + "epoch": 0.8568386526192868, + "grad_norm": 3.328965663909912, + "learning_rate": 1.4378293076239835e-05, + "loss": 0.08251501321792602, + "step": 199580 + }, + { + "epoch": 0.8568815847093069, + "grad_norm": 9.528289794921875, + "learning_rate": 1.4373981356122212e-05, + "loss": 0.3464500904083252, + "step": 199590 + }, + { + "epoch": 0.8569245167993268, + "grad_norm": 0.027622880414128304, + "learning_rate": 1.4369669636004588e-05, + "loss": 0.11451354026794433, + "step": 199600 + }, + { + "epoch": 0.8569674488893468, + "grad_norm": 0.0027303590904921293, + "learning_rate": 1.4365357915886965e-05, + "loss": 0.24467782974243163, + "step": 199610 + }, + { + "epoch": 0.8570103809793669, + "grad_norm": 0.02862054668366909, + "learning_rate": 1.4361046195769339e-05, + "loss": 0.1920831799507141, + "step": 199620 + }, + { + "epoch": 0.8570533130693868, + "grad_norm": 0.0019593604374676943, + "learning_rate": 1.4356734475651718e-05, + "loss": 0.11015578508377075, + "step": 199630 + }, + { + "epoch": 0.8570962451594069, + "grad_norm": 3.255344867706299, + "learning_rate": 1.4352422755534092e-05, + "loss": 0.3502398729324341, + "step": 199640 + }, + { + "epoch": 0.8571391772494269, + "grad_norm": 2.490980625152588, + "learning_rate": 1.434811103541647e-05, + "loss": 0.4833946704864502, + "step": 199650 + }, + { + "epoch": 0.8571821093394468, + "grad_norm": 0.00246992614120245, + "learning_rate": 1.4343799315298845e-05, + "loss": 0.1826173782348633, + "step": 199660 + }, + { + "epoch": 0.8572250414294669, + "grad_norm": 0.002251984551548958, + "learning_rate": 1.4339487595181222e-05, + "loss": 0.5154530048370362, + "step": 199670 + }, + { + "epoch": 0.8572679735194869, + "grad_norm": 5.490510940551758, + "learning_rate": 1.4335175875063598e-05, + "loss": 0.29611093997955323, + "step": 199680 + }, + { + "epoch": 0.8573109056095068, + "grad_norm": 2.570019006729126, + "learning_rate": 1.4330864154945975e-05, + "loss": 0.40025644302368163, + "step": 199690 + }, + { + "epoch": 0.8573538376995269, + "grad_norm": 0.5635516047477722, + "learning_rate": 1.432655243482835e-05, + "loss": 0.06518831253051757, + "step": 199700 + }, + { + "epoch": 0.8573967697895469, + "grad_norm": 2.1924285888671875, + "learning_rate": 1.4322240714710727e-05, + "loss": 0.15234854221343994, + "step": 199710 + }, + { + "epoch": 0.8574397018795669, + "grad_norm": 3.579240083694458, + "learning_rate": 1.4317928994593106e-05, + "loss": 0.17781139612197877, + "step": 199720 + }, + { + "epoch": 0.8574826339695869, + "grad_norm": 0.04103695973753929, + "learning_rate": 1.431361727447548e-05, + "loss": 0.41463704109191896, + "step": 199730 + }, + { + "epoch": 0.857525566059607, + "grad_norm": 1.245309829711914, + "learning_rate": 1.4309305554357857e-05, + "loss": 0.1456066131591797, + "step": 199740 + }, + { + "epoch": 0.8575684981496269, + "grad_norm": 0.6712180972099304, + "learning_rate": 1.4304993834240233e-05, + "loss": 0.40964322090148925, + "step": 199750 + }, + { + "epoch": 0.8576114302396469, + "grad_norm": 1.6160268783569336, + "learning_rate": 1.430068211412261e-05, + "loss": 0.27961690425872804, + "step": 199760 + }, + { + "epoch": 0.857654362329667, + "grad_norm": 0.030963778495788574, + "learning_rate": 1.4296370394004986e-05, + "loss": 0.19356164932250977, + "step": 199770 + }, + { + "epoch": 0.8576972944196869, + "grad_norm": 0.06203527748584747, + "learning_rate": 1.4292058673887363e-05, + "loss": 0.1277173399925232, + "step": 199780 + }, + { + "epoch": 0.8577402265097069, + "grad_norm": 0.07475373893976212, + "learning_rate": 1.4287746953769737e-05, + "loss": 0.13933945894241334, + "step": 199790 + }, + { + "epoch": 0.857783158599727, + "grad_norm": 2.2970597743988037, + "learning_rate": 1.4283435233652114e-05, + "loss": 0.343434739112854, + "step": 199800 + }, + { + "epoch": 0.8578260906897469, + "grad_norm": 1.9501216411590576, + "learning_rate": 1.427912351353449e-05, + "loss": 0.17411470413208008, + "step": 199810 + }, + { + "epoch": 0.857869022779767, + "grad_norm": 1.5812982320785522, + "learning_rate": 1.4274811793416867e-05, + "loss": 0.12971644401550292, + "step": 199820 + }, + { + "epoch": 0.857911954869787, + "grad_norm": 0.2392256110906601, + "learning_rate": 1.4270500073299243e-05, + "loss": 0.07452273964881898, + "step": 199830 + }, + { + "epoch": 0.8579548869598069, + "grad_norm": 2.668260097503662, + "learning_rate": 1.426618835318162e-05, + "loss": 0.1514972448348999, + "step": 199840 + }, + { + "epoch": 0.857997819049827, + "grad_norm": 0.029095986858010292, + "learning_rate": 1.4261876633063994e-05, + "loss": 0.06277897953987122, + "step": 199850 + }, + { + "epoch": 0.858040751139847, + "grad_norm": 0.001128542353399098, + "learning_rate": 1.4257564912946372e-05, + "loss": 0.24682888984680176, + "step": 199860 + }, + { + "epoch": 0.858083683229867, + "grad_norm": 0.061851732432842255, + "learning_rate": 1.4253253192828747e-05, + "loss": 0.12122787237167358, + "step": 199870 + }, + { + "epoch": 0.858126615319887, + "grad_norm": 0.0018141150940209627, + "learning_rate": 1.4248941472711125e-05, + "loss": 0.37261626720428465, + "step": 199880 + }, + { + "epoch": 0.858169547409907, + "grad_norm": 0.0010228273458778858, + "learning_rate": 1.42446297525935e-05, + "loss": 0.10979610681533813, + "step": 199890 + }, + { + "epoch": 0.858212479499927, + "grad_norm": 0.2523162364959717, + "learning_rate": 1.4240318032475878e-05, + "loss": 0.22949092388153075, + "step": 199900 + }, + { + "epoch": 0.858255411589947, + "grad_norm": 0.06273109465837479, + "learning_rate": 1.4236006312358251e-05, + "loss": 0.28017995357513426, + "step": 199910 + }, + { + "epoch": 0.858298343679967, + "grad_norm": 0.008993618190288544, + "learning_rate": 1.423169459224063e-05, + "loss": 0.09785916209220887, + "step": 199920 + }, + { + "epoch": 0.8583412757699871, + "grad_norm": 0.0179828479886055, + "learning_rate": 1.4227382872123004e-05, + "loss": 0.4375472545623779, + "step": 199930 + }, + { + "epoch": 0.858384207860007, + "grad_norm": 0.003786655142903328, + "learning_rate": 1.4223071152005382e-05, + "loss": 0.28068861961364744, + "step": 199940 + }, + { + "epoch": 0.8584271399500271, + "grad_norm": 1.220051884651184, + "learning_rate": 1.4218759431887757e-05, + "loss": 0.3362873077392578, + "step": 199950 + }, + { + "epoch": 0.8584700720400471, + "grad_norm": 0.004382483195513487, + "learning_rate": 1.4214447711770135e-05, + "loss": 0.1859076976776123, + "step": 199960 + }, + { + "epoch": 0.858513004130067, + "grad_norm": 0.008440139703452587, + "learning_rate": 1.4210135991652509e-05, + "loss": 0.2096766471862793, + "step": 199970 + }, + { + "epoch": 0.8585559362200871, + "grad_norm": 0.004920396488159895, + "learning_rate": 1.4205824271534888e-05, + "loss": 0.23950378894805907, + "step": 199980 + }, + { + "epoch": 0.8585988683101071, + "grad_norm": 0.0025335282552987337, + "learning_rate": 1.4201512551417262e-05, + "loss": 0.14211657047271728, + "step": 199990 + }, + { + "epoch": 0.858641800400127, + "grad_norm": 0.011228648945689201, + "learning_rate": 1.4197200831299639e-05, + "loss": 0.16281137466430665, + "step": 200000 + }, + { + "epoch": 0.858641800400127, + "eval_loss": 0.3807581961154938, + "eval_runtime": 27.6023, + "eval_samples_per_second": 3.623, + "eval_steps_per_second": 3.623, + "step": 200000 + }, + { + "epoch": 0.8586847324901471, + "grad_norm": 0.020021537318825722, + "learning_rate": 1.4192889111182015e-05, + "loss": 0.3246197462081909, + "step": 200010 + }, + { + "epoch": 0.8587276645801671, + "grad_norm": 0.00552589725703001, + "learning_rate": 1.4188577391064392e-05, + "loss": 0.325114107131958, + "step": 200020 + }, + { + "epoch": 0.8587705966701871, + "grad_norm": 1.4242668151855469, + "learning_rate": 1.4184265670946768e-05, + "loss": 0.1461607336997986, + "step": 200030 + }, + { + "epoch": 0.8588135287602071, + "grad_norm": 2.1981289386749268, + "learning_rate": 1.4179953950829145e-05, + "loss": 0.23893659114837645, + "step": 200040 + }, + { + "epoch": 0.8588564608502272, + "grad_norm": 0.003543504746630788, + "learning_rate": 1.4175642230711519e-05, + "loss": 0.17610794305801392, + "step": 200050 + }, + { + "epoch": 0.8588993929402471, + "grad_norm": 7.391210079193115, + "learning_rate": 1.4171330510593896e-05, + "loss": 0.28483357429504397, + "step": 200060 + }, + { + "epoch": 0.8589423250302671, + "grad_norm": 0.03855060786008835, + "learning_rate": 1.4167018790476272e-05, + "loss": 0.28767685890197753, + "step": 200070 + }, + { + "epoch": 0.8589852571202872, + "grad_norm": 4.486928939819336, + "learning_rate": 1.416270707035865e-05, + "loss": 0.1376192569732666, + "step": 200080 + }, + { + "epoch": 0.8590281892103071, + "grad_norm": 0.01081493217498064, + "learning_rate": 1.4158395350241027e-05, + "loss": 0.2854840040206909, + "step": 200090 + }, + { + "epoch": 0.8590711213003271, + "grad_norm": 0.015720579773187637, + "learning_rate": 1.4154083630123402e-05, + "loss": 0.15743573904037475, + "step": 200100 + }, + { + "epoch": 0.8591140533903472, + "grad_norm": 0.38807976245880127, + "learning_rate": 1.414977191000578e-05, + "loss": 0.11960583925247192, + "step": 200110 + }, + { + "epoch": 0.8591569854803671, + "grad_norm": 0.2119477093219757, + "learning_rate": 1.4145460189888154e-05, + "loss": 0.09136803150177002, + "step": 200120 + }, + { + "epoch": 0.8591999175703872, + "grad_norm": 1.8273348808288574, + "learning_rate": 1.4141148469770533e-05, + "loss": 0.1844263792037964, + "step": 200130 + }, + { + "epoch": 0.8592428496604072, + "grad_norm": 6.799977779388428, + "learning_rate": 1.4136836749652907e-05, + "loss": 0.2549062013626099, + "step": 200140 + }, + { + "epoch": 0.8592857817504271, + "grad_norm": 0.05221116542816162, + "learning_rate": 1.4132525029535284e-05, + "loss": 0.01656073033809662, + "step": 200150 + }, + { + "epoch": 0.8593287138404472, + "grad_norm": 0.002522778697311878, + "learning_rate": 1.412821330941766e-05, + "loss": 0.18575385808944703, + "step": 200160 + }, + { + "epoch": 0.8593716459304672, + "grad_norm": 2.425380229949951, + "learning_rate": 1.4123901589300037e-05, + "loss": 0.3281693458557129, + "step": 200170 + }, + { + "epoch": 0.8594145780204872, + "grad_norm": 1.184431791305542, + "learning_rate": 1.4119589869182413e-05, + "loss": 0.16849026679992676, + "step": 200180 + }, + { + "epoch": 0.8594575101105072, + "grad_norm": 0.27626046538352966, + "learning_rate": 1.411527814906479e-05, + "loss": 0.2617574453353882, + "step": 200190 + }, + { + "epoch": 0.8595004422005272, + "grad_norm": 0.0017152708023786545, + "learning_rate": 1.4110966428947164e-05, + "loss": 0.2841400146484375, + "step": 200200 + }, + { + "epoch": 0.8595433742905472, + "grad_norm": 0.006438401993364096, + "learning_rate": 1.4106654708829541e-05, + "loss": 0.4257059097290039, + "step": 200210 + }, + { + "epoch": 0.8595863063805672, + "grad_norm": 0.039594002068042755, + "learning_rate": 1.4102342988711917e-05, + "loss": 0.16782171726226808, + "step": 200220 + }, + { + "epoch": 0.8596292384705873, + "grad_norm": 0.28560030460357666, + "learning_rate": 1.4098031268594294e-05, + "loss": 0.22525856494903565, + "step": 200230 + }, + { + "epoch": 0.8596721705606072, + "grad_norm": 2.2180888652801514, + "learning_rate": 1.409371954847667e-05, + "loss": 0.1681581974029541, + "step": 200240 + }, + { + "epoch": 0.8597151026506272, + "grad_norm": 0.059903547167778015, + "learning_rate": 1.4089407828359047e-05, + "loss": 0.11498527526855469, + "step": 200250 + }, + { + "epoch": 0.8597580347406473, + "grad_norm": 0.017579248175024986, + "learning_rate": 1.4085096108241421e-05, + "loss": 0.2624624252319336, + "step": 200260 + }, + { + "epoch": 0.8598009668306672, + "grad_norm": 0.0018392846686765552, + "learning_rate": 1.4080784388123799e-05, + "loss": 0.20496695041656493, + "step": 200270 + }, + { + "epoch": 0.8598438989206872, + "grad_norm": 0.022280734032392502, + "learning_rate": 1.4076472668006174e-05, + "loss": 0.22362060546875, + "step": 200280 + }, + { + "epoch": 0.8598868310107073, + "grad_norm": 0.04711763188242912, + "learning_rate": 1.4072160947888552e-05, + "loss": 0.25614199638366697, + "step": 200290 + }, + { + "epoch": 0.8599297631007272, + "grad_norm": 0.9575517177581787, + "learning_rate": 1.4067849227770927e-05, + "loss": 0.3862587928771973, + "step": 200300 + }, + { + "epoch": 0.8599726951907473, + "grad_norm": 1.2002527713775635, + "learning_rate": 1.4063537507653305e-05, + "loss": 0.29901714324951173, + "step": 200310 + }, + { + "epoch": 0.8600156272807673, + "grad_norm": 3.8473565578460693, + "learning_rate": 1.4059225787535679e-05, + "loss": 0.2574320793151855, + "step": 200320 + }, + { + "epoch": 0.8600585593707872, + "grad_norm": 0.025791872292757034, + "learning_rate": 1.4054914067418058e-05, + "loss": 0.22598130702972413, + "step": 200330 + }, + { + "epoch": 0.8601014914608073, + "grad_norm": 0.2701096534729004, + "learning_rate": 1.4050602347300432e-05, + "loss": 0.3798185348510742, + "step": 200340 + }, + { + "epoch": 0.8601444235508273, + "grad_norm": 0.2829890251159668, + "learning_rate": 1.4046290627182809e-05, + "loss": 0.21899137496948243, + "step": 200350 + }, + { + "epoch": 0.8601873556408474, + "grad_norm": 0.024702927097678185, + "learning_rate": 1.4041978907065184e-05, + "loss": 0.16811333894729613, + "step": 200360 + }, + { + "epoch": 0.8602302877308673, + "grad_norm": 0.07576318085193634, + "learning_rate": 1.4037667186947562e-05, + "loss": 0.00663701742887497, + "step": 200370 + }, + { + "epoch": 0.8602732198208873, + "grad_norm": 1.4503016471862793, + "learning_rate": 1.4033355466829936e-05, + "loss": 0.12653093338012694, + "step": 200380 + }, + { + "epoch": 0.8603161519109074, + "grad_norm": 1.278931975364685, + "learning_rate": 1.4029043746712315e-05, + "loss": 0.149183988571167, + "step": 200390 + }, + { + "epoch": 0.8603590840009273, + "grad_norm": 2.7294182777404785, + "learning_rate": 1.4024732026594689e-05, + "loss": 0.2316087007522583, + "step": 200400 + }, + { + "epoch": 0.8604020160909474, + "grad_norm": 0.0018006651662290096, + "learning_rate": 1.4020420306477066e-05, + "loss": 0.16761742830276488, + "step": 200410 + }, + { + "epoch": 0.8604449481809674, + "grad_norm": 0.011157185770571232, + "learning_rate": 1.4016108586359442e-05, + "loss": 0.16158027648925782, + "step": 200420 + }, + { + "epoch": 0.8604878802709873, + "grad_norm": 2.3285841941833496, + "learning_rate": 1.4011796866241819e-05, + "loss": 0.2909983158111572, + "step": 200430 + }, + { + "epoch": 0.8605308123610074, + "grad_norm": 4.663473129272461, + "learning_rate": 1.4007485146124196e-05, + "loss": 0.20800578594207764, + "step": 200440 + }, + { + "epoch": 0.8605737444510274, + "grad_norm": 0.18279698491096497, + "learning_rate": 1.4003173426006572e-05, + "loss": 0.1953489065170288, + "step": 200450 + }, + { + "epoch": 0.8606166765410473, + "grad_norm": 0.009925964288413525, + "learning_rate": 1.399886170588895e-05, + "loss": 0.2169330358505249, + "step": 200460 + }, + { + "epoch": 0.8606596086310674, + "grad_norm": 0.19705329835414886, + "learning_rate": 1.3994549985771323e-05, + "loss": 0.0679843246936798, + "step": 200470 + }, + { + "epoch": 0.8607025407210874, + "grad_norm": 0.15052109956741333, + "learning_rate": 1.3990238265653702e-05, + "loss": 0.24635510444641112, + "step": 200480 + }, + { + "epoch": 0.8607454728111074, + "grad_norm": 0.002157369162887335, + "learning_rate": 1.3985926545536076e-05, + "loss": 0.1807490348815918, + "step": 200490 + }, + { + "epoch": 0.8607884049011274, + "grad_norm": 0.0005508503527380526, + "learning_rate": 1.3981614825418454e-05, + "loss": 0.06903789043426514, + "step": 200500 + }, + { + "epoch": 0.8608313369911474, + "grad_norm": 1.3312430381774902, + "learning_rate": 1.397730310530083e-05, + "loss": 0.12055761814117431, + "step": 200510 + }, + { + "epoch": 0.8608742690811674, + "grad_norm": 0.02469063363969326, + "learning_rate": 1.3972991385183207e-05, + "loss": 0.02727437615394592, + "step": 200520 + }, + { + "epoch": 0.8609172011711874, + "grad_norm": 2.345923662185669, + "learning_rate": 1.3968679665065582e-05, + "loss": 0.20277533531188965, + "step": 200530 + }, + { + "epoch": 0.8609601332612075, + "grad_norm": 0.0974428579211235, + "learning_rate": 1.396436794494796e-05, + "loss": 0.17809678316116334, + "step": 200540 + }, + { + "epoch": 0.8610030653512274, + "grad_norm": 1.2723183631896973, + "learning_rate": 1.3960056224830334e-05, + "loss": 0.1863088607788086, + "step": 200550 + }, + { + "epoch": 0.8610459974412474, + "grad_norm": 1.1321065425872803, + "learning_rate": 1.3955744504712711e-05, + "loss": 0.2558783292770386, + "step": 200560 + }, + { + "epoch": 0.8610889295312675, + "grad_norm": 1.367501974105835, + "learning_rate": 1.3951432784595087e-05, + "loss": 0.21771454811096191, + "step": 200570 + }, + { + "epoch": 0.8611318616212874, + "grad_norm": 0.05725770443677902, + "learning_rate": 1.3947121064477464e-05, + "loss": 0.08010371923446655, + "step": 200580 + }, + { + "epoch": 0.8611747937113075, + "grad_norm": 1.8050976991653442, + "learning_rate": 1.394280934435984e-05, + "loss": 0.32913758754730227, + "step": 200590 + }, + { + "epoch": 0.8612177258013275, + "grad_norm": 0.08833785355091095, + "learning_rate": 1.3938497624242217e-05, + "loss": 0.03277966380119324, + "step": 200600 + }, + { + "epoch": 0.8612606578913474, + "grad_norm": 0.005714171566069126, + "learning_rate": 1.3934185904124591e-05, + "loss": 0.18971192836761475, + "step": 200610 + }, + { + "epoch": 0.8613035899813675, + "grad_norm": 0.010824470780789852, + "learning_rate": 1.3929874184006968e-05, + "loss": 0.2813425540924072, + "step": 200620 + }, + { + "epoch": 0.8613465220713875, + "grad_norm": 0.00637517124414444, + "learning_rate": 1.3925562463889344e-05, + "loss": 0.2274622440338135, + "step": 200630 + }, + { + "epoch": 0.8613894541614074, + "grad_norm": 0.015382036566734314, + "learning_rate": 1.3921250743771721e-05, + "loss": 0.05090780854225159, + "step": 200640 + }, + { + "epoch": 0.8614323862514275, + "grad_norm": 2.6729819774627686, + "learning_rate": 1.3916939023654097e-05, + "loss": 0.24126005172729492, + "step": 200650 + }, + { + "epoch": 0.8614753183414475, + "grad_norm": 1.538490653038025, + "learning_rate": 1.3912627303536474e-05, + "loss": 0.21481027603149414, + "step": 200660 + }, + { + "epoch": 0.8615182504314675, + "grad_norm": 0.002245868556201458, + "learning_rate": 1.3908315583418848e-05, + "loss": 0.23646047115325927, + "step": 200670 + }, + { + "epoch": 0.8615611825214875, + "grad_norm": 0.0007779735024087131, + "learning_rate": 1.3904003863301227e-05, + "loss": 0.1696911096572876, + "step": 200680 + }, + { + "epoch": 0.8616041146115075, + "grad_norm": 2.3345389366149902, + "learning_rate": 1.3899692143183601e-05, + "loss": 0.24698200225830078, + "step": 200690 + }, + { + "epoch": 0.8616470467015275, + "grad_norm": 0.010887118056416512, + "learning_rate": 1.3895380423065979e-05, + "loss": 0.11975333690643311, + "step": 200700 + }, + { + "epoch": 0.8616899787915475, + "grad_norm": 0.03210204094648361, + "learning_rate": 1.3891068702948354e-05, + "loss": 0.12684575319290162, + "step": 200710 + }, + { + "epoch": 0.8617329108815676, + "grad_norm": 2.8229777812957764, + "learning_rate": 1.3886756982830732e-05, + "loss": 0.33409197330474855, + "step": 200720 + }, + { + "epoch": 0.8617758429715875, + "grad_norm": 1.6647875308990479, + "learning_rate": 1.3882445262713106e-05, + "loss": 0.16415317058563234, + "step": 200730 + }, + { + "epoch": 0.8618187750616075, + "grad_norm": 1.3204957246780396, + "learning_rate": 1.3878133542595485e-05, + "loss": 0.17294979095458984, + "step": 200740 + }, + { + "epoch": 0.8618617071516276, + "grad_norm": 0.0013664969010278583, + "learning_rate": 1.3873821822477859e-05, + "loss": 0.1268696069717407, + "step": 200750 + }, + { + "epoch": 0.8619046392416475, + "grad_norm": 0.16938357055187225, + "learning_rate": 1.3869510102360236e-05, + "loss": 0.06001962423324585, + "step": 200760 + }, + { + "epoch": 0.8619475713316676, + "grad_norm": 0.8504765629768372, + "learning_rate": 1.3865198382242612e-05, + "loss": 0.48522701263427737, + "step": 200770 + }, + { + "epoch": 0.8619905034216876, + "grad_norm": 0.0021308199502527714, + "learning_rate": 1.3860886662124989e-05, + "loss": 0.2758405447006226, + "step": 200780 + }, + { + "epoch": 0.8620334355117076, + "grad_norm": 0.004662891384214163, + "learning_rate": 1.3856574942007365e-05, + "loss": 0.3536546230316162, + "step": 200790 + }, + { + "epoch": 0.8620763676017276, + "grad_norm": 5.162191390991211, + "learning_rate": 1.3852263221889742e-05, + "loss": 0.12391103506088257, + "step": 200800 + }, + { + "epoch": 0.8621192996917476, + "grad_norm": 0.003000795841217041, + "learning_rate": 1.384795150177212e-05, + "loss": 0.08238120079040527, + "step": 200810 + }, + { + "epoch": 0.8621622317817677, + "grad_norm": 0.6597919464111328, + "learning_rate": 1.3843639781654493e-05, + "loss": 0.1471768617630005, + "step": 200820 + }, + { + "epoch": 0.8622051638717876, + "grad_norm": 3.253000020980835, + "learning_rate": 1.3839328061536872e-05, + "loss": 0.3409099817276001, + "step": 200830 + }, + { + "epoch": 0.8622480959618076, + "grad_norm": 0.0024019486736506224, + "learning_rate": 1.3835016341419246e-05, + "loss": 0.3075373888015747, + "step": 200840 + }, + { + "epoch": 0.8622910280518277, + "grad_norm": 0.12324788421392441, + "learning_rate": 1.3830704621301623e-05, + "loss": 0.05642509460449219, + "step": 200850 + }, + { + "epoch": 0.8623339601418476, + "grad_norm": 1.4376167058944702, + "learning_rate": 1.3826392901183999e-05, + "loss": 0.11846233606338501, + "step": 200860 + }, + { + "epoch": 0.8623768922318676, + "grad_norm": 0.025254063308238983, + "learning_rate": 1.3822081181066376e-05, + "loss": 0.17070431709289552, + "step": 200870 + }, + { + "epoch": 0.8624198243218877, + "grad_norm": 0.9955533146858215, + "learning_rate": 1.381776946094875e-05, + "loss": 0.34915981292724607, + "step": 200880 + }, + { + "epoch": 0.8624627564119076, + "grad_norm": 0.008249502629041672, + "learning_rate": 1.381345774083113e-05, + "loss": 0.23022923469543458, + "step": 200890 + }, + { + "epoch": 0.8625056885019277, + "grad_norm": 1.0851255655288696, + "learning_rate": 1.3809146020713503e-05, + "loss": 0.2843886375427246, + "step": 200900 + }, + { + "epoch": 0.8625486205919477, + "grad_norm": 2.4486043453216553, + "learning_rate": 1.380483430059588e-05, + "loss": 0.2356062650680542, + "step": 200910 + }, + { + "epoch": 0.8625915526819676, + "grad_norm": 0.03232686221599579, + "learning_rate": 1.3800522580478256e-05, + "loss": 0.13071533441543579, + "step": 200920 + }, + { + "epoch": 0.8626344847719877, + "grad_norm": 1.3295395374298096, + "learning_rate": 1.3796210860360634e-05, + "loss": 0.07537164092063904, + "step": 200930 + }, + { + "epoch": 0.8626774168620077, + "grad_norm": 0.08242210745811462, + "learning_rate": 1.379189914024301e-05, + "loss": 0.10701709985733032, + "step": 200940 + }, + { + "epoch": 0.8627203489520276, + "grad_norm": 0.059901103377342224, + "learning_rate": 1.3787587420125387e-05, + "loss": 0.23902592658996583, + "step": 200950 + }, + { + "epoch": 0.8627632810420477, + "grad_norm": 2.502207040786743, + "learning_rate": 1.378327570000776e-05, + "loss": 0.16078464984893798, + "step": 200960 + }, + { + "epoch": 0.8628062131320677, + "grad_norm": 0.0037575415335595608, + "learning_rate": 1.3778963979890138e-05, + "loss": 0.040537270903587344, + "step": 200970 + }, + { + "epoch": 0.8628491452220877, + "grad_norm": 0.0015732988249510527, + "learning_rate": 1.3774652259772514e-05, + "loss": 0.24941112995147705, + "step": 200980 + }, + { + "epoch": 0.8628920773121077, + "grad_norm": 0.08129372447729111, + "learning_rate": 1.3770340539654891e-05, + "loss": 0.05371713638305664, + "step": 200990 + }, + { + "epoch": 0.8629350094021278, + "grad_norm": 0.0015491340309381485, + "learning_rate": 1.3766028819537267e-05, + "loss": 0.32582085132598876, + "step": 201000 + }, + { + "epoch": 0.8629350094021278, + "eval_loss": 0.3771650195121765, + "eval_runtime": 27.4127, + "eval_samples_per_second": 3.648, + "eval_steps_per_second": 3.648, + "step": 201000 + }, + { + "epoch": 0.8629779414921477, + "grad_norm": 1.424601674079895, + "learning_rate": 1.3761717099419644e-05, + "loss": 0.027511578798294068, + "step": 201010 + }, + { + "epoch": 0.8630208735821677, + "grad_norm": 0.012731004506349564, + "learning_rate": 1.3757405379302018e-05, + "loss": 0.20147929191589356, + "step": 201020 + }, + { + "epoch": 0.8630638056721878, + "grad_norm": 0.1110619604587555, + "learning_rate": 1.3753093659184397e-05, + "loss": 0.026392871141433717, + "step": 201030 + }, + { + "epoch": 0.8631067377622077, + "grad_norm": 1.733137845993042, + "learning_rate": 1.3748781939066771e-05, + "loss": 0.16961859464645385, + "step": 201040 + }, + { + "epoch": 0.8631496698522277, + "grad_norm": 0.0012216472532600164, + "learning_rate": 1.3744470218949148e-05, + "loss": 0.23530104160308837, + "step": 201050 + }, + { + "epoch": 0.8631926019422478, + "grad_norm": 0.03294944763183594, + "learning_rate": 1.3740158498831524e-05, + "loss": 0.08938648104667664, + "step": 201060 + }, + { + "epoch": 0.8632355340322677, + "grad_norm": 6.165449619293213, + "learning_rate": 1.3735846778713901e-05, + "loss": 0.2767988681793213, + "step": 201070 + }, + { + "epoch": 0.8632784661222878, + "grad_norm": 0.006369173992425203, + "learning_rate": 1.3731535058596275e-05, + "loss": 0.22666561603546143, + "step": 201080 + }, + { + "epoch": 0.8633213982123078, + "grad_norm": 123.1529312133789, + "learning_rate": 1.3727223338478654e-05, + "loss": 0.2879739046096802, + "step": 201090 + }, + { + "epoch": 0.8633643303023277, + "grad_norm": 1.1861544847488403, + "learning_rate": 1.3722911618361028e-05, + "loss": 0.29244720935821533, + "step": 201100 + }, + { + "epoch": 0.8634072623923478, + "grad_norm": 0.008288329467177391, + "learning_rate": 1.3718599898243406e-05, + "loss": 0.2790151834487915, + "step": 201110 + }, + { + "epoch": 0.8634501944823678, + "grad_norm": 0.02679705061018467, + "learning_rate": 1.3714288178125781e-05, + "loss": 0.2626842975616455, + "step": 201120 + }, + { + "epoch": 0.8634931265723877, + "grad_norm": 0.5207473039627075, + "learning_rate": 1.3709976458008159e-05, + "loss": 0.24442975521087645, + "step": 201130 + }, + { + "epoch": 0.8635360586624078, + "grad_norm": 0.0031045994255691767, + "learning_rate": 1.3705664737890534e-05, + "loss": 0.2642416000366211, + "step": 201140 + }, + { + "epoch": 0.8635789907524278, + "grad_norm": 0.01354249194264412, + "learning_rate": 1.3701353017772912e-05, + "loss": 0.023589606583118438, + "step": 201150 + }, + { + "epoch": 0.8636219228424478, + "grad_norm": 0.0018465573666617274, + "learning_rate": 1.3697041297655286e-05, + "loss": 0.2037327527999878, + "step": 201160 + }, + { + "epoch": 0.8636648549324678, + "grad_norm": 0.01761949621140957, + "learning_rate": 1.3692729577537663e-05, + "loss": 0.27513961791992186, + "step": 201170 + }, + { + "epoch": 0.8637077870224878, + "grad_norm": 0.001405532006174326, + "learning_rate": 1.3688417857420042e-05, + "loss": 0.16688908338546754, + "step": 201180 + }, + { + "epoch": 0.8637507191125078, + "grad_norm": 0.0007753579411655664, + "learning_rate": 1.3684106137302416e-05, + "loss": 0.23070762157440186, + "step": 201190 + }, + { + "epoch": 0.8637936512025278, + "grad_norm": 0.2772015929222107, + "learning_rate": 1.3679794417184793e-05, + "loss": 0.04422442317008972, + "step": 201200 + }, + { + "epoch": 0.8638365832925479, + "grad_norm": 0.03911440819501877, + "learning_rate": 1.3675482697067169e-05, + "loss": 0.06600427627563477, + "step": 201210 + }, + { + "epoch": 0.8638795153825679, + "grad_norm": 0.004431838635355234, + "learning_rate": 1.3671170976949546e-05, + "loss": 0.20836050510406495, + "step": 201220 + }, + { + "epoch": 0.8639224474725878, + "grad_norm": 0.0669967457652092, + "learning_rate": 1.366685925683192e-05, + "loss": 0.09750092029571533, + "step": 201230 + }, + { + "epoch": 0.8639653795626079, + "grad_norm": 2.275888681411743, + "learning_rate": 1.36625475367143e-05, + "loss": 0.33345661163330076, + "step": 201240 + }, + { + "epoch": 0.8640083116526279, + "grad_norm": 1.3841379880905151, + "learning_rate": 1.3658235816596673e-05, + "loss": 0.2437295913696289, + "step": 201250 + }, + { + "epoch": 0.8640512437426479, + "grad_norm": 0.0015755087370052934, + "learning_rate": 1.365392409647905e-05, + "loss": 0.34356591701507566, + "step": 201260 + }, + { + "epoch": 0.8640941758326679, + "grad_norm": 8.029579162597656, + "learning_rate": 1.3649612376361426e-05, + "loss": 0.13912639617919922, + "step": 201270 + }, + { + "epoch": 0.8641371079226879, + "grad_norm": 0.06813203543424606, + "learning_rate": 1.3645300656243803e-05, + "loss": 0.1133497714996338, + "step": 201280 + }, + { + "epoch": 0.8641800400127079, + "grad_norm": 7.3910980224609375, + "learning_rate": 1.3640988936126179e-05, + "loss": 0.22454369068145752, + "step": 201290 + }, + { + "epoch": 0.8642229721027279, + "grad_norm": 8.968598365783691, + "learning_rate": 1.3636677216008556e-05, + "loss": 0.2147754192352295, + "step": 201300 + }, + { + "epoch": 0.864265904192748, + "grad_norm": 0.0024705410469323397, + "learning_rate": 1.363236549589093e-05, + "loss": 0.3113959789276123, + "step": 201310 + }, + { + "epoch": 0.8643088362827679, + "grad_norm": 1.973299503326416, + "learning_rate": 1.3628053775773308e-05, + "loss": 0.23185880184173585, + "step": 201320 + }, + { + "epoch": 0.8643517683727879, + "grad_norm": 0.5137335658073425, + "learning_rate": 1.3623742055655683e-05, + "loss": 0.2932513475418091, + "step": 201330 + }, + { + "epoch": 0.864394700462808, + "grad_norm": 0.0013410047395154834, + "learning_rate": 1.361943033553806e-05, + "loss": 0.05563706159591675, + "step": 201340 + }, + { + "epoch": 0.8644376325528279, + "grad_norm": 2.642397403717041, + "learning_rate": 1.3615118615420436e-05, + "loss": 0.22566254138946534, + "step": 201350 + }, + { + "epoch": 0.864480564642848, + "grad_norm": 0.027849914506077766, + "learning_rate": 1.3610806895302814e-05, + "loss": 0.12023868560791015, + "step": 201360 + }, + { + "epoch": 0.864523496732868, + "grad_norm": 0.0002782710362225771, + "learning_rate": 1.3606495175185188e-05, + "loss": 0.40762104988098147, + "step": 201370 + }, + { + "epoch": 0.8645664288228879, + "grad_norm": 0.00015609625552315265, + "learning_rate": 1.3602183455067565e-05, + "loss": 0.3876370906829834, + "step": 201380 + }, + { + "epoch": 0.864609360912908, + "grad_norm": 8.169439315795898, + "learning_rate": 1.359787173494994e-05, + "loss": 0.14945834875106812, + "step": 201390 + }, + { + "epoch": 0.864652293002928, + "grad_norm": 0.005248130764812231, + "learning_rate": 1.3593560014832318e-05, + "loss": 0.04767285883426666, + "step": 201400 + }, + { + "epoch": 0.8646952250929479, + "grad_norm": 2.542904853820801, + "learning_rate": 1.3589248294714694e-05, + "loss": 0.10348002910614014, + "step": 201410 + }, + { + "epoch": 0.864738157182968, + "grad_norm": 0.03276196867227554, + "learning_rate": 1.3584936574597071e-05, + "loss": 0.11932778358459473, + "step": 201420 + }, + { + "epoch": 0.864781089272988, + "grad_norm": 3.358114719390869, + "learning_rate": 1.3580624854479445e-05, + "loss": 0.08350310325622559, + "step": 201430 + }, + { + "epoch": 0.864824021363008, + "grad_norm": 1.9119728803634644, + "learning_rate": 1.3576313134361824e-05, + "loss": 0.13522932529449463, + "step": 201440 + }, + { + "epoch": 0.864866953453028, + "grad_norm": 1.0199462175369263, + "learning_rate": 1.3572001414244198e-05, + "loss": 0.38537781238555907, + "step": 201450 + }, + { + "epoch": 0.864909885543048, + "grad_norm": 0.4397680461406708, + "learning_rate": 1.3567689694126575e-05, + "loss": 0.20663084983825683, + "step": 201460 + }, + { + "epoch": 0.864952817633068, + "grad_norm": 0.4000920355319977, + "learning_rate": 1.3563377974008951e-05, + "loss": 0.16495121717453004, + "step": 201470 + }, + { + "epoch": 0.864995749723088, + "grad_norm": 0.001823501312173903, + "learning_rate": 1.3559066253891328e-05, + "loss": 0.28188743591308596, + "step": 201480 + }, + { + "epoch": 0.8650386818131081, + "grad_norm": 0.06819120794534683, + "learning_rate": 1.3554754533773702e-05, + "loss": 0.12166712284088135, + "step": 201490 + }, + { + "epoch": 0.865081613903128, + "grad_norm": 0.011608834378421307, + "learning_rate": 1.3550442813656081e-05, + "loss": 0.13515074253082277, + "step": 201500 + }, + { + "epoch": 0.865124545993148, + "grad_norm": 6.033400535583496, + "learning_rate": 1.3546131093538455e-05, + "loss": 0.3407582759857178, + "step": 201510 + }, + { + "epoch": 0.8651674780831681, + "grad_norm": 2.359403610229492, + "learning_rate": 1.3541819373420833e-05, + "loss": 0.1246711015701294, + "step": 201520 + }, + { + "epoch": 0.865210410173188, + "grad_norm": 5.6313276290893555, + "learning_rate": 1.3537507653303212e-05, + "loss": 0.06634917855262756, + "step": 201530 + }, + { + "epoch": 0.865253342263208, + "grad_norm": 0.011722094379365444, + "learning_rate": 1.3533195933185586e-05, + "loss": 0.08681342005729675, + "step": 201540 + }, + { + "epoch": 0.8652962743532281, + "grad_norm": 0.0036464305594563484, + "learning_rate": 1.3528884213067963e-05, + "loss": 0.0781008780002594, + "step": 201550 + }, + { + "epoch": 0.865339206443248, + "grad_norm": 0.17769843339920044, + "learning_rate": 1.3524572492950339e-05, + "loss": 0.20671448707580567, + "step": 201560 + }, + { + "epoch": 0.8653821385332681, + "grad_norm": 2.0480868816375732, + "learning_rate": 1.3520260772832716e-05, + "loss": 0.1721822738647461, + "step": 201570 + }, + { + "epoch": 0.8654250706232881, + "grad_norm": 0.23706549406051636, + "learning_rate": 1.351594905271509e-05, + "loss": 0.22920043468475343, + "step": 201580 + }, + { + "epoch": 0.865468002713308, + "grad_norm": 0.0014394361060112715, + "learning_rate": 1.3511637332597469e-05, + "loss": 0.16023410558700563, + "step": 201590 + }, + { + "epoch": 0.8655109348033281, + "grad_norm": 1.2048903703689575, + "learning_rate": 1.3507325612479843e-05, + "loss": 0.226576828956604, + "step": 201600 + }, + { + "epoch": 0.8655538668933481, + "grad_norm": 0.035271406173706055, + "learning_rate": 1.350301389236222e-05, + "loss": 0.1346900224685669, + "step": 201610 + }, + { + "epoch": 0.865596798983368, + "grad_norm": 0.0051587289199233055, + "learning_rate": 1.3498702172244596e-05, + "loss": 0.3614091157913208, + "step": 201620 + }, + { + "epoch": 0.8656397310733881, + "grad_norm": 0.08925247192382812, + "learning_rate": 1.3494390452126973e-05, + "loss": 0.3365633964538574, + "step": 201630 + }, + { + "epoch": 0.8656826631634081, + "grad_norm": 0.009239203296601772, + "learning_rate": 1.3490078732009349e-05, + "loss": 0.1582499384880066, + "step": 201640 + }, + { + "epoch": 0.8657255952534282, + "grad_norm": 0.5474352836608887, + "learning_rate": 1.3485767011891726e-05, + "loss": 0.06596941947937011, + "step": 201650 + }, + { + "epoch": 0.8657685273434481, + "grad_norm": 0.7350939512252808, + "learning_rate": 1.34814552917741e-05, + "loss": 0.290863037109375, + "step": 201660 + }, + { + "epoch": 0.8658114594334682, + "grad_norm": 0.005053548142313957, + "learning_rate": 1.3477143571656478e-05, + "loss": 0.12502760887145997, + "step": 201670 + }, + { + "epoch": 0.8658543915234882, + "grad_norm": 3.777261734008789, + "learning_rate": 1.3472831851538853e-05, + "loss": 0.07586979866027832, + "step": 201680 + }, + { + "epoch": 0.8658973236135081, + "grad_norm": 2.031341791152954, + "learning_rate": 1.346852013142123e-05, + "loss": 0.24181299209594725, + "step": 201690 + }, + { + "epoch": 0.8659402557035282, + "grad_norm": 5.771152973175049, + "learning_rate": 1.3464208411303606e-05, + "loss": 0.08921363353729247, + "step": 201700 + }, + { + "epoch": 0.8659831877935482, + "grad_norm": 0.04179826006293297, + "learning_rate": 1.3459896691185983e-05, + "loss": 0.327194881439209, + "step": 201710 + }, + { + "epoch": 0.8660261198835681, + "grad_norm": 3.3760650157928467, + "learning_rate": 1.3455584971068357e-05, + "loss": 0.27189462184906005, + "step": 201720 + }, + { + "epoch": 0.8660690519735882, + "grad_norm": 0.0028545090463012457, + "learning_rate": 1.3451273250950735e-05, + "loss": 0.2069005250930786, + "step": 201730 + }, + { + "epoch": 0.8661119840636082, + "grad_norm": 2.0396742820739746, + "learning_rate": 1.344696153083311e-05, + "loss": 0.2259754419326782, + "step": 201740 + }, + { + "epoch": 0.8661549161536282, + "grad_norm": 0.8559262156486511, + "learning_rate": 1.3442649810715488e-05, + "loss": 0.34209017753601073, + "step": 201750 + }, + { + "epoch": 0.8661978482436482, + "grad_norm": 0.007572118658572435, + "learning_rate": 1.3438338090597863e-05, + "loss": 0.2585746765136719, + "step": 201760 + }, + { + "epoch": 0.8662407803336682, + "grad_norm": 0.5755453705787659, + "learning_rate": 1.343402637048024e-05, + "loss": 0.07282045483589172, + "step": 201770 + }, + { + "epoch": 0.8662837124236882, + "grad_norm": 0.0012946148635819554, + "learning_rate": 1.3429714650362615e-05, + "loss": 0.2800379753112793, + "step": 201780 + }, + { + "epoch": 0.8663266445137082, + "grad_norm": 0.019003266468644142, + "learning_rate": 1.3425402930244994e-05, + "loss": 0.2469172716140747, + "step": 201790 + }, + { + "epoch": 0.8663695766037283, + "grad_norm": 0.0037993162404745817, + "learning_rate": 1.3421091210127368e-05, + "loss": 0.20482521057128905, + "step": 201800 + }, + { + "epoch": 0.8664125086937482, + "grad_norm": 0.042166516184806824, + "learning_rate": 1.3416779490009745e-05, + "loss": 0.15784236192703247, + "step": 201810 + }, + { + "epoch": 0.8664554407837682, + "grad_norm": 4.952692985534668, + "learning_rate": 1.341246776989212e-05, + "loss": 0.10906834602355957, + "step": 201820 + }, + { + "epoch": 0.8664983728737883, + "grad_norm": 0.1707507222890854, + "learning_rate": 1.3408156049774498e-05, + "loss": 0.10929840803146362, + "step": 201830 + }, + { + "epoch": 0.8665413049638082, + "grad_norm": 2.6976828575134277, + "learning_rate": 1.3403844329656872e-05, + "loss": 0.10269479751586914, + "step": 201840 + }, + { + "epoch": 0.8665842370538283, + "grad_norm": 2.0937705039978027, + "learning_rate": 1.3399532609539251e-05, + "loss": 0.18717145919799805, + "step": 201850 + }, + { + "epoch": 0.8666271691438483, + "grad_norm": 1.711639642715454, + "learning_rate": 1.3395220889421625e-05, + "loss": 0.22680439949035644, + "step": 201860 + }, + { + "epoch": 0.8666701012338682, + "grad_norm": 1.208844780921936, + "learning_rate": 1.3390909169304002e-05, + "loss": 0.2210688829421997, + "step": 201870 + }, + { + "epoch": 0.8667130333238883, + "grad_norm": 0.007113473489880562, + "learning_rate": 1.3386597449186378e-05, + "loss": 0.19343369007110595, + "step": 201880 + }, + { + "epoch": 0.8667559654139083, + "grad_norm": 0.32166412472724915, + "learning_rate": 1.3382285729068755e-05, + "loss": 0.2312183380126953, + "step": 201890 + }, + { + "epoch": 0.8667988975039282, + "grad_norm": 0.40931782126426697, + "learning_rate": 1.3377974008951133e-05, + "loss": 0.24105579853057862, + "step": 201900 + }, + { + "epoch": 0.8668418295939483, + "grad_norm": 0.032758649438619614, + "learning_rate": 1.3373662288833508e-05, + "loss": 0.07932702898979187, + "step": 201910 + }, + { + "epoch": 0.8668847616839683, + "grad_norm": 0.058970626443624496, + "learning_rate": 1.3369350568715886e-05, + "loss": 0.2714974403381348, + "step": 201920 + }, + { + "epoch": 0.8669276937739883, + "grad_norm": 0.2553642988204956, + "learning_rate": 1.336503884859826e-05, + "loss": 0.1437745451927185, + "step": 201930 + }, + { + "epoch": 0.8669706258640083, + "grad_norm": 0.010126570239663124, + "learning_rate": 1.3360727128480639e-05, + "loss": 0.1601130485534668, + "step": 201940 + }, + { + "epoch": 0.8670135579540283, + "grad_norm": 0.04212572053074837, + "learning_rate": 1.3356415408363013e-05, + "loss": 0.34910471439361573, + "step": 201950 + }, + { + "epoch": 0.8670564900440483, + "grad_norm": 0.01635109633207321, + "learning_rate": 1.335210368824539e-05, + "loss": 0.04124734103679657, + "step": 201960 + }, + { + "epoch": 0.8670994221340683, + "grad_norm": 0.22834289073944092, + "learning_rate": 1.3347791968127766e-05, + "loss": 0.09849913120269775, + "step": 201970 + }, + { + "epoch": 0.8671423542240884, + "grad_norm": 3.879984140396118, + "learning_rate": 1.3343480248010143e-05, + "loss": 0.08664489984512329, + "step": 201980 + }, + { + "epoch": 0.8671852863141083, + "grad_norm": 3.0354487895965576, + "learning_rate": 1.3339168527892517e-05, + "loss": 0.2665108680725098, + "step": 201990 + }, + { + "epoch": 0.8672282184041283, + "grad_norm": 0.0027560857124626637, + "learning_rate": 1.3334856807774896e-05, + "loss": 0.14189740419387817, + "step": 202000 + }, + { + "epoch": 0.8672282184041283, + "eval_loss": 0.38031283020973206, + "eval_runtime": 27.421, + "eval_samples_per_second": 3.647, + "eval_steps_per_second": 3.647, + "step": 202000 + }, + { + "epoch": 0.8672711504941484, + "grad_norm": 0.028320224955677986, + "learning_rate": 1.333054508765727e-05, + "loss": 0.23645930290222167, + "step": 202010 + }, + { + "epoch": 0.8673140825841683, + "grad_norm": 2.802095651626587, + "learning_rate": 1.3326233367539647e-05, + "loss": 0.34255218505859375, + "step": 202020 + }, + { + "epoch": 0.8673570146741884, + "grad_norm": 5.453604221343994, + "learning_rate": 1.3321921647422023e-05, + "loss": 0.20399942398071289, + "step": 202030 + }, + { + "epoch": 0.8673999467642084, + "grad_norm": 0.44003716111183167, + "learning_rate": 1.33176099273044e-05, + "loss": 0.2527095079421997, + "step": 202040 + }, + { + "epoch": 0.8674428788542283, + "grad_norm": 0.10289528965950012, + "learning_rate": 1.3313298207186776e-05, + "loss": 0.06380432844161987, + "step": 202050 + }, + { + "epoch": 0.8674858109442484, + "grad_norm": 1.4219003915786743, + "learning_rate": 1.3308986487069153e-05, + "loss": 0.22886343002319337, + "step": 202060 + }, + { + "epoch": 0.8675287430342684, + "grad_norm": 0.4398294687271118, + "learning_rate": 1.3304674766951527e-05, + "loss": 0.1260904312133789, + "step": 202070 + }, + { + "epoch": 0.8675716751242885, + "grad_norm": 0.013393580913543701, + "learning_rate": 1.3300363046833905e-05, + "loss": 0.366540789604187, + "step": 202080 + }, + { + "epoch": 0.8676146072143084, + "grad_norm": 0.033431414514780045, + "learning_rate": 1.329605132671628e-05, + "loss": 0.34116013050079347, + "step": 202090 + }, + { + "epoch": 0.8676575393043284, + "grad_norm": 0.003720491658896208, + "learning_rate": 1.3291739606598658e-05, + "loss": 0.1548625946044922, + "step": 202100 + }, + { + "epoch": 0.8677004713943485, + "grad_norm": 1.0757026672363281, + "learning_rate": 1.3287427886481033e-05, + "loss": 0.43051700592041015, + "step": 202110 + }, + { + "epoch": 0.8677434034843684, + "grad_norm": 7.948141574859619, + "learning_rate": 1.328311616636341e-05, + "loss": 0.2517849445343018, + "step": 202120 + }, + { + "epoch": 0.8677863355743884, + "grad_norm": 7.49459171295166, + "learning_rate": 1.3278804446245784e-05, + "loss": 0.18098866939544678, + "step": 202130 + }, + { + "epoch": 0.8678292676644085, + "grad_norm": 2.5668296813964844, + "learning_rate": 1.3274492726128162e-05, + "loss": 0.2603480577468872, + "step": 202140 + }, + { + "epoch": 0.8678721997544284, + "grad_norm": 0.0003951344988308847, + "learning_rate": 1.3270181006010537e-05, + "loss": 0.15042036771774292, + "step": 202150 + }, + { + "epoch": 0.8679151318444485, + "grad_norm": 2.0018789768218994, + "learning_rate": 1.3265869285892915e-05, + "loss": 0.2826423406600952, + "step": 202160 + }, + { + "epoch": 0.8679580639344685, + "grad_norm": 0.21826449036598206, + "learning_rate": 1.326155756577529e-05, + "loss": 0.17833973169326783, + "step": 202170 + }, + { + "epoch": 0.8680009960244884, + "grad_norm": 0.00463878596201539, + "learning_rate": 1.3257245845657668e-05, + "loss": 0.16179614067077636, + "step": 202180 + }, + { + "epoch": 0.8680439281145085, + "grad_norm": 4.259723663330078, + "learning_rate": 1.3252934125540042e-05, + "loss": 0.19961766004562378, + "step": 202190 + }, + { + "epoch": 0.8680868602045285, + "grad_norm": 3.2912721633911133, + "learning_rate": 1.324862240542242e-05, + "loss": 0.24763474464416504, + "step": 202200 + }, + { + "epoch": 0.8681297922945485, + "grad_norm": 1.176949143409729, + "learning_rate": 1.3244310685304795e-05, + "loss": 0.19105069637298583, + "step": 202210 + }, + { + "epoch": 0.8681727243845685, + "grad_norm": 0.02803204394876957, + "learning_rate": 1.3239998965187172e-05, + "loss": 0.31554696559906004, + "step": 202220 + }, + { + "epoch": 0.8682156564745885, + "grad_norm": 3.055330991744995, + "learning_rate": 1.3235687245069548e-05, + "loss": 0.1737144708633423, + "step": 202230 + }, + { + "epoch": 0.8682585885646085, + "grad_norm": 0.016064148396253586, + "learning_rate": 1.3231375524951925e-05, + "loss": 0.1376855969429016, + "step": 202240 + }, + { + "epoch": 0.8683015206546285, + "grad_norm": 0.05431525409221649, + "learning_rate": 1.3227063804834299e-05, + "loss": 0.19808392524719237, + "step": 202250 + }, + { + "epoch": 0.8683444527446486, + "grad_norm": 0.003191595897078514, + "learning_rate": 1.3222752084716678e-05, + "loss": 0.12034578323364258, + "step": 202260 + }, + { + "epoch": 0.8683873848346685, + "grad_norm": 0.004532682243734598, + "learning_rate": 1.3218440364599055e-05, + "loss": 0.10594029426574707, + "step": 202270 + }, + { + "epoch": 0.8684303169246885, + "grad_norm": 0.002449595369398594, + "learning_rate": 1.321412864448143e-05, + "loss": 0.29216201305389405, + "step": 202280 + }, + { + "epoch": 0.8684732490147086, + "grad_norm": 0.0005997202824801207, + "learning_rate": 1.3209816924363808e-05, + "loss": 0.3863027095794678, + "step": 202290 + }, + { + "epoch": 0.8685161811047285, + "grad_norm": 3.148550271987915, + "learning_rate": 1.3205505204246182e-05, + "loss": 0.11254967451095581, + "step": 202300 + }, + { + "epoch": 0.8685591131947485, + "grad_norm": 2.766737461090088, + "learning_rate": 1.320119348412856e-05, + "loss": 0.12313051223754883, + "step": 202310 + }, + { + "epoch": 0.8686020452847686, + "grad_norm": 1.5454360246658325, + "learning_rate": 1.3196881764010935e-05, + "loss": 0.1295259952545166, + "step": 202320 + }, + { + "epoch": 0.8686449773747885, + "grad_norm": 6.207314491271973, + "learning_rate": 1.3192570043893313e-05, + "loss": 0.17339755296707154, + "step": 202330 + }, + { + "epoch": 0.8686879094648086, + "grad_norm": 1.4427553415298462, + "learning_rate": 1.3188258323775687e-05, + "loss": 0.18795595169067383, + "step": 202340 + }, + { + "epoch": 0.8687308415548286, + "grad_norm": 0.0015251105651259422, + "learning_rate": 1.3183946603658066e-05, + "loss": 0.06718888878822327, + "step": 202350 + }, + { + "epoch": 0.8687737736448485, + "grad_norm": 1.752411961555481, + "learning_rate": 1.317963488354044e-05, + "loss": 0.2728855848312378, + "step": 202360 + }, + { + "epoch": 0.8688167057348686, + "grad_norm": 0.49718743562698364, + "learning_rate": 1.3175323163422817e-05, + "loss": 0.18400404453277588, + "step": 202370 + }, + { + "epoch": 0.8688596378248886, + "grad_norm": 0.0021883861627429724, + "learning_rate": 1.3171011443305193e-05, + "loss": 0.06978545188903809, + "step": 202380 + }, + { + "epoch": 0.8689025699149086, + "grad_norm": 0.030992716550827026, + "learning_rate": 1.316669972318757e-05, + "loss": 0.1565848708152771, + "step": 202390 + }, + { + "epoch": 0.8689455020049286, + "grad_norm": 0.025812238454818726, + "learning_rate": 1.3162388003069946e-05, + "loss": 0.08083627223968506, + "step": 202400 + }, + { + "epoch": 0.8689884340949486, + "grad_norm": 0.0011363154044374824, + "learning_rate": 1.3158076282952323e-05, + "loss": 0.15595753192901612, + "step": 202410 + }, + { + "epoch": 0.8690313661849686, + "grad_norm": 2.2198967933654785, + "learning_rate": 1.3153764562834697e-05, + "loss": 0.23197317123413086, + "step": 202420 + }, + { + "epoch": 0.8690742982749886, + "grad_norm": 0.9877627491950989, + "learning_rate": 1.3149452842717074e-05, + "loss": 0.13361701965332032, + "step": 202430 + }, + { + "epoch": 0.8691172303650087, + "grad_norm": 0.1605759710073471, + "learning_rate": 1.314514112259945e-05, + "loss": 0.13915138244628905, + "step": 202440 + }, + { + "epoch": 0.8691601624550286, + "grad_norm": 0.05874808132648468, + "learning_rate": 1.3140829402481827e-05, + "loss": 0.5318547248840332, + "step": 202450 + }, + { + "epoch": 0.8692030945450486, + "grad_norm": 1.9270455837249756, + "learning_rate": 1.3136517682364203e-05, + "loss": 0.2182586669921875, + "step": 202460 + }, + { + "epoch": 0.8692460266350687, + "grad_norm": 0.002081893617287278, + "learning_rate": 1.313220596224658e-05, + "loss": 0.1298648715019226, + "step": 202470 + }, + { + "epoch": 0.8692889587250886, + "grad_norm": 0.01326664723455906, + "learning_rate": 1.3127894242128954e-05, + "loss": 0.18864309787750244, + "step": 202480 + }, + { + "epoch": 0.8693318908151086, + "grad_norm": 0.02235686592757702, + "learning_rate": 1.3123582522011332e-05, + "loss": 0.16074566841125487, + "step": 202490 + }, + { + "epoch": 0.8693748229051287, + "grad_norm": 0.0038623339496552944, + "learning_rate": 1.3119270801893707e-05, + "loss": 0.16679761409759522, + "step": 202500 + }, + { + "epoch": 0.8694177549951487, + "grad_norm": 0.008476986549794674, + "learning_rate": 1.3114959081776085e-05, + "loss": 0.10375416278839111, + "step": 202510 + }, + { + "epoch": 0.8694606870851687, + "grad_norm": 0.03022216260433197, + "learning_rate": 1.311064736165846e-05, + "loss": 0.11786410808563233, + "step": 202520 + }, + { + "epoch": 0.8695036191751887, + "grad_norm": 2.8388748168945312, + "learning_rate": 1.3106335641540838e-05, + "loss": 0.26753456592559816, + "step": 202530 + }, + { + "epoch": 0.8695465512652087, + "grad_norm": 2.872877836227417, + "learning_rate": 1.3102023921423211e-05, + "loss": 0.2310659408569336, + "step": 202540 + }, + { + "epoch": 0.8695894833552287, + "grad_norm": 9.196786880493164, + "learning_rate": 1.309771220130559e-05, + "loss": 0.21355884075164794, + "step": 202550 + }, + { + "epoch": 0.8696324154452487, + "grad_norm": 25.724925994873047, + "learning_rate": 1.3093400481187964e-05, + "loss": 0.15160589218139647, + "step": 202560 + }, + { + "epoch": 0.8696753475352688, + "grad_norm": 0.0017343858489766717, + "learning_rate": 1.3089088761070342e-05, + "loss": 0.38728699684143064, + "step": 202570 + }, + { + "epoch": 0.8697182796252887, + "grad_norm": 1.2419610023498535, + "learning_rate": 1.3084777040952717e-05, + "loss": 0.3935434818267822, + "step": 202580 + }, + { + "epoch": 0.8697612117153087, + "grad_norm": 1.0418486595153809, + "learning_rate": 1.3080465320835095e-05, + "loss": 0.1654996395111084, + "step": 202590 + }, + { + "epoch": 0.8698041438053288, + "grad_norm": 0.12601947784423828, + "learning_rate": 1.3076153600717469e-05, + "loss": 0.13802404403686525, + "step": 202600 + }, + { + "epoch": 0.8698470758953487, + "grad_norm": 0.09665412455797195, + "learning_rate": 1.3071841880599848e-05, + "loss": 0.08990532755851746, + "step": 202610 + }, + { + "epoch": 0.8698900079853688, + "grad_norm": 0.5923460125923157, + "learning_rate": 1.3067530160482225e-05, + "loss": 0.08348230719566345, + "step": 202620 + }, + { + "epoch": 0.8699329400753888, + "grad_norm": 3.084900140762329, + "learning_rate": 1.3063218440364599e-05, + "loss": 0.16990399360656738, + "step": 202630 + }, + { + "epoch": 0.8699758721654087, + "grad_norm": 0.005899806506931782, + "learning_rate": 1.3058906720246976e-05, + "loss": 0.18719682693481446, + "step": 202640 + }, + { + "epoch": 0.8700188042554288, + "grad_norm": 3.733660936355591, + "learning_rate": 1.3054595000129352e-05, + "loss": 0.10188864469528199, + "step": 202650 + }, + { + "epoch": 0.8700617363454488, + "grad_norm": 0.015237463638186455, + "learning_rate": 1.305028328001173e-05, + "loss": 0.10797481536865235, + "step": 202660 + }, + { + "epoch": 0.8701046684354687, + "grad_norm": 0.6044527292251587, + "learning_rate": 1.3045971559894105e-05, + "loss": 0.19505347013473512, + "step": 202670 + }, + { + "epoch": 0.8701476005254888, + "grad_norm": 0.004703771322965622, + "learning_rate": 1.3041659839776482e-05, + "loss": 0.0875515103340149, + "step": 202680 + }, + { + "epoch": 0.8701905326155088, + "grad_norm": 0.9704412221908569, + "learning_rate": 1.3037348119658856e-05, + "loss": 0.4309795379638672, + "step": 202690 + }, + { + "epoch": 0.8702334647055288, + "grad_norm": 0.047677453607320786, + "learning_rate": 1.3033036399541235e-05, + "loss": 0.04325348734855652, + "step": 202700 + }, + { + "epoch": 0.8702763967955488, + "grad_norm": 4.24571418762207, + "learning_rate": 1.302872467942361e-05, + "loss": 0.10959751605987549, + "step": 202710 + }, + { + "epoch": 0.8703193288855688, + "grad_norm": 2.259910821914673, + "learning_rate": 1.3024412959305987e-05, + "loss": 0.13984992504119872, + "step": 202720 + }, + { + "epoch": 0.8703622609755888, + "grad_norm": 0.022040946409106255, + "learning_rate": 1.3020101239188362e-05, + "loss": 0.12837090492248535, + "step": 202730 + }, + { + "epoch": 0.8704051930656088, + "grad_norm": 0.005013573449105024, + "learning_rate": 1.301578951907074e-05, + "loss": 0.02592419385910034, + "step": 202740 + }, + { + "epoch": 0.8704481251556289, + "grad_norm": 0.007160715758800507, + "learning_rate": 1.3011477798953114e-05, + "loss": 0.03959584832191467, + "step": 202750 + }, + { + "epoch": 0.8704910572456488, + "grad_norm": 0.22778935730457306, + "learning_rate": 1.3007166078835493e-05, + "loss": 0.14829691648483276, + "step": 202760 + }, + { + "epoch": 0.8705339893356688, + "grad_norm": 0.0009955308632925153, + "learning_rate": 1.3002854358717867e-05, + "loss": 0.19827834367752076, + "step": 202770 + }, + { + "epoch": 0.8705769214256889, + "grad_norm": 0.011871743947267532, + "learning_rate": 1.2998542638600244e-05, + "loss": 0.18226596117019653, + "step": 202780 + }, + { + "epoch": 0.8706198535157088, + "grad_norm": 0.030095193535089493, + "learning_rate": 1.299423091848262e-05, + "loss": 0.11577250957489013, + "step": 202790 + }, + { + "epoch": 0.8706627856057289, + "grad_norm": 1.2848188877105713, + "learning_rate": 1.2989919198364997e-05, + "loss": 0.29182541370391846, + "step": 202800 + }, + { + "epoch": 0.8707057176957489, + "grad_norm": 0.03173932060599327, + "learning_rate": 1.2985607478247373e-05, + "loss": 0.08262947797775269, + "step": 202810 + }, + { + "epoch": 0.8707486497857688, + "grad_norm": 3.3625714778900146, + "learning_rate": 1.298129575812975e-05, + "loss": 0.36866700649261475, + "step": 202820 + }, + { + "epoch": 0.8707915818757889, + "grad_norm": 0.07918155938386917, + "learning_rate": 1.2976984038012124e-05, + "loss": 0.21780040264129638, + "step": 202830 + }, + { + "epoch": 0.8708345139658089, + "grad_norm": 2.23683500289917, + "learning_rate": 1.2972672317894501e-05, + "loss": 0.05871073007583618, + "step": 202840 + }, + { + "epoch": 0.8708774460558288, + "grad_norm": 1.5801154375076294, + "learning_rate": 1.2968360597776877e-05, + "loss": 0.1820102572441101, + "step": 202850 + }, + { + "epoch": 0.8709203781458489, + "grad_norm": 0.0043759336695075035, + "learning_rate": 1.2964048877659254e-05, + "loss": 0.2038339853286743, + "step": 202860 + }, + { + "epoch": 0.8709633102358689, + "grad_norm": 0.19102871417999268, + "learning_rate": 1.295973715754163e-05, + "loss": 0.3390310525894165, + "step": 202870 + }, + { + "epoch": 0.8710062423258889, + "grad_norm": 1.966539978981018, + "learning_rate": 1.2955425437424007e-05, + "loss": 0.11660757064819335, + "step": 202880 + }, + { + "epoch": 0.8710491744159089, + "grad_norm": 0.1771833449602127, + "learning_rate": 1.2951113717306381e-05, + "loss": 0.1814139485359192, + "step": 202890 + }, + { + "epoch": 0.8710921065059289, + "grad_norm": 0.0787118449807167, + "learning_rate": 1.294680199718876e-05, + "loss": 0.37035412788391114, + "step": 202900 + }, + { + "epoch": 0.8711350385959489, + "grad_norm": 0.6835107803344727, + "learning_rate": 1.2942490277071134e-05, + "loss": 0.13978594541549683, + "step": 202910 + }, + { + "epoch": 0.8711779706859689, + "grad_norm": 9.422897338867188, + "learning_rate": 1.2938178556953512e-05, + "loss": 0.27941164970397947, + "step": 202920 + }, + { + "epoch": 0.871220902775989, + "grad_norm": 0.5118245482444763, + "learning_rate": 1.2933866836835887e-05, + "loss": 0.12363041639328003, + "step": 202930 + }, + { + "epoch": 0.871263834866009, + "grad_norm": 2.3696482181549072, + "learning_rate": 1.2929555116718265e-05, + "loss": 0.38212246894836427, + "step": 202940 + }, + { + "epoch": 0.8713067669560289, + "grad_norm": 0.0280147772282362, + "learning_rate": 1.2925243396600638e-05, + "loss": 0.09542213678359986, + "step": 202950 + }, + { + "epoch": 0.871349699046049, + "grad_norm": 2.087315082550049, + "learning_rate": 1.2920931676483018e-05, + "loss": 0.2335756540298462, + "step": 202960 + }, + { + "epoch": 0.871392631136069, + "grad_norm": 3.394486904144287, + "learning_rate": 1.2916619956365391e-05, + "loss": 0.2918868541717529, + "step": 202970 + }, + { + "epoch": 0.871435563226089, + "grad_norm": 0.46176642179489136, + "learning_rate": 1.2912308236247769e-05, + "loss": 0.030417990684509278, + "step": 202980 + }, + { + "epoch": 0.871478495316109, + "grad_norm": 0.37049049139022827, + "learning_rate": 1.2907996516130146e-05, + "loss": 0.1741700291633606, + "step": 202990 + }, + { + "epoch": 0.871521427406129, + "grad_norm": 0.10045011341571808, + "learning_rate": 1.2903684796012522e-05, + "loss": 0.30179901123046876, + "step": 203000 + }, + { + "epoch": 0.871521427406129, + "eval_loss": 0.37936079502105713, + "eval_runtime": 27.6142, + "eval_samples_per_second": 3.621, + "eval_steps_per_second": 3.621, + "step": 203000 + }, + { + "epoch": 0.871564359496149, + "grad_norm": 2.178699254989624, + "learning_rate": 1.28993730758949e-05, + "loss": 0.18508745431900026, + "step": 203010 + }, + { + "epoch": 0.871607291586169, + "grad_norm": 0.003770484123378992, + "learning_rate": 1.2895061355777275e-05, + "loss": 0.2334007740020752, + "step": 203020 + }, + { + "epoch": 0.871650223676189, + "grad_norm": 0.01089702919125557, + "learning_rate": 1.2890749635659652e-05, + "loss": 0.21399302482604982, + "step": 203030 + }, + { + "epoch": 0.871693155766209, + "grad_norm": 0.7750707268714905, + "learning_rate": 1.2886437915542026e-05, + "loss": 0.08867501616477966, + "step": 203040 + }, + { + "epoch": 0.871736087856229, + "grad_norm": 0.03722615912556648, + "learning_rate": 1.2882126195424405e-05, + "loss": 0.17752952575683595, + "step": 203050 + }, + { + "epoch": 0.8717790199462491, + "grad_norm": 0.010971063748002052, + "learning_rate": 1.2877814475306779e-05, + "loss": 0.22754526138305664, + "step": 203060 + }, + { + "epoch": 0.871821952036269, + "grad_norm": 0.010999973863363266, + "learning_rate": 1.2873502755189156e-05, + "loss": 0.21283633708953859, + "step": 203070 + }, + { + "epoch": 0.871864884126289, + "grad_norm": 0.09963197261095047, + "learning_rate": 1.2869191035071532e-05, + "loss": 0.3332798719406128, + "step": 203080 + }, + { + "epoch": 0.8719078162163091, + "grad_norm": 0.012807992286980152, + "learning_rate": 1.286487931495391e-05, + "loss": 0.07219233512878417, + "step": 203090 + }, + { + "epoch": 0.871950748306329, + "grad_norm": 60.12035369873047, + "learning_rate": 1.2860567594836283e-05, + "loss": 0.11430882215499878, + "step": 203100 + }, + { + "epoch": 0.8719936803963491, + "grad_norm": 2.6776928901672363, + "learning_rate": 1.2856255874718662e-05, + "loss": 0.2610363483428955, + "step": 203110 + }, + { + "epoch": 0.8720366124863691, + "grad_norm": 0.25831401348114014, + "learning_rate": 1.2851944154601036e-05, + "loss": 0.12929258346557618, + "step": 203120 + }, + { + "epoch": 0.872079544576389, + "grad_norm": 1.7765833139419556, + "learning_rate": 1.2847632434483414e-05, + "loss": 0.2976083755493164, + "step": 203130 + }, + { + "epoch": 0.8721224766664091, + "grad_norm": 0.03122456930577755, + "learning_rate": 1.284332071436579e-05, + "loss": 0.12493312358856201, + "step": 203140 + }, + { + "epoch": 0.8721654087564291, + "grad_norm": 2.952826499938965, + "learning_rate": 1.2839008994248167e-05, + "loss": 0.3075044870376587, + "step": 203150 + }, + { + "epoch": 0.872208340846449, + "grad_norm": 0.12431207299232483, + "learning_rate": 1.2834697274130542e-05, + "loss": 0.13878283500671387, + "step": 203160 + }, + { + "epoch": 0.8722512729364691, + "grad_norm": 1.12087881565094, + "learning_rate": 1.283038555401292e-05, + "loss": 0.11472923755645752, + "step": 203170 + }, + { + "epoch": 0.8722942050264891, + "grad_norm": 1.5404136180877686, + "learning_rate": 1.2826073833895294e-05, + "loss": 0.29040989875793455, + "step": 203180 + }, + { + "epoch": 0.8723371371165091, + "grad_norm": 0.06525274366140366, + "learning_rate": 1.2821762113777671e-05, + "loss": 0.2566260814666748, + "step": 203190 + }, + { + "epoch": 0.8723800692065291, + "grad_norm": 0.001346416654996574, + "learning_rate": 1.2817450393660047e-05, + "loss": 0.13974400758743286, + "step": 203200 + }, + { + "epoch": 0.8724230012965491, + "grad_norm": 0.10522188991308212, + "learning_rate": 1.2813138673542424e-05, + "loss": 0.22829935550689698, + "step": 203210 + }, + { + "epoch": 0.8724659333865691, + "grad_norm": 0.21968132257461548, + "learning_rate": 1.28088269534248e-05, + "loss": 0.0917892575263977, + "step": 203220 + }, + { + "epoch": 0.8725088654765891, + "grad_norm": 1.885111927986145, + "learning_rate": 1.2804515233307177e-05, + "loss": 0.2599085092544556, + "step": 203230 + }, + { + "epoch": 0.8725517975666092, + "grad_norm": 3.5005955696105957, + "learning_rate": 1.2800203513189551e-05, + "loss": 0.5183819770812989, + "step": 203240 + }, + { + "epoch": 0.8725947296566291, + "grad_norm": 0.003291301429271698, + "learning_rate": 1.2795891793071928e-05, + "loss": 0.10750803947448731, + "step": 203250 + }, + { + "epoch": 0.8726376617466491, + "grad_norm": 2.1088061332702637, + "learning_rate": 1.2791580072954304e-05, + "loss": 0.3345699071884155, + "step": 203260 + }, + { + "epoch": 0.8726805938366692, + "grad_norm": 0.0036086903419345617, + "learning_rate": 1.2787268352836681e-05, + "loss": 0.35092260837554934, + "step": 203270 + }, + { + "epoch": 0.8727235259266891, + "grad_norm": 0.058109551668167114, + "learning_rate": 1.2782956632719057e-05, + "loss": 0.3079172134399414, + "step": 203280 + }, + { + "epoch": 0.8727664580167092, + "grad_norm": 0.005001907702535391, + "learning_rate": 1.2778644912601434e-05, + "loss": 0.05406479239463806, + "step": 203290 + }, + { + "epoch": 0.8728093901067292, + "grad_norm": 1.243675708770752, + "learning_rate": 1.2774333192483808e-05, + "loss": 0.3355956792831421, + "step": 203300 + }, + { + "epoch": 0.8728523221967491, + "grad_norm": 0.004065210931003094, + "learning_rate": 1.2770021472366187e-05, + "loss": 0.14164478778839112, + "step": 203310 + }, + { + "epoch": 0.8728952542867692, + "grad_norm": 1.4152636528015137, + "learning_rate": 1.2765709752248561e-05, + "loss": 0.21300029754638672, + "step": 203320 + }, + { + "epoch": 0.8729381863767892, + "grad_norm": 0.019872266799211502, + "learning_rate": 1.2761398032130939e-05, + "loss": 0.24908978939056398, + "step": 203330 + }, + { + "epoch": 0.8729811184668091, + "grad_norm": 3.7372965812683105, + "learning_rate": 1.2757086312013314e-05, + "loss": 0.20218095779418946, + "step": 203340 + }, + { + "epoch": 0.8730240505568292, + "grad_norm": 0.014889443293213844, + "learning_rate": 1.2752774591895692e-05, + "loss": 0.14196691513061524, + "step": 203350 + }, + { + "epoch": 0.8730669826468492, + "grad_norm": 0.0060808053240180016, + "learning_rate": 1.2748462871778069e-05, + "loss": 0.22262256145477294, + "step": 203360 + }, + { + "epoch": 0.8731099147368693, + "grad_norm": 0.009378614835441113, + "learning_rate": 1.2744151151660445e-05, + "loss": 0.10588387250900269, + "step": 203370 + }, + { + "epoch": 0.8731528468268892, + "grad_norm": 0.8276627063751221, + "learning_rate": 1.2739839431542822e-05, + "loss": 0.09532456994056701, + "step": 203380 + }, + { + "epoch": 0.8731957789169092, + "grad_norm": 1.4195549488067627, + "learning_rate": 1.2735527711425196e-05, + "loss": 0.16106001138687134, + "step": 203390 + }, + { + "epoch": 0.8732387110069293, + "grad_norm": 0.006768465042114258, + "learning_rate": 1.2731215991307575e-05, + "loss": 0.11826251745223999, + "step": 203400 + }, + { + "epoch": 0.8732816430969492, + "grad_norm": 0.013329996727406979, + "learning_rate": 1.2726904271189949e-05, + "loss": 0.20020360946655275, + "step": 203410 + }, + { + "epoch": 0.8733245751869693, + "grad_norm": 1.7608102560043335, + "learning_rate": 1.2722592551072326e-05, + "loss": 0.29365689754486085, + "step": 203420 + }, + { + "epoch": 0.8733675072769893, + "grad_norm": 1.9444129467010498, + "learning_rate": 1.2718280830954702e-05, + "loss": 0.19343478679656984, + "step": 203430 + }, + { + "epoch": 0.8734104393670092, + "grad_norm": 2.3005573749542236, + "learning_rate": 1.271396911083708e-05, + "loss": 0.26775202751159666, + "step": 203440 + }, + { + "epoch": 0.8734533714570293, + "grad_norm": 3.412327527999878, + "learning_rate": 1.2709657390719453e-05, + "loss": 0.24732573032379152, + "step": 203450 + }, + { + "epoch": 0.8734963035470493, + "grad_norm": 0.0005757113103754818, + "learning_rate": 1.2705345670601832e-05, + "loss": 0.13644465208053588, + "step": 203460 + }, + { + "epoch": 0.8735392356370693, + "grad_norm": 1.6123828887939453, + "learning_rate": 1.2701033950484206e-05, + "loss": 0.22131829261779784, + "step": 203470 + }, + { + "epoch": 0.8735821677270893, + "grad_norm": 0.026981748640537262, + "learning_rate": 1.2696722230366583e-05, + "loss": 0.13728641271591185, + "step": 203480 + }, + { + "epoch": 0.8736250998171093, + "grad_norm": 2.542603015899658, + "learning_rate": 1.2692410510248959e-05, + "loss": 0.24295291900634766, + "step": 203490 + }, + { + "epoch": 0.8736680319071293, + "grad_norm": 0.00556629803031683, + "learning_rate": 1.2688098790131336e-05, + "loss": 0.2404294490814209, + "step": 203500 + }, + { + "epoch": 0.8737109639971493, + "grad_norm": 0.724449098110199, + "learning_rate": 1.268378707001371e-05, + "loss": 0.09477731585502625, + "step": 203510 + }, + { + "epoch": 0.8737538960871694, + "grad_norm": 2.029515027999878, + "learning_rate": 1.267947534989609e-05, + "loss": 0.27462499141693114, + "step": 203520 + }, + { + "epoch": 0.8737968281771893, + "grad_norm": 2.440943479537964, + "learning_rate": 1.2675163629778463e-05, + "loss": 0.16564322710037233, + "step": 203530 + }, + { + "epoch": 0.8738397602672093, + "grad_norm": 0.08441468328237534, + "learning_rate": 1.267085190966084e-05, + "loss": 0.2753757476806641, + "step": 203540 + }, + { + "epoch": 0.8738826923572294, + "grad_norm": 1.7377372980117798, + "learning_rate": 1.2666540189543216e-05, + "loss": 0.22013463973999023, + "step": 203550 + }, + { + "epoch": 0.8739256244472493, + "grad_norm": 1.6985223293304443, + "learning_rate": 1.2662228469425594e-05, + "loss": 0.11333926916122436, + "step": 203560 + }, + { + "epoch": 0.8739685565372693, + "grad_norm": 1.478522777557373, + "learning_rate": 1.265791674930797e-05, + "loss": 0.3162508010864258, + "step": 203570 + }, + { + "epoch": 0.8740114886272894, + "grad_norm": 0.9837937951087952, + "learning_rate": 1.2653605029190347e-05, + "loss": 0.18049405813217162, + "step": 203580 + }, + { + "epoch": 0.8740544207173093, + "grad_norm": 0.0009803579887375236, + "learning_rate": 1.264929330907272e-05, + "loss": 0.10179203748703003, + "step": 203590 + }, + { + "epoch": 0.8740973528073294, + "grad_norm": 2.1010031700134277, + "learning_rate": 1.2644981588955098e-05, + "loss": 0.197291100025177, + "step": 203600 + }, + { + "epoch": 0.8741402848973494, + "grad_norm": 0.005292298272252083, + "learning_rate": 1.2640669868837474e-05, + "loss": 0.2430145263671875, + "step": 203610 + }, + { + "epoch": 0.8741832169873693, + "grad_norm": 0.004215996712446213, + "learning_rate": 1.2636358148719851e-05, + "loss": 0.18761887550354003, + "step": 203620 + }, + { + "epoch": 0.8742261490773894, + "grad_norm": 0.00525982491672039, + "learning_rate": 1.2632046428602227e-05, + "loss": 0.31954474449157716, + "step": 203630 + }, + { + "epoch": 0.8742690811674094, + "grad_norm": 2.0667223930358887, + "learning_rate": 1.2627734708484604e-05, + "loss": 0.2860520362854004, + "step": 203640 + }, + { + "epoch": 0.8743120132574294, + "grad_norm": 2.997504711151123, + "learning_rate": 1.2623422988366978e-05, + "loss": 0.13857897520065307, + "step": 203650 + }, + { + "epoch": 0.8743549453474494, + "grad_norm": 6.365592956542969, + "learning_rate": 1.2619111268249357e-05, + "loss": 0.18075789213180543, + "step": 203660 + }, + { + "epoch": 0.8743978774374694, + "grad_norm": 0.13455304503440857, + "learning_rate": 1.2614799548131731e-05, + "loss": 0.1939695358276367, + "step": 203670 + }, + { + "epoch": 0.8744408095274894, + "grad_norm": 1.9599144458770752, + "learning_rate": 1.2610487828014108e-05, + "loss": 0.11105598211288452, + "step": 203680 + }, + { + "epoch": 0.8744837416175094, + "grad_norm": 8.12281608581543, + "learning_rate": 1.2606176107896484e-05, + "loss": 0.1515058159828186, + "step": 203690 + }, + { + "epoch": 0.8745266737075295, + "grad_norm": 0.005965727381408215, + "learning_rate": 1.2601864387778861e-05, + "loss": 0.2970985651016235, + "step": 203700 + }, + { + "epoch": 0.8745696057975494, + "grad_norm": 2.7481706142425537, + "learning_rate": 1.2597552667661239e-05, + "loss": 0.22161006927490234, + "step": 203710 + }, + { + "epoch": 0.8746125378875694, + "grad_norm": 0.00433462206274271, + "learning_rate": 1.2593240947543614e-05, + "loss": 0.2382342576980591, + "step": 203720 + }, + { + "epoch": 0.8746554699775895, + "grad_norm": 0.04546516761183739, + "learning_rate": 1.2588929227425992e-05, + "loss": 0.3762409210205078, + "step": 203730 + }, + { + "epoch": 0.8746984020676094, + "grad_norm": 1.7207748889923096, + "learning_rate": 1.2584617507308366e-05, + "loss": 0.2575163602828979, + "step": 203740 + }, + { + "epoch": 0.8747413341576294, + "grad_norm": 1.7368102073669434, + "learning_rate": 1.2580305787190743e-05, + "loss": 0.05370763540267944, + "step": 203750 + }, + { + "epoch": 0.8747842662476495, + "grad_norm": 0.006718280725181103, + "learning_rate": 1.2575994067073119e-05, + "loss": 0.15436301231384278, + "step": 203760 + }, + { + "epoch": 0.8748271983376694, + "grad_norm": 0.3010559678077698, + "learning_rate": 1.2571682346955496e-05, + "loss": 0.1848167061805725, + "step": 203770 + }, + { + "epoch": 0.8748701304276895, + "grad_norm": 1.1568249464035034, + "learning_rate": 1.2567370626837872e-05, + "loss": 0.24243266582489015, + "step": 203780 + }, + { + "epoch": 0.8749130625177095, + "grad_norm": 0.07093426585197449, + "learning_rate": 1.2563058906720249e-05, + "loss": 0.11582286357879638, + "step": 203790 + }, + { + "epoch": 0.8749559946077295, + "grad_norm": 0.01068392489105463, + "learning_rate": 1.2558747186602623e-05, + "loss": 0.29853043556213377, + "step": 203800 + }, + { + "epoch": 0.8749989266977495, + "grad_norm": 0.0020451010204851627, + "learning_rate": 1.2554435466485002e-05, + "loss": 0.38561258316040037, + "step": 203810 + }, + { + "epoch": 0.8750418587877695, + "grad_norm": 0.0017150049097836018, + "learning_rate": 1.2550123746367376e-05, + "loss": 0.12902814149856567, + "step": 203820 + }, + { + "epoch": 0.8750847908777896, + "grad_norm": 0.10915122926235199, + "learning_rate": 1.2545812026249753e-05, + "loss": 0.08772753477096558, + "step": 203830 + }, + { + "epoch": 0.8751277229678095, + "grad_norm": 2.126547336578369, + "learning_rate": 1.2541500306132129e-05, + "loss": 0.29000654220581057, + "step": 203840 + }, + { + "epoch": 0.8751706550578295, + "grad_norm": 0.010257849469780922, + "learning_rate": 1.2537188586014506e-05, + "loss": 0.040891838073730466, + "step": 203850 + }, + { + "epoch": 0.8752135871478496, + "grad_norm": 0.006925768218934536, + "learning_rate": 1.253287686589688e-05, + "loss": 0.18998026847839355, + "step": 203860 + }, + { + "epoch": 0.8752565192378695, + "grad_norm": 1.4014134407043457, + "learning_rate": 1.252856514577926e-05, + "loss": 0.17409541606903076, + "step": 203870 + }, + { + "epoch": 0.8752994513278896, + "grad_norm": 1.6496857404708862, + "learning_rate": 1.2524253425661633e-05, + "loss": 0.23374514579772948, + "step": 203880 + }, + { + "epoch": 0.8753423834179096, + "grad_norm": 2.2921431064605713, + "learning_rate": 1.251994170554401e-05, + "loss": 0.1461879253387451, + "step": 203890 + }, + { + "epoch": 0.8753853155079295, + "grad_norm": 0.2534264624118805, + "learning_rate": 1.2515629985426386e-05, + "loss": 0.19159697294235228, + "step": 203900 + }, + { + "epoch": 0.8754282475979496, + "grad_norm": 1.1164767742156982, + "learning_rate": 1.2511318265308763e-05, + "loss": 0.1750656008720398, + "step": 203910 + }, + { + "epoch": 0.8754711796879696, + "grad_norm": 0.02214355207979679, + "learning_rate": 1.2507006545191139e-05, + "loss": 0.18490495681762695, + "step": 203920 + }, + { + "epoch": 0.8755141117779895, + "grad_norm": 0.09135203063488007, + "learning_rate": 1.2502694825073516e-05, + "loss": 0.13188610076904297, + "step": 203930 + }, + { + "epoch": 0.8755570438680096, + "grad_norm": 0.01683737337589264, + "learning_rate": 1.2498383104955892e-05, + "loss": 0.06789767146110534, + "step": 203940 + }, + { + "epoch": 0.8755999759580296, + "grad_norm": 8.750083923339844, + "learning_rate": 1.2494071384838268e-05, + "loss": 0.31661880016326904, + "step": 203950 + }, + { + "epoch": 0.8756429080480496, + "grad_norm": 2.9150664806365967, + "learning_rate": 1.2489759664720645e-05, + "loss": 0.1383286952972412, + "step": 203960 + }, + { + "epoch": 0.8756858401380696, + "grad_norm": 0.008152827620506287, + "learning_rate": 1.248544794460302e-05, + "loss": 0.14235267639160157, + "step": 203970 + }, + { + "epoch": 0.8757287722280896, + "grad_norm": 0.06607159972190857, + "learning_rate": 1.2481136224485396e-05, + "loss": 0.26250791549682617, + "step": 203980 + }, + { + "epoch": 0.8757717043181096, + "grad_norm": 0.08335408568382263, + "learning_rate": 1.2476824504367774e-05, + "loss": 0.42444653511047364, + "step": 203990 + }, + { + "epoch": 0.8758146364081296, + "grad_norm": 0.0037135977763682604, + "learning_rate": 1.247251278425015e-05, + "loss": 0.0763792335987091, + "step": 204000 + }, + { + "epoch": 0.8758146364081296, + "eval_loss": 0.3746587336063385, + "eval_runtime": 27.3661, + "eval_samples_per_second": 3.654, + "eval_steps_per_second": 3.654, + "step": 204000 + }, + { + "epoch": 0.8758575684981497, + "grad_norm": 0.09524694830179214, + "learning_rate": 1.2468201064132525e-05, + "loss": 0.019021494686603545, + "step": 204010 + }, + { + "epoch": 0.8759005005881696, + "grad_norm": 0.05071650445461273, + "learning_rate": 1.2463889344014902e-05, + "loss": 0.4243476390838623, + "step": 204020 + }, + { + "epoch": 0.8759434326781896, + "grad_norm": 1.4659315347671509, + "learning_rate": 1.2459577623897278e-05, + "loss": 0.1035999059677124, + "step": 204030 + }, + { + "epoch": 0.8759863647682097, + "grad_norm": 1.1015068292617798, + "learning_rate": 1.2455265903779655e-05, + "loss": 0.06280344724655151, + "step": 204040 + }, + { + "epoch": 0.8760292968582296, + "grad_norm": 7.034282207489014, + "learning_rate": 1.2450954183662031e-05, + "loss": 0.29902329444885256, + "step": 204050 + }, + { + "epoch": 0.8760722289482497, + "grad_norm": 1.9581341743469238, + "learning_rate": 1.2446642463544407e-05, + "loss": 0.37990517616271974, + "step": 204060 + }, + { + "epoch": 0.8761151610382697, + "grad_norm": 0.0010750473011285067, + "learning_rate": 1.2442330743426784e-05, + "loss": 0.22955756187438964, + "step": 204070 + }, + { + "epoch": 0.8761580931282896, + "grad_norm": 0.11583001166582108, + "learning_rate": 1.243801902330916e-05, + "loss": 0.11538155078887939, + "step": 204080 + }, + { + "epoch": 0.8762010252183097, + "grad_norm": 0.027170462533831596, + "learning_rate": 1.2433707303191535e-05, + "loss": 0.13683617115020752, + "step": 204090 + }, + { + "epoch": 0.8762439573083297, + "grad_norm": 0.025869445875287056, + "learning_rate": 1.2429395583073913e-05, + "loss": 0.21306424140930175, + "step": 204100 + }, + { + "epoch": 0.8762868893983496, + "grad_norm": 6.191617488861084, + "learning_rate": 1.2425083862956288e-05, + "loss": 0.41753354072570803, + "step": 204110 + }, + { + "epoch": 0.8763298214883697, + "grad_norm": 1.990005373954773, + "learning_rate": 1.2420772142838664e-05, + "loss": 0.3240317106246948, + "step": 204120 + }, + { + "epoch": 0.8763727535783897, + "grad_norm": 40.56575393676758, + "learning_rate": 1.2416460422721041e-05, + "loss": 0.28506391048431395, + "step": 204130 + }, + { + "epoch": 0.8764156856684097, + "grad_norm": 2.0711939334869385, + "learning_rate": 1.2412148702603417e-05, + "loss": 0.29656229019165037, + "step": 204140 + }, + { + "epoch": 0.8764586177584297, + "grad_norm": 2.7142903804779053, + "learning_rate": 1.2407836982485793e-05, + "loss": 0.22822027206420897, + "step": 204150 + }, + { + "epoch": 0.8765015498484497, + "grad_norm": 0.010692157782614231, + "learning_rate": 1.240352526236817e-05, + "loss": 0.14877686500549317, + "step": 204160 + }, + { + "epoch": 0.8765444819384697, + "grad_norm": 0.011316151358187199, + "learning_rate": 1.2399213542250546e-05, + "loss": 0.31983926296234133, + "step": 204170 + }, + { + "epoch": 0.8765874140284897, + "grad_norm": 0.02851453237235546, + "learning_rate": 1.2394901822132921e-05, + "loss": 0.2908674716949463, + "step": 204180 + }, + { + "epoch": 0.8766303461185098, + "grad_norm": 0.03520075976848602, + "learning_rate": 1.2390590102015299e-05, + "loss": 0.09001702666282654, + "step": 204190 + }, + { + "epoch": 0.8766732782085297, + "grad_norm": 0.028541959822177887, + "learning_rate": 1.2386278381897674e-05, + "loss": 0.1605126976966858, + "step": 204200 + }, + { + "epoch": 0.8767162102985497, + "grad_norm": 0.47786813974380493, + "learning_rate": 1.238196666178005e-05, + "loss": 0.07542160153388977, + "step": 204210 + }, + { + "epoch": 0.8767591423885698, + "grad_norm": 0.0031443950720131397, + "learning_rate": 1.2377654941662427e-05, + "loss": 0.07932630181312561, + "step": 204220 + }, + { + "epoch": 0.8768020744785898, + "grad_norm": 0.08970996737480164, + "learning_rate": 1.2373343221544803e-05, + "loss": 0.2293954610824585, + "step": 204230 + }, + { + "epoch": 0.8768450065686098, + "grad_norm": 0.015377390198409557, + "learning_rate": 1.2369031501427179e-05, + "loss": 0.03166041374206543, + "step": 204240 + }, + { + "epoch": 0.8768879386586298, + "grad_norm": 0.0009102340554818511, + "learning_rate": 1.2364719781309556e-05, + "loss": 0.17050247192382811, + "step": 204250 + }, + { + "epoch": 0.8769308707486498, + "grad_norm": 2.917968511581421, + "learning_rate": 1.2360408061191933e-05, + "loss": 0.34798197746276854, + "step": 204260 + }, + { + "epoch": 0.8769738028386698, + "grad_norm": 0.18242336809635162, + "learning_rate": 1.2356096341074309e-05, + "loss": 0.12703689336776733, + "step": 204270 + }, + { + "epoch": 0.8770167349286898, + "grad_norm": 0.12199801206588745, + "learning_rate": 1.2351784620956686e-05, + "loss": 0.17488703727722169, + "step": 204280 + }, + { + "epoch": 0.8770596670187099, + "grad_norm": 0.022362831979990005, + "learning_rate": 1.2347472900839062e-05, + "loss": 0.2593738079071045, + "step": 204290 + }, + { + "epoch": 0.8771025991087298, + "grad_norm": 0.08351355046033859, + "learning_rate": 1.2343161180721437e-05, + "loss": 0.08270981311798095, + "step": 204300 + }, + { + "epoch": 0.8771455311987498, + "grad_norm": 1.2442940473556519, + "learning_rate": 1.2338849460603815e-05, + "loss": 0.18074887990951538, + "step": 204310 + }, + { + "epoch": 0.8771884632887699, + "grad_norm": 1.6640584468841553, + "learning_rate": 1.233453774048619e-05, + "loss": 0.30208401679992675, + "step": 204320 + }, + { + "epoch": 0.8772313953787898, + "grad_norm": 0.015673568472266197, + "learning_rate": 1.2330226020368566e-05, + "loss": 0.10918002128601074, + "step": 204330 + }, + { + "epoch": 0.8772743274688098, + "grad_norm": 0.13469268381595612, + "learning_rate": 1.2325914300250943e-05, + "loss": 0.0036360248923301697, + "step": 204340 + }, + { + "epoch": 0.8773172595588299, + "grad_norm": 7.4689764976501465, + "learning_rate": 1.2321602580133319e-05, + "loss": 0.22000887393951415, + "step": 204350 + }, + { + "epoch": 0.8773601916488498, + "grad_norm": 0.24244554340839386, + "learning_rate": 1.2317290860015695e-05, + "loss": 0.09282507896423339, + "step": 204360 + }, + { + "epoch": 0.8774031237388699, + "grad_norm": 0.7553274631500244, + "learning_rate": 1.2312979139898072e-05, + "loss": 0.3855604648590088, + "step": 204370 + }, + { + "epoch": 0.8774460558288899, + "grad_norm": 0.07456060498952866, + "learning_rate": 1.2308667419780448e-05, + "loss": 0.14472362995147706, + "step": 204380 + }, + { + "epoch": 0.8774889879189098, + "grad_norm": 1.7638195753097534, + "learning_rate": 1.2304355699662825e-05, + "loss": 0.09877208471298218, + "step": 204390 + }, + { + "epoch": 0.8775319200089299, + "grad_norm": 0.023816445842385292, + "learning_rate": 1.23000439795452e-05, + "loss": 0.14562071561813356, + "step": 204400 + }, + { + "epoch": 0.8775748520989499, + "grad_norm": 0.5713729858398438, + "learning_rate": 1.2295732259427576e-05, + "loss": 0.14468239545822142, + "step": 204410 + }, + { + "epoch": 0.8776177841889699, + "grad_norm": 0.041681379079818726, + "learning_rate": 1.2291420539309954e-05, + "loss": 0.12000983953475952, + "step": 204420 + }, + { + "epoch": 0.8776607162789899, + "grad_norm": 0.02087230794131756, + "learning_rate": 1.228710881919233e-05, + "loss": 0.410722541809082, + "step": 204430 + }, + { + "epoch": 0.8777036483690099, + "grad_norm": 0.0039020644035190344, + "learning_rate": 1.2282797099074705e-05, + "loss": 0.027035737037658693, + "step": 204440 + }, + { + "epoch": 0.8777465804590299, + "grad_norm": 0.07208481431007385, + "learning_rate": 1.2278485378957082e-05, + "loss": 0.1252536416053772, + "step": 204450 + }, + { + "epoch": 0.8777895125490499, + "grad_norm": 0.02558734640479088, + "learning_rate": 1.2274173658839458e-05, + "loss": 0.14597811698913574, + "step": 204460 + }, + { + "epoch": 0.87783244463907, + "grad_norm": 0.006909816525876522, + "learning_rate": 1.2269861938721834e-05, + "loss": 0.2177650213241577, + "step": 204470 + }, + { + "epoch": 0.8778753767290899, + "grad_norm": 0.0023716725409030914, + "learning_rate": 1.2265550218604211e-05, + "loss": 0.16003108024597168, + "step": 204480 + }, + { + "epoch": 0.8779183088191099, + "grad_norm": 0.0030329038854688406, + "learning_rate": 1.2261238498486587e-05, + "loss": 0.06489881873130798, + "step": 204490 + }, + { + "epoch": 0.87796124090913, + "grad_norm": 1.8486450910568237, + "learning_rate": 1.2256926778368962e-05, + "loss": 0.19034510850906372, + "step": 204500 + }, + { + "epoch": 0.8780041729991499, + "grad_norm": 0.08171185106039047, + "learning_rate": 1.225261505825134e-05, + "loss": 0.148298442363739, + "step": 204510 + }, + { + "epoch": 0.8780471050891699, + "grad_norm": 0.00731939310207963, + "learning_rate": 1.2248303338133715e-05, + "loss": 0.12092814445495606, + "step": 204520 + }, + { + "epoch": 0.87809003717919, + "grad_norm": 2.0191352367401123, + "learning_rate": 1.2243991618016091e-05, + "loss": 0.23588323593139648, + "step": 204530 + }, + { + "epoch": 0.8781329692692099, + "grad_norm": 0.01967192441225052, + "learning_rate": 1.2239679897898468e-05, + "loss": 0.2921638488769531, + "step": 204540 + }, + { + "epoch": 0.87817590135923, + "grad_norm": 2.9450790882110596, + "learning_rate": 1.2235368177780844e-05, + "loss": 0.18513665199279786, + "step": 204550 + }, + { + "epoch": 0.87821883344925, + "grad_norm": 0.6629845499992371, + "learning_rate": 1.223105645766322e-05, + "loss": 0.26061363220214845, + "step": 204560 + }, + { + "epoch": 0.8782617655392699, + "grad_norm": 3.5497055053710938, + "learning_rate": 1.2226744737545597e-05, + "loss": 0.27632803916931153, + "step": 204570 + }, + { + "epoch": 0.87830469762929, + "grad_norm": 0.026790611445903778, + "learning_rate": 1.2222433017427973e-05, + "loss": 0.2509056329727173, + "step": 204580 + }, + { + "epoch": 0.87834762971931, + "grad_norm": 0.017180632799863815, + "learning_rate": 1.2218121297310348e-05, + "loss": 0.15285614728927613, + "step": 204590 + }, + { + "epoch": 0.87839056180933, + "grad_norm": 0.002091821050271392, + "learning_rate": 1.2213809577192726e-05, + "loss": 0.3005859136581421, + "step": 204600 + }, + { + "epoch": 0.87843349389935, + "grad_norm": 0.010604831390082836, + "learning_rate": 1.2209497857075101e-05, + "loss": 0.16785238981246947, + "step": 204610 + }, + { + "epoch": 0.87847642598937, + "grad_norm": 7.6037468910217285, + "learning_rate": 1.2205186136957479e-05, + "loss": 0.281678295135498, + "step": 204620 + }, + { + "epoch": 0.87851935807939, + "grad_norm": 0.0036886732559651136, + "learning_rate": 1.2200874416839856e-05, + "loss": 0.0634632170200348, + "step": 204630 + }, + { + "epoch": 0.87856229016941, + "grad_norm": 0.044636115431785583, + "learning_rate": 1.2196562696722232e-05, + "loss": 0.26335391998291013, + "step": 204640 + }, + { + "epoch": 0.87860522225943, + "grad_norm": 0.009788943454623222, + "learning_rate": 1.2192250976604607e-05, + "loss": 0.20432703495025634, + "step": 204650 + }, + { + "epoch": 0.8786481543494501, + "grad_norm": 2.5842597484588623, + "learning_rate": 1.2187939256486985e-05, + "loss": 0.2513580322265625, + "step": 204660 + }, + { + "epoch": 0.87869108643947, + "grad_norm": 4.397493839263916, + "learning_rate": 1.218362753636936e-05, + "loss": 0.24058179855346679, + "step": 204670 + }, + { + "epoch": 0.8787340185294901, + "grad_norm": 0.0031634909100830555, + "learning_rate": 1.2179315816251736e-05, + "loss": 0.09127176403999329, + "step": 204680 + }, + { + "epoch": 0.8787769506195101, + "grad_norm": 0.06499703228473663, + "learning_rate": 1.2175004096134113e-05, + "loss": 0.10204474925994873, + "step": 204690 + }, + { + "epoch": 0.87881988270953, + "grad_norm": 0.9429930448532104, + "learning_rate": 1.2170692376016489e-05, + "loss": 0.17954732179641725, + "step": 204700 + }, + { + "epoch": 0.8788628147995501, + "grad_norm": 0.002309724921360612, + "learning_rate": 1.2166380655898864e-05, + "loss": 0.337858247756958, + "step": 204710 + }, + { + "epoch": 0.8789057468895701, + "grad_norm": 0.00705463532358408, + "learning_rate": 1.2162068935781242e-05, + "loss": 0.30138473510742186, + "step": 204720 + }, + { + "epoch": 0.8789486789795901, + "grad_norm": 0.021212739869952202, + "learning_rate": 1.2157757215663617e-05, + "loss": 0.16993517875671388, + "step": 204730 + }, + { + "epoch": 0.8789916110696101, + "grad_norm": 0.004442409612238407, + "learning_rate": 1.2153445495545993e-05, + "loss": 0.30782337188720704, + "step": 204740 + }, + { + "epoch": 0.8790345431596301, + "grad_norm": 0.022244207561016083, + "learning_rate": 1.214913377542837e-05, + "loss": 0.20544323921203614, + "step": 204750 + }, + { + "epoch": 0.8790774752496501, + "grad_norm": 4.958491325378418, + "learning_rate": 1.2144822055310746e-05, + "loss": 0.09967964887619019, + "step": 204760 + }, + { + "epoch": 0.8791204073396701, + "grad_norm": 0.07501170039176941, + "learning_rate": 1.2140510335193123e-05, + "loss": 0.16576032638549804, + "step": 204770 + }, + { + "epoch": 0.8791633394296902, + "grad_norm": 0.012495110742747784, + "learning_rate": 1.2136198615075499e-05, + "loss": 0.006057353690266609, + "step": 204780 + }, + { + "epoch": 0.8792062715197101, + "grad_norm": 0.05930882692337036, + "learning_rate": 1.2131886894957875e-05, + "loss": 0.1349782109260559, + "step": 204790 + }, + { + "epoch": 0.8792492036097301, + "grad_norm": 0.00040715167415328324, + "learning_rate": 1.2127575174840252e-05, + "loss": 0.09597882628440857, + "step": 204800 + }, + { + "epoch": 0.8792921356997502, + "grad_norm": 0.31938889622688293, + "learning_rate": 1.2123263454722628e-05, + "loss": 0.11955327987670898, + "step": 204810 + }, + { + "epoch": 0.8793350677897701, + "grad_norm": 0.5764264464378357, + "learning_rate": 1.2118951734605003e-05, + "loss": 0.13746170997619628, + "step": 204820 + }, + { + "epoch": 0.8793779998797902, + "grad_norm": 0.010667004622519016, + "learning_rate": 1.211464001448738e-05, + "loss": 0.05581216812133789, + "step": 204830 + }, + { + "epoch": 0.8794209319698102, + "grad_norm": 0.3324984610080719, + "learning_rate": 1.2110328294369756e-05, + "loss": 0.16010618209838867, + "step": 204840 + }, + { + "epoch": 0.8794638640598301, + "grad_norm": 0.04684317484498024, + "learning_rate": 1.2106016574252132e-05, + "loss": 0.18404499292373658, + "step": 204850 + }, + { + "epoch": 0.8795067961498502, + "grad_norm": 0.135576069355011, + "learning_rate": 1.210170485413451e-05, + "loss": 0.22846548557281493, + "step": 204860 + }, + { + "epoch": 0.8795497282398702, + "grad_norm": 0.21464096009731293, + "learning_rate": 1.2097393134016885e-05, + "loss": 0.09315667152404786, + "step": 204870 + }, + { + "epoch": 0.8795926603298901, + "grad_norm": 1.5333083868026733, + "learning_rate": 1.209308141389926e-05, + "loss": 0.18947761058807372, + "step": 204880 + }, + { + "epoch": 0.8796355924199102, + "grad_norm": 0.037847794592380524, + "learning_rate": 1.2088769693781638e-05, + "loss": 0.10737059116363526, + "step": 204890 + }, + { + "epoch": 0.8796785245099302, + "grad_norm": 0.03366300091147423, + "learning_rate": 1.2084457973664014e-05, + "loss": 0.08891225457191468, + "step": 204900 + }, + { + "epoch": 0.8797214565999502, + "grad_norm": 0.09716782718896866, + "learning_rate": 1.208014625354639e-05, + "loss": 0.11172068119049072, + "step": 204910 + }, + { + "epoch": 0.8797643886899702, + "grad_norm": 6.475722312927246, + "learning_rate": 1.2075834533428767e-05, + "loss": 0.27301716804504395, + "step": 204920 + }, + { + "epoch": 0.8798073207799902, + "grad_norm": 1.4842710494995117, + "learning_rate": 1.2071522813311142e-05, + "loss": 0.14906420707702636, + "step": 204930 + }, + { + "epoch": 0.8798502528700102, + "grad_norm": 0.01469878014177084, + "learning_rate": 1.2067211093193518e-05, + "loss": 0.16778630018234253, + "step": 204940 + }, + { + "epoch": 0.8798931849600302, + "grad_norm": 0.05601226165890694, + "learning_rate": 1.2062899373075895e-05, + "loss": 0.30366196632385256, + "step": 204950 + }, + { + "epoch": 0.8799361170500503, + "grad_norm": 0.02477823570370674, + "learning_rate": 1.2058587652958271e-05, + "loss": 0.05100439190864563, + "step": 204960 + }, + { + "epoch": 0.8799790491400702, + "grad_norm": 0.0359761118888855, + "learning_rate": 1.2054275932840647e-05, + "loss": 0.2062239408493042, + "step": 204970 + }, + { + "epoch": 0.8800219812300902, + "grad_norm": 5.375646591186523, + "learning_rate": 1.2049964212723024e-05, + "loss": 0.17303190231323243, + "step": 204980 + }, + { + "epoch": 0.8800649133201103, + "grad_norm": 0.01700400933623314, + "learning_rate": 1.2045652492605401e-05, + "loss": 0.2812765836715698, + "step": 204990 + }, + { + "epoch": 0.8801078454101302, + "grad_norm": 0.07763543725013733, + "learning_rate": 1.2041340772487777e-05, + "loss": 0.20389330387115479, + "step": 205000 + }, + { + "epoch": 0.8801078454101302, + "eval_loss": 0.37906956672668457, + "eval_runtime": 27.4916, + "eval_samples_per_second": 3.637, + "eval_steps_per_second": 3.637, + "step": 205000 + }, + { + "epoch": 0.8801507775001502, + "grad_norm": 0.5729861855506897, + "learning_rate": 1.2037029052370154e-05, + "loss": 0.2167433738708496, + "step": 205010 + }, + { + "epoch": 0.8801937095901703, + "grad_norm": 0.0022135234903544188, + "learning_rate": 1.203271733225253e-05, + "loss": 0.1361485242843628, + "step": 205020 + }, + { + "epoch": 0.8802366416801902, + "grad_norm": 3.3651068210601807, + "learning_rate": 1.2028405612134906e-05, + "loss": 0.25268898010253904, + "step": 205030 + }, + { + "epoch": 0.8802795737702103, + "grad_norm": 2.356663465499878, + "learning_rate": 1.2024093892017283e-05, + "loss": 0.22077784538269044, + "step": 205040 + }, + { + "epoch": 0.8803225058602303, + "grad_norm": 0.0038632149808108807, + "learning_rate": 1.2019782171899659e-05, + "loss": 0.3469837665557861, + "step": 205050 + }, + { + "epoch": 0.8803654379502502, + "grad_norm": 0.06996764987707138, + "learning_rate": 1.2015470451782034e-05, + "loss": 0.31807005405426025, + "step": 205060 + }, + { + "epoch": 0.8804083700402703, + "grad_norm": 0.45123931765556335, + "learning_rate": 1.2011158731664412e-05, + "loss": 0.07297312021255493, + "step": 205070 + }, + { + "epoch": 0.8804513021302903, + "grad_norm": 0.17306841909885406, + "learning_rate": 1.2006847011546787e-05, + "loss": 0.18359951972961425, + "step": 205080 + }, + { + "epoch": 0.8804942342203104, + "grad_norm": 0.001680709421634674, + "learning_rate": 1.2002535291429163e-05, + "loss": 0.08356087803840637, + "step": 205090 + }, + { + "epoch": 0.8805371663103303, + "grad_norm": 1.4745081663131714, + "learning_rate": 1.199822357131154e-05, + "loss": 0.30778489112854, + "step": 205100 + }, + { + "epoch": 0.8805800984003503, + "grad_norm": 0.49757787585258484, + "learning_rate": 1.1993911851193916e-05, + "loss": 0.19059680700302123, + "step": 205110 + }, + { + "epoch": 0.8806230304903704, + "grad_norm": 0.7082939147949219, + "learning_rate": 1.1989600131076292e-05, + "loss": 0.26523666381835936, + "step": 205120 + }, + { + "epoch": 0.8806659625803903, + "grad_norm": 3.5472426414489746, + "learning_rate": 1.1985288410958669e-05, + "loss": 0.12654772996902466, + "step": 205130 + }, + { + "epoch": 0.8807088946704104, + "grad_norm": 0.035611625760793686, + "learning_rate": 1.1980976690841045e-05, + "loss": 0.1481760025024414, + "step": 205140 + }, + { + "epoch": 0.8807518267604304, + "grad_norm": 0.024171432480216026, + "learning_rate": 1.1976664970723422e-05, + "loss": 0.11591238975524902, + "step": 205150 + }, + { + "epoch": 0.8807947588504503, + "grad_norm": 5.451761245727539, + "learning_rate": 1.1972353250605797e-05, + "loss": 0.23077406883239746, + "step": 205160 + }, + { + "epoch": 0.8808376909404704, + "grad_norm": 0.3710680305957794, + "learning_rate": 1.1968041530488173e-05, + "loss": 0.21281032562255858, + "step": 205170 + }, + { + "epoch": 0.8808806230304904, + "grad_norm": 0.9765964150428772, + "learning_rate": 1.196372981037055e-05, + "loss": 0.1998102068901062, + "step": 205180 + }, + { + "epoch": 0.8809235551205103, + "grad_norm": 0.42906275391578674, + "learning_rate": 1.1959418090252926e-05, + "loss": 0.16494978666305543, + "step": 205190 + }, + { + "epoch": 0.8809664872105304, + "grad_norm": 0.009923688136041164, + "learning_rate": 1.1955106370135302e-05, + "loss": 0.1205499529838562, + "step": 205200 + }, + { + "epoch": 0.8810094193005504, + "grad_norm": 0.004118237178772688, + "learning_rate": 1.1950794650017679e-05, + "loss": 0.11395120620727539, + "step": 205210 + }, + { + "epoch": 0.8810523513905704, + "grad_norm": 0.128899484872818, + "learning_rate": 1.1946482929900055e-05, + "loss": 0.2525834083557129, + "step": 205220 + }, + { + "epoch": 0.8810952834805904, + "grad_norm": 0.007528554182499647, + "learning_rate": 1.194217120978243e-05, + "loss": 0.2052382230758667, + "step": 205230 + }, + { + "epoch": 0.8811382155706105, + "grad_norm": 0.9687526822090149, + "learning_rate": 1.1937859489664808e-05, + "loss": 0.4134235382080078, + "step": 205240 + }, + { + "epoch": 0.8811811476606304, + "grad_norm": 0.0006184322410263121, + "learning_rate": 1.1933547769547183e-05, + "loss": 0.42797160148620605, + "step": 205250 + }, + { + "epoch": 0.8812240797506504, + "grad_norm": 1.1402137279510498, + "learning_rate": 1.1929236049429559e-05, + "loss": 0.4267298698425293, + "step": 205260 + }, + { + "epoch": 0.8812670118406705, + "grad_norm": 0.17912384867668152, + "learning_rate": 1.1924924329311936e-05, + "loss": 0.09835328459739685, + "step": 205270 + }, + { + "epoch": 0.8813099439306904, + "grad_norm": 4.9136061668396, + "learning_rate": 1.1920612609194312e-05, + "loss": 0.0916410207748413, + "step": 205280 + }, + { + "epoch": 0.8813528760207104, + "grad_norm": 0.00036412899498827755, + "learning_rate": 1.1916300889076688e-05, + "loss": 0.29571709632873533, + "step": 205290 + }, + { + "epoch": 0.8813958081107305, + "grad_norm": 0.12275026738643646, + "learning_rate": 1.1911989168959065e-05, + "loss": 0.13231356143951417, + "step": 205300 + }, + { + "epoch": 0.8814387402007504, + "grad_norm": 0.013888251036405563, + "learning_rate": 1.190767744884144e-05, + "loss": 0.0009010875597596169, + "step": 205310 + }, + { + "epoch": 0.8814816722907705, + "grad_norm": 0.5148999691009521, + "learning_rate": 1.1903365728723816e-05, + "loss": 0.33445086479187014, + "step": 205320 + }, + { + "epoch": 0.8815246043807905, + "grad_norm": 7.188920974731445, + "learning_rate": 1.1899054008606194e-05, + "loss": 0.5070527076721192, + "step": 205330 + }, + { + "epoch": 0.8815675364708104, + "grad_norm": 0.6675366163253784, + "learning_rate": 1.189474228848857e-05, + "loss": 0.24539120197296144, + "step": 205340 + }, + { + "epoch": 0.8816104685608305, + "grad_norm": 1.1379445791244507, + "learning_rate": 1.1890430568370947e-05, + "loss": 0.20651309490203856, + "step": 205350 + }, + { + "epoch": 0.8816534006508505, + "grad_norm": 0.5539625883102417, + "learning_rate": 1.1886118848253324e-05, + "loss": 0.07830089330673218, + "step": 205360 + }, + { + "epoch": 0.8816963327408704, + "grad_norm": 1.806531310081482, + "learning_rate": 1.18818071281357e-05, + "loss": 0.2538323879241943, + "step": 205370 + }, + { + "epoch": 0.8817392648308905, + "grad_norm": 0.011052205227315426, + "learning_rate": 1.1877495408018075e-05, + "loss": 0.07202536463737488, + "step": 205380 + }, + { + "epoch": 0.8817821969209105, + "grad_norm": 0.9535105228424072, + "learning_rate": 1.1873183687900453e-05, + "loss": 0.11274955272674561, + "step": 205390 + }, + { + "epoch": 0.8818251290109305, + "grad_norm": 1.3573006391525269, + "learning_rate": 1.1868871967782828e-05, + "loss": 0.12120436429977417, + "step": 205400 + }, + { + "epoch": 0.8818680611009505, + "grad_norm": 0.2514362037181854, + "learning_rate": 1.1864560247665204e-05, + "loss": 0.20958714485168456, + "step": 205410 + }, + { + "epoch": 0.8819109931909705, + "grad_norm": 0.6128543019294739, + "learning_rate": 1.1860248527547581e-05, + "loss": 0.21837210655212402, + "step": 205420 + }, + { + "epoch": 0.8819539252809905, + "grad_norm": 0.18143728375434875, + "learning_rate": 1.1855936807429957e-05, + "loss": 0.052649658918380735, + "step": 205430 + }, + { + "epoch": 0.8819968573710105, + "grad_norm": 5.726690769195557, + "learning_rate": 1.1851625087312333e-05, + "loss": 0.14876792430877686, + "step": 205440 + }, + { + "epoch": 0.8820397894610306, + "grad_norm": 1.3325129747390747, + "learning_rate": 1.184731336719471e-05, + "loss": 0.20912702083587648, + "step": 205450 + }, + { + "epoch": 0.8820827215510505, + "grad_norm": 0.056446004658937454, + "learning_rate": 1.1843001647077086e-05, + "loss": 0.10956629514694213, + "step": 205460 + }, + { + "epoch": 0.8821256536410705, + "grad_norm": 14.242410659790039, + "learning_rate": 1.1838689926959461e-05, + "loss": 0.1361212968826294, + "step": 205470 + }, + { + "epoch": 0.8821685857310906, + "grad_norm": 0.010048151947557926, + "learning_rate": 1.1834378206841839e-05, + "loss": 0.2654492616653442, + "step": 205480 + }, + { + "epoch": 0.8822115178211105, + "grad_norm": 0.0009726140415295959, + "learning_rate": 1.1830066486724214e-05, + "loss": 0.32817275524139405, + "step": 205490 + }, + { + "epoch": 0.8822544499111306, + "grad_norm": 7.682567596435547, + "learning_rate": 1.182575476660659e-05, + "loss": 0.18321444988250732, + "step": 205500 + }, + { + "epoch": 0.8822973820011506, + "grad_norm": 2.296447992324829, + "learning_rate": 1.1821443046488967e-05, + "loss": 0.14441272020339965, + "step": 205510 + }, + { + "epoch": 0.8823403140911706, + "grad_norm": 1.1311460733413696, + "learning_rate": 1.1817131326371343e-05, + "loss": 0.46404194831848145, + "step": 205520 + }, + { + "epoch": 0.8823832461811906, + "grad_norm": 0.00633773161098361, + "learning_rate": 1.181281960625372e-05, + "loss": 0.1516602635383606, + "step": 205530 + }, + { + "epoch": 0.8824261782712106, + "grad_norm": 0.0009929277002811432, + "learning_rate": 1.1808507886136096e-05, + "loss": 0.1654080033302307, + "step": 205540 + }, + { + "epoch": 0.8824691103612307, + "grad_norm": 0.9732330441474915, + "learning_rate": 1.1804196166018472e-05, + "loss": 0.38271920680999755, + "step": 205550 + }, + { + "epoch": 0.8825120424512506, + "grad_norm": 0.018041890114545822, + "learning_rate": 1.1799884445900849e-05, + "loss": 0.1938990592956543, + "step": 205560 + }, + { + "epoch": 0.8825549745412706, + "grad_norm": 0.025509031489491463, + "learning_rate": 1.1795572725783225e-05, + "loss": 0.11931205987930298, + "step": 205570 + }, + { + "epoch": 0.8825979066312907, + "grad_norm": 0.07069827616214752, + "learning_rate": 1.17912610056656e-05, + "loss": 0.26412503719329833, + "step": 205580 + }, + { + "epoch": 0.8826408387213106, + "grad_norm": 0.0899178609251976, + "learning_rate": 1.1786949285547978e-05, + "loss": 0.27751870155334474, + "step": 205590 + }, + { + "epoch": 0.8826837708113306, + "grad_norm": 0.0004098423523828387, + "learning_rate": 1.1782637565430353e-05, + "loss": 0.12910977602005005, + "step": 205600 + }, + { + "epoch": 0.8827267029013507, + "grad_norm": 0.006932375021278858, + "learning_rate": 1.1778325845312729e-05, + "loss": 0.1931472659111023, + "step": 205610 + }, + { + "epoch": 0.8827696349913706, + "grad_norm": 7.9753851890563965, + "learning_rate": 1.1774014125195106e-05, + "loss": 0.33435921669006347, + "step": 205620 + }, + { + "epoch": 0.8828125670813907, + "grad_norm": 2.1303913593292236, + "learning_rate": 1.1769702405077482e-05, + "loss": 0.3566124439239502, + "step": 205630 + }, + { + "epoch": 0.8828554991714107, + "grad_norm": 0.0010813261615112424, + "learning_rate": 1.1765390684959857e-05, + "loss": 0.19597339630126953, + "step": 205640 + }, + { + "epoch": 0.8828984312614306, + "grad_norm": 0.012582486495375633, + "learning_rate": 1.1761078964842235e-05, + "loss": 0.22391045093536377, + "step": 205650 + }, + { + "epoch": 0.8829413633514507, + "grad_norm": 0.01926124095916748, + "learning_rate": 1.175676724472461e-05, + "loss": 0.1478889226913452, + "step": 205660 + }, + { + "epoch": 0.8829842954414707, + "grad_norm": 0.0006865571485832334, + "learning_rate": 1.1752455524606986e-05, + "loss": 0.2585289001464844, + "step": 205670 + }, + { + "epoch": 0.8830272275314907, + "grad_norm": 0.01835837960243225, + "learning_rate": 1.1748143804489363e-05, + "loss": 0.03371688425540924, + "step": 205680 + }, + { + "epoch": 0.8830701596215107, + "grad_norm": 3.0662288665771484, + "learning_rate": 1.1743832084371739e-05, + "loss": 0.34697909355163575, + "step": 205690 + }, + { + "epoch": 0.8831130917115307, + "grad_norm": 0.016853464767336845, + "learning_rate": 1.1739520364254115e-05, + "loss": 0.19272353649139404, + "step": 205700 + }, + { + "epoch": 0.8831560238015507, + "grad_norm": 0.43144023418426514, + "learning_rate": 1.1735208644136492e-05, + "loss": 0.1649027109146118, + "step": 205710 + }, + { + "epoch": 0.8831989558915707, + "grad_norm": 0.008879882283508778, + "learning_rate": 1.173089692401887e-05, + "loss": 0.20535609722137452, + "step": 205720 + }, + { + "epoch": 0.8832418879815908, + "grad_norm": 0.6299045085906982, + "learning_rate": 1.1726585203901245e-05, + "loss": 0.23428215980529785, + "step": 205730 + }, + { + "epoch": 0.8832848200716107, + "grad_norm": 0.10132008045911789, + "learning_rate": 1.1722273483783622e-05, + "loss": 0.3938116550445557, + "step": 205740 + }, + { + "epoch": 0.8833277521616307, + "grad_norm": 5.267863750457764, + "learning_rate": 1.1717961763665998e-05, + "loss": 0.25835647583007815, + "step": 205750 + }, + { + "epoch": 0.8833706842516508, + "grad_norm": 1.3534311056137085, + "learning_rate": 1.1713650043548374e-05, + "loss": 0.26612701416015627, + "step": 205760 + }, + { + "epoch": 0.8834136163416707, + "grad_norm": 4.357753753662109, + "learning_rate": 1.1709338323430751e-05, + "loss": 0.26286730766296384, + "step": 205770 + }, + { + "epoch": 0.8834565484316907, + "grad_norm": 1.7099212408065796, + "learning_rate": 1.1705026603313127e-05, + "loss": 0.39784243106842043, + "step": 205780 + }, + { + "epoch": 0.8834994805217108, + "grad_norm": 0.8876595497131348, + "learning_rate": 1.1700714883195502e-05, + "loss": 0.2518022060394287, + "step": 205790 + }, + { + "epoch": 0.8835424126117307, + "grad_norm": 0.0008838191861286759, + "learning_rate": 1.169640316307788e-05, + "loss": 0.11853432655334473, + "step": 205800 + }, + { + "epoch": 0.8835853447017508, + "grad_norm": 2.1805479526519775, + "learning_rate": 1.1692091442960255e-05, + "loss": 0.19648573398590088, + "step": 205810 + }, + { + "epoch": 0.8836282767917708, + "grad_norm": 0.008458657190203667, + "learning_rate": 1.1687779722842631e-05, + "loss": 0.14152785539627075, + "step": 205820 + }, + { + "epoch": 0.8836712088817907, + "grad_norm": 8.96318244934082, + "learning_rate": 1.1683468002725008e-05, + "loss": 0.1602509140968323, + "step": 205830 + }, + { + "epoch": 0.8837141409718108, + "grad_norm": 0.2068006992340088, + "learning_rate": 1.1679156282607384e-05, + "loss": 0.3406901597976685, + "step": 205840 + }, + { + "epoch": 0.8837570730618308, + "grad_norm": 0.8047705292701721, + "learning_rate": 1.167484456248976e-05, + "loss": 0.11222801208496094, + "step": 205850 + }, + { + "epoch": 0.8838000051518508, + "grad_norm": 0.01122902799397707, + "learning_rate": 1.1670532842372137e-05, + "loss": 0.11741714477539063, + "step": 205860 + }, + { + "epoch": 0.8838429372418708, + "grad_norm": 0.18358559906482697, + "learning_rate": 1.1666221122254513e-05, + "loss": 0.08803790211677551, + "step": 205870 + }, + { + "epoch": 0.8838858693318908, + "grad_norm": 0.005441271234303713, + "learning_rate": 1.1661909402136888e-05, + "loss": 0.1691023588180542, + "step": 205880 + }, + { + "epoch": 0.8839288014219108, + "grad_norm": 0.003943410702049732, + "learning_rate": 1.1657597682019266e-05, + "loss": 0.1766132116317749, + "step": 205890 + }, + { + "epoch": 0.8839717335119308, + "grad_norm": 0.026890434324741364, + "learning_rate": 1.1653285961901641e-05, + "loss": 0.20017850399017334, + "step": 205900 + }, + { + "epoch": 0.8840146656019509, + "grad_norm": 0.0028352816589176655, + "learning_rate": 1.1648974241784019e-05, + "loss": 0.10084348917007446, + "step": 205910 + }, + { + "epoch": 0.8840575976919708, + "grad_norm": 1.1735332012176514, + "learning_rate": 1.1644662521666394e-05, + "loss": 0.43771066665649416, + "step": 205920 + }, + { + "epoch": 0.8841005297819908, + "grad_norm": 0.0006905131740495563, + "learning_rate": 1.164035080154877e-05, + "loss": 0.2366577386856079, + "step": 205930 + }, + { + "epoch": 0.8841434618720109, + "grad_norm": 3.063614845275879, + "learning_rate": 1.1636039081431147e-05, + "loss": 0.11916078329086303, + "step": 205940 + }, + { + "epoch": 0.8841863939620309, + "grad_norm": 1.343090295791626, + "learning_rate": 1.1631727361313523e-05, + "loss": 0.2360858917236328, + "step": 205950 + }, + { + "epoch": 0.8842293260520508, + "grad_norm": 0.03908395394682884, + "learning_rate": 1.1627415641195899e-05, + "loss": 0.06640787124633789, + "step": 205960 + }, + { + "epoch": 0.8842722581420709, + "grad_norm": 1.5124359130859375, + "learning_rate": 1.1623103921078276e-05, + "loss": 0.3608893394470215, + "step": 205970 + }, + { + "epoch": 0.8843151902320909, + "grad_norm": 0.008707244880497456, + "learning_rate": 1.1618792200960652e-05, + "loss": 0.18862802982330323, + "step": 205980 + }, + { + "epoch": 0.8843581223221109, + "grad_norm": 0.005090769380331039, + "learning_rate": 1.1614480480843027e-05, + "loss": 0.22757859230041505, + "step": 205990 + }, + { + "epoch": 0.8844010544121309, + "grad_norm": 0.019245756790041924, + "learning_rate": 1.1610168760725405e-05, + "loss": 0.12244853973388672, + "step": 206000 + }, + { + "epoch": 0.8844010544121309, + "eval_loss": 0.37726035714149475, + "eval_runtime": 27.513, + "eval_samples_per_second": 3.635, + "eval_steps_per_second": 3.635, + "step": 206000 + }, + { + "epoch": 0.884443986502151, + "grad_norm": 0.016391757875680923, + "learning_rate": 1.160585704060778e-05, + "loss": 0.33532443046569826, + "step": 206010 + }, + { + "epoch": 0.8844869185921709, + "grad_norm": 0.0012011586222797632, + "learning_rate": 1.1601545320490156e-05, + "loss": 0.0573084831237793, + "step": 206020 + }, + { + "epoch": 0.8845298506821909, + "grad_norm": 0.030159875750541687, + "learning_rate": 1.1597233600372533e-05, + "loss": 0.12054593563079834, + "step": 206030 + }, + { + "epoch": 0.884572782772211, + "grad_norm": 0.03495888411998749, + "learning_rate": 1.1592921880254909e-05, + "loss": 0.13690305948257447, + "step": 206040 + }, + { + "epoch": 0.8846157148622309, + "grad_norm": 0.02454495243728161, + "learning_rate": 1.1588610160137284e-05, + "loss": 0.023973910510540007, + "step": 206050 + }, + { + "epoch": 0.8846586469522509, + "grad_norm": 0.003935964312404394, + "learning_rate": 1.1584298440019662e-05, + "loss": 0.31903369426727296, + "step": 206060 + }, + { + "epoch": 0.884701579042271, + "grad_norm": 0.007954302243888378, + "learning_rate": 1.1579986719902037e-05, + "loss": 0.10175185203552246, + "step": 206070 + }, + { + "epoch": 0.8847445111322909, + "grad_norm": 1.4729177951812744, + "learning_rate": 1.1575674999784415e-05, + "loss": 0.2837111711502075, + "step": 206080 + }, + { + "epoch": 0.884787443222311, + "grad_norm": 0.010839829221367836, + "learning_rate": 1.1571363279666792e-05, + "loss": 0.10710896253585815, + "step": 206090 + }, + { + "epoch": 0.884830375312331, + "grad_norm": 0.6962838172912598, + "learning_rate": 1.1567051559549168e-05, + "loss": 0.41863112449645995, + "step": 206100 + }, + { + "epoch": 0.8848733074023509, + "grad_norm": 4.106805324554443, + "learning_rate": 1.1562739839431543e-05, + "loss": 0.16724135875701904, + "step": 206110 + }, + { + "epoch": 0.884916239492371, + "grad_norm": 0.00478664506226778, + "learning_rate": 1.155842811931392e-05, + "loss": 0.199246084690094, + "step": 206120 + }, + { + "epoch": 0.884959171582391, + "grad_norm": 0.021048052236437798, + "learning_rate": 1.1554116399196296e-05, + "loss": 0.14670164585113527, + "step": 206130 + }, + { + "epoch": 0.885002103672411, + "grad_norm": 1.4622784852981567, + "learning_rate": 1.1549804679078672e-05, + "loss": 0.17531541585922242, + "step": 206140 + }, + { + "epoch": 0.885045035762431, + "grad_norm": 2.0894620418548584, + "learning_rate": 1.154549295896105e-05, + "loss": 0.2334801435470581, + "step": 206150 + }, + { + "epoch": 0.885087967852451, + "grad_norm": 0.1139649897813797, + "learning_rate": 1.1541181238843425e-05, + "loss": 0.15220773220062256, + "step": 206160 + }, + { + "epoch": 0.885130899942471, + "grad_norm": 0.0686078742146492, + "learning_rate": 1.15368695187258e-05, + "loss": 0.08027942180633545, + "step": 206170 + }, + { + "epoch": 0.885173832032491, + "grad_norm": 1.212646722793579, + "learning_rate": 1.1532557798608178e-05, + "loss": 0.3642886638641357, + "step": 206180 + }, + { + "epoch": 0.885216764122511, + "grad_norm": 0.38929513096809387, + "learning_rate": 1.1528246078490554e-05, + "loss": 0.1459059476852417, + "step": 206190 + }, + { + "epoch": 0.885259696212531, + "grad_norm": 0.5141419768333435, + "learning_rate": 1.152393435837293e-05, + "loss": 0.21101291179656984, + "step": 206200 + }, + { + "epoch": 0.885302628302551, + "grad_norm": 0.023184938356280327, + "learning_rate": 1.1519622638255307e-05, + "loss": 0.12943390607833863, + "step": 206210 + }, + { + "epoch": 0.8853455603925711, + "grad_norm": 0.010622471570968628, + "learning_rate": 1.1515310918137682e-05, + "loss": 0.1018107533454895, + "step": 206220 + }, + { + "epoch": 0.885388492482591, + "grad_norm": 0.05979568138718605, + "learning_rate": 1.1510999198020058e-05, + "loss": 0.017998765408992767, + "step": 206230 + }, + { + "epoch": 0.885431424572611, + "grad_norm": 0.03342049568891525, + "learning_rate": 1.1506687477902435e-05, + "loss": 0.32997517585754393, + "step": 206240 + }, + { + "epoch": 0.8854743566626311, + "grad_norm": 1.3702374696731567, + "learning_rate": 1.1502375757784811e-05, + "loss": 0.33046650886535645, + "step": 206250 + }, + { + "epoch": 0.885517288752651, + "grad_norm": 1.2958252429962158, + "learning_rate": 1.1498064037667187e-05, + "loss": 0.2511059522628784, + "step": 206260 + }, + { + "epoch": 0.885560220842671, + "grad_norm": 0.011626111343502998, + "learning_rate": 1.1493752317549564e-05, + "loss": 0.14074764251708985, + "step": 206270 + }, + { + "epoch": 0.8856031529326911, + "grad_norm": 0.7215688228607178, + "learning_rate": 1.148944059743194e-05, + "loss": 0.11617779731750488, + "step": 206280 + }, + { + "epoch": 0.885646085022711, + "grad_norm": 3.217930316925049, + "learning_rate": 1.1485128877314317e-05, + "loss": 0.07321255207061768, + "step": 206290 + }, + { + "epoch": 0.8856890171127311, + "grad_norm": 0.014127644710242748, + "learning_rate": 1.1480817157196693e-05, + "loss": 0.002158048748970032, + "step": 206300 + }, + { + "epoch": 0.8857319492027511, + "grad_norm": 0.008127226494252682, + "learning_rate": 1.1476505437079068e-05, + "loss": 0.20068845748901368, + "step": 206310 + }, + { + "epoch": 0.885774881292771, + "grad_norm": 0.0022074533626437187, + "learning_rate": 1.1472193716961446e-05, + "loss": 0.36427195072174073, + "step": 206320 + }, + { + "epoch": 0.8858178133827911, + "grad_norm": 0.003371837781742215, + "learning_rate": 1.1467881996843821e-05, + "loss": 0.2856950044631958, + "step": 206330 + }, + { + "epoch": 0.8858607454728111, + "grad_norm": 0.054005175828933716, + "learning_rate": 1.1463570276726197e-05, + "loss": 0.1003882884979248, + "step": 206340 + }, + { + "epoch": 0.8859036775628311, + "grad_norm": 0.008997203782200813, + "learning_rate": 1.1459258556608574e-05, + "loss": 0.07773151993751526, + "step": 206350 + }, + { + "epoch": 0.8859466096528511, + "grad_norm": 0.004392173606902361, + "learning_rate": 1.145494683649095e-05, + "loss": 0.22653238773345946, + "step": 206360 + }, + { + "epoch": 0.8859895417428711, + "grad_norm": 0.18398471176624298, + "learning_rate": 1.1450635116373326e-05, + "loss": 0.07900636792182922, + "step": 206370 + }, + { + "epoch": 0.8860324738328912, + "grad_norm": 0.023309681564569473, + "learning_rate": 1.1446323396255703e-05, + "loss": 0.1441459536552429, + "step": 206380 + }, + { + "epoch": 0.8860754059229111, + "grad_norm": 0.2616986036300659, + "learning_rate": 1.1442011676138079e-05, + "loss": 0.0349333643913269, + "step": 206390 + }, + { + "epoch": 0.8861183380129312, + "grad_norm": 0.15079903602600098, + "learning_rate": 1.1437699956020454e-05, + "loss": 0.14630622863769532, + "step": 206400 + }, + { + "epoch": 0.8861612701029512, + "grad_norm": 1.4602254629135132, + "learning_rate": 1.1433388235902832e-05, + "loss": 0.15438275337219237, + "step": 206410 + }, + { + "epoch": 0.8862042021929711, + "grad_norm": 0.011119797825813293, + "learning_rate": 1.1429076515785207e-05, + "loss": 0.12381850481033325, + "step": 206420 + }, + { + "epoch": 0.8862471342829912, + "grad_norm": 0.019156260415911674, + "learning_rate": 1.1424764795667583e-05, + "loss": 0.29505267143249514, + "step": 206430 + }, + { + "epoch": 0.8862900663730112, + "grad_norm": 1.6167775392532349, + "learning_rate": 1.1420453075549962e-05, + "loss": 0.32378649711608887, + "step": 206440 + }, + { + "epoch": 0.8863329984630312, + "grad_norm": 0.0651068389415741, + "learning_rate": 1.1416141355432338e-05, + "loss": 0.08244922161102294, + "step": 206450 + }, + { + "epoch": 0.8863759305530512, + "grad_norm": 4.568314552307129, + "learning_rate": 1.1411829635314713e-05, + "loss": 0.15970878601074218, + "step": 206460 + }, + { + "epoch": 0.8864188626430712, + "grad_norm": 0.5490240454673767, + "learning_rate": 1.140751791519709e-05, + "loss": 0.18453452587127686, + "step": 206470 + }, + { + "epoch": 0.8864617947330912, + "grad_norm": 0.026446960866451263, + "learning_rate": 1.1403206195079466e-05, + "loss": 0.1261613130569458, + "step": 206480 + }, + { + "epoch": 0.8865047268231112, + "grad_norm": 0.05164752155542374, + "learning_rate": 1.1398894474961842e-05, + "loss": 0.05536790490150452, + "step": 206490 + }, + { + "epoch": 0.8865476589131313, + "grad_norm": 0.013431284576654434, + "learning_rate": 1.1394582754844219e-05, + "loss": 0.005943649634718895, + "step": 206500 + }, + { + "epoch": 0.8865905910031512, + "grad_norm": 0.007348786108195782, + "learning_rate": 1.1390271034726595e-05, + "loss": 0.10574545860290527, + "step": 206510 + }, + { + "epoch": 0.8866335230931712, + "grad_norm": 2.562957525253296, + "learning_rate": 1.138595931460897e-05, + "loss": 0.3311743259429932, + "step": 206520 + }, + { + "epoch": 0.8866764551831913, + "grad_norm": 0.6146745085716248, + "learning_rate": 1.1381647594491348e-05, + "loss": 0.27206060886383054, + "step": 206530 + }, + { + "epoch": 0.8867193872732112, + "grad_norm": 0.12736624479293823, + "learning_rate": 1.1377335874373723e-05, + "loss": 0.12166001796722412, + "step": 206540 + }, + { + "epoch": 0.8867623193632312, + "grad_norm": 0.02262021414935589, + "learning_rate": 1.1373024154256099e-05, + "loss": 0.1580941915512085, + "step": 206550 + }, + { + "epoch": 0.8868052514532513, + "grad_norm": 3.1980385780334473, + "learning_rate": 1.1368712434138476e-05, + "loss": 0.0834578275680542, + "step": 206560 + }, + { + "epoch": 0.8868481835432712, + "grad_norm": 0.7768859267234802, + "learning_rate": 1.1364400714020852e-05, + "loss": 0.19401477575302123, + "step": 206570 + }, + { + "epoch": 0.8868911156332913, + "grad_norm": 1.658769965171814, + "learning_rate": 1.1360088993903228e-05, + "loss": 0.27375426292419436, + "step": 206580 + }, + { + "epoch": 0.8869340477233113, + "grad_norm": 0.08683007210493088, + "learning_rate": 1.1355777273785605e-05, + "loss": 0.06745948195457459, + "step": 206590 + }, + { + "epoch": 0.8869769798133312, + "grad_norm": 3.6138076782226562, + "learning_rate": 1.135146555366798e-05, + "loss": 0.14725286960601808, + "step": 206600 + }, + { + "epoch": 0.8870199119033513, + "grad_norm": 0.0008120244019664824, + "learning_rate": 1.1347153833550356e-05, + "loss": 0.05979236364364624, + "step": 206610 + }, + { + "epoch": 0.8870628439933713, + "grad_norm": 0.33934497833251953, + "learning_rate": 1.1342842113432734e-05, + "loss": 0.2910740375518799, + "step": 206620 + }, + { + "epoch": 0.8871057760833913, + "grad_norm": 0.019929246976971626, + "learning_rate": 1.133853039331511e-05, + "loss": 0.050988197326660156, + "step": 206630 + }, + { + "epoch": 0.8871487081734113, + "grad_norm": 0.017778104171156883, + "learning_rate": 1.1334218673197487e-05, + "loss": 0.257867693901062, + "step": 206640 + }, + { + "epoch": 0.8871916402634313, + "grad_norm": 0.01611817069351673, + "learning_rate": 1.1329906953079862e-05, + "loss": 0.2223726272583008, + "step": 206650 + }, + { + "epoch": 0.8872345723534513, + "grad_norm": 11.27148151397705, + "learning_rate": 1.1325595232962238e-05, + "loss": 0.1859837532043457, + "step": 206660 + }, + { + "epoch": 0.8872775044434713, + "grad_norm": 0.07977546006441116, + "learning_rate": 1.1321283512844615e-05, + "loss": 0.1255193829536438, + "step": 206670 + }, + { + "epoch": 0.8873204365334914, + "grad_norm": 3.2276129722595215, + "learning_rate": 1.1316971792726991e-05, + "loss": 0.07970626354217529, + "step": 206680 + }, + { + "epoch": 0.8873633686235113, + "grad_norm": 3.979701042175293, + "learning_rate": 1.1312660072609367e-05, + "loss": 0.22427015304565429, + "step": 206690 + }, + { + "epoch": 0.8874063007135313, + "grad_norm": 2.818377733230591, + "learning_rate": 1.1308348352491744e-05, + "loss": 0.47805194854736327, + "step": 206700 + }, + { + "epoch": 0.8874492328035514, + "grad_norm": 0.0031118851620703936, + "learning_rate": 1.130403663237412e-05, + "loss": 0.06364747881889343, + "step": 206710 + }, + { + "epoch": 0.8874921648935713, + "grad_norm": 0.010256568901240826, + "learning_rate": 1.1299724912256495e-05, + "loss": 0.26986443996429443, + "step": 206720 + }, + { + "epoch": 0.8875350969835913, + "grad_norm": 1.0608967542648315, + "learning_rate": 1.1295413192138873e-05, + "loss": 0.15425010919570922, + "step": 206730 + }, + { + "epoch": 0.8875780290736114, + "grad_norm": 0.5740428566932678, + "learning_rate": 1.1291101472021248e-05, + "loss": 0.16769996881484986, + "step": 206740 + }, + { + "epoch": 0.8876209611636313, + "grad_norm": 1.803846001625061, + "learning_rate": 1.1286789751903624e-05, + "loss": 0.167777681350708, + "step": 206750 + }, + { + "epoch": 0.8876638932536514, + "grad_norm": 0.028705893084406853, + "learning_rate": 1.1282478031786001e-05, + "loss": 0.10598093271255493, + "step": 206760 + }, + { + "epoch": 0.8877068253436714, + "grad_norm": 0.013095211237668991, + "learning_rate": 1.1278166311668377e-05, + "loss": 0.3958571910858154, + "step": 206770 + }, + { + "epoch": 0.8877497574336913, + "grad_norm": 1.313372015953064, + "learning_rate": 1.1273854591550753e-05, + "loss": 0.45798640251159667, + "step": 206780 + }, + { + "epoch": 0.8877926895237114, + "grad_norm": 0.05569923669099808, + "learning_rate": 1.126954287143313e-05, + "loss": 0.18811848163604736, + "step": 206790 + }, + { + "epoch": 0.8878356216137314, + "grad_norm": 0.0055416957475245, + "learning_rate": 1.1265231151315506e-05, + "loss": 0.22820720672607422, + "step": 206800 + }, + { + "epoch": 0.8878785537037515, + "grad_norm": 0.001364398980513215, + "learning_rate": 1.1260919431197883e-05, + "loss": 0.15015835762023927, + "step": 206810 + }, + { + "epoch": 0.8879214857937714, + "grad_norm": 2.2645983695983887, + "learning_rate": 1.125660771108026e-05, + "loss": 0.2777566432952881, + "step": 206820 + }, + { + "epoch": 0.8879644178837914, + "grad_norm": 2.002868413925171, + "learning_rate": 1.1252295990962636e-05, + "loss": 0.20114946365356445, + "step": 206830 + }, + { + "epoch": 0.8880073499738115, + "grad_norm": 0.0017295647412538528, + "learning_rate": 1.1247984270845012e-05, + "loss": 0.24505915641784667, + "step": 206840 + }, + { + "epoch": 0.8880502820638314, + "grad_norm": 0.03503365069627762, + "learning_rate": 1.1243672550727389e-05, + "loss": 0.13042792081832885, + "step": 206850 + }, + { + "epoch": 0.8880932141538515, + "grad_norm": 0.08353490382432938, + "learning_rate": 1.1239360830609765e-05, + "loss": 0.22364115715026855, + "step": 206860 + }, + { + "epoch": 0.8881361462438715, + "grad_norm": 0.9352854490280151, + "learning_rate": 1.123504911049214e-05, + "loss": 0.1533583402633667, + "step": 206870 + }, + { + "epoch": 0.8881790783338914, + "grad_norm": 0.19706374406814575, + "learning_rate": 1.1230737390374518e-05, + "loss": 0.20712320804595946, + "step": 206880 + }, + { + "epoch": 0.8882220104239115, + "grad_norm": 0.04519926756620407, + "learning_rate": 1.1226425670256893e-05, + "loss": 0.3285240173339844, + "step": 206890 + }, + { + "epoch": 0.8882649425139315, + "grad_norm": 6.046555995941162, + "learning_rate": 1.1222113950139269e-05, + "loss": 0.07612007856369019, + "step": 206900 + }, + { + "epoch": 0.8883078746039514, + "grad_norm": 1.6961110830307007, + "learning_rate": 1.1217802230021646e-05, + "loss": 0.2191645622253418, + "step": 206910 + }, + { + "epoch": 0.8883508066939715, + "grad_norm": 0.0015003492590039968, + "learning_rate": 1.1213490509904022e-05, + "loss": 0.3165169954299927, + "step": 206920 + }, + { + "epoch": 0.8883937387839915, + "grad_norm": 0.7118803858757019, + "learning_rate": 1.1209178789786397e-05, + "loss": 0.14242100715637207, + "step": 206930 + }, + { + "epoch": 0.8884366708740115, + "grad_norm": 0.0008318256586790085, + "learning_rate": 1.1204867069668775e-05, + "loss": 0.15933756828308104, + "step": 206940 + }, + { + "epoch": 0.8884796029640315, + "grad_norm": 0.40112194418907166, + "learning_rate": 1.120055534955115e-05, + "loss": 0.15181329250335693, + "step": 206950 + }, + { + "epoch": 0.8885225350540515, + "grad_norm": 0.003861044766381383, + "learning_rate": 1.1196243629433526e-05, + "loss": 0.14451063871383668, + "step": 206960 + }, + { + "epoch": 0.8885654671440715, + "grad_norm": 0.0033035404048860073, + "learning_rate": 1.1191931909315903e-05, + "loss": 0.17607152462005615, + "step": 206970 + }, + { + "epoch": 0.8886083992340915, + "grad_norm": 0.004272112622857094, + "learning_rate": 1.1187620189198279e-05, + "loss": 0.30315728187561036, + "step": 206980 + }, + { + "epoch": 0.8886513313241116, + "grad_norm": 0.6283832788467407, + "learning_rate": 1.1183308469080655e-05, + "loss": 0.15836750268936156, + "step": 206990 + }, + { + "epoch": 0.8886942634141315, + "grad_norm": 0.1068863794207573, + "learning_rate": 1.1178996748963032e-05, + "loss": 0.19578337669372559, + "step": 207000 + }, + { + "epoch": 0.8886942634141315, + "eval_loss": 0.37495312094688416, + "eval_runtime": 27.3443, + "eval_samples_per_second": 3.657, + "eval_steps_per_second": 3.657, + "step": 207000 + }, + { + "epoch": 0.8887371955041515, + "grad_norm": 1.5647714138031006, + "learning_rate": 1.1174685028845408e-05, + "loss": 0.14250789880752562, + "step": 207010 + }, + { + "epoch": 0.8887801275941716, + "grad_norm": 0.289928138256073, + "learning_rate": 1.1170373308727785e-05, + "loss": 0.34872961044311523, + "step": 207020 + }, + { + "epoch": 0.8888230596841915, + "grad_norm": 0.03888840228319168, + "learning_rate": 1.116606158861016e-05, + "loss": 0.1511203169822693, + "step": 207030 + }, + { + "epoch": 0.8888659917742116, + "grad_norm": 1.086020827293396, + "learning_rate": 1.1161749868492536e-05, + "loss": 0.17135267257690429, + "step": 207040 + }, + { + "epoch": 0.8889089238642316, + "grad_norm": 0.035367049276828766, + "learning_rate": 1.1157438148374914e-05, + "loss": 0.2833311319351196, + "step": 207050 + }, + { + "epoch": 0.8889518559542515, + "grad_norm": 0.39125993847846985, + "learning_rate": 1.115312642825729e-05, + "loss": 0.3519448280334473, + "step": 207060 + }, + { + "epoch": 0.8889947880442716, + "grad_norm": 1.0476738214492798, + "learning_rate": 1.1148814708139665e-05, + "loss": 0.28065240383148193, + "step": 207070 + }, + { + "epoch": 0.8890377201342916, + "grad_norm": 2.892183303833008, + "learning_rate": 1.1144502988022042e-05, + "loss": 0.29977333545684814, + "step": 207080 + }, + { + "epoch": 0.8890806522243115, + "grad_norm": 0.21420015394687653, + "learning_rate": 1.1140191267904418e-05, + "loss": 0.17063156366348267, + "step": 207090 + }, + { + "epoch": 0.8891235843143316, + "grad_norm": 1.2925959825515747, + "learning_rate": 1.1135879547786794e-05, + "loss": 0.2224264621734619, + "step": 207100 + }, + { + "epoch": 0.8891665164043516, + "grad_norm": 0.03948453813791275, + "learning_rate": 1.1131567827669171e-05, + "loss": 0.1689953088760376, + "step": 207110 + }, + { + "epoch": 0.8892094484943716, + "grad_norm": 1.0124317407608032, + "learning_rate": 1.1127256107551547e-05, + "loss": 0.16605607271194459, + "step": 207120 + }, + { + "epoch": 0.8892523805843916, + "grad_norm": 0.6965276002883911, + "learning_rate": 1.1122944387433922e-05, + "loss": 0.22367043495178224, + "step": 207130 + }, + { + "epoch": 0.8892953126744116, + "grad_norm": 0.02940557524561882, + "learning_rate": 1.11186326673163e-05, + "loss": 0.2905860424041748, + "step": 207140 + }, + { + "epoch": 0.8893382447644316, + "grad_norm": 0.003474108874797821, + "learning_rate": 1.1114320947198675e-05, + "loss": 0.060731494426727296, + "step": 207150 + }, + { + "epoch": 0.8893811768544516, + "grad_norm": 0.02715556137263775, + "learning_rate": 1.1110009227081051e-05, + "loss": 0.08220630884170532, + "step": 207160 + }, + { + "epoch": 0.8894241089444717, + "grad_norm": 0.05262776091694832, + "learning_rate": 1.110569750696343e-05, + "loss": 0.022940675914287566, + "step": 207170 + }, + { + "epoch": 0.8894670410344916, + "grad_norm": 0.09899931401014328, + "learning_rate": 1.1101385786845806e-05, + "loss": 0.18971925973892212, + "step": 207180 + }, + { + "epoch": 0.8895099731245116, + "grad_norm": 0.6711128354072571, + "learning_rate": 1.1097074066728181e-05, + "loss": 0.13049598932266235, + "step": 207190 + }, + { + "epoch": 0.8895529052145317, + "grad_norm": 0.6323684453964233, + "learning_rate": 1.1092762346610559e-05, + "loss": 0.2400278091430664, + "step": 207200 + }, + { + "epoch": 0.8895958373045516, + "grad_norm": 2.824448823928833, + "learning_rate": 1.1088450626492934e-05, + "loss": 0.387871241569519, + "step": 207210 + }, + { + "epoch": 0.8896387693945716, + "grad_norm": 0.0035468984860926867, + "learning_rate": 1.108413890637531e-05, + "loss": 0.08591824173927307, + "step": 207220 + }, + { + "epoch": 0.8896817014845917, + "grad_norm": 0.00021397089585661888, + "learning_rate": 1.1079827186257687e-05, + "loss": 0.16451675891876222, + "step": 207230 + }, + { + "epoch": 0.8897246335746117, + "grad_norm": 3.756059169769287, + "learning_rate": 1.1075515466140063e-05, + "loss": 0.3345004081726074, + "step": 207240 + }, + { + "epoch": 0.8897675656646317, + "grad_norm": 0.030414363369345665, + "learning_rate": 1.1071203746022439e-05, + "loss": 0.18696179389953613, + "step": 207250 + }, + { + "epoch": 0.8898104977546517, + "grad_norm": 1.4751427173614502, + "learning_rate": 1.1066892025904816e-05, + "loss": 0.0779729425907135, + "step": 207260 + }, + { + "epoch": 0.8898534298446718, + "grad_norm": 0.0060063861310482025, + "learning_rate": 1.1062580305787192e-05, + "loss": 0.06680658459663391, + "step": 207270 + }, + { + "epoch": 0.8898963619346917, + "grad_norm": 0.002740606665611267, + "learning_rate": 1.1058268585669567e-05, + "loss": 0.22256386280059814, + "step": 207280 + }, + { + "epoch": 0.8899392940247117, + "grad_norm": 1.8619122505187988, + "learning_rate": 1.1053956865551945e-05, + "loss": 0.08997167348861694, + "step": 207290 + }, + { + "epoch": 0.8899822261147318, + "grad_norm": 0.28505250811576843, + "learning_rate": 1.104964514543432e-05, + "loss": 0.2243257999420166, + "step": 207300 + }, + { + "epoch": 0.8900251582047517, + "grad_norm": 0.00235749501734972, + "learning_rate": 1.1045333425316696e-05, + "loss": 0.12256654500961303, + "step": 207310 + }, + { + "epoch": 0.8900680902947717, + "grad_norm": 4.915469646453857, + "learning_rate": 1.1041021705199073e-05, + "loss": 0.4654879570007324, + "step": 207320 + }, + { + "epoch": 0.8901110223847918, + "grad_norm": 2.4316060543060303, + "learning_rate": 1.1036709985081449e-05, + "loss": 0.3756813287734985, + "step": 207330 + }, + { + "epoch": 0.8901539544748117, + "grad_norm": 0.8675051331520081, + "learning_rate": 1.1032398264963824e-05, + "loss": 0.3095096588134766, + "step": 207340 + }, + { + "epoch": 0.8901968865648318, + "grad_norm": 0.29333287477493286, + "learning_rate": 1.1028086544846202e-05, + "loss": 0.31053957939147947, + "step": 207350 + }, + { + "epoch": 0.8902398186548518, + "grad_norm": 0.033922795206308365, + "learning_rate": 1.1023774824728577e-05, + "loss": 0.2523657321929932, + "step": 207360 + }, + { + "epoch": 0.8902827507448717, + "grad_norm": 0.062226712703704834, + "learning_rate": 1.1019463104610953e-05, + "loss": 0.2878741979598999, + "step": 207370 + }, + { + "epoch": 0.8903256828348918, + "grad_norm": 0.1028006374835968, + "learning_rate": 1.101515138449333e-05, + "loss": 0.19018113613128662, + "step": 207380 + }, + { + "epoch": 0.8903686149249118, + "grad_norm": 2.000626802444458, + "learning_rate": 1.1010839664375706e-05, + "loss": 0.24007706642150878, + "step": 207390 + }, + { + "epoch": 0.8904115470149317, + "grad_norm": 0.6968321800231934, + "learning_rate": 1.1006527944258083e-05, + "loss": 0.29462342262268065, + "step": 207400 + }, + { + "epoch": 0.8904544791049518, + "grad_norm": 0.012917857617139816, + "learning_rate": 1.1002216224140459e-05, + "loss": 0.1639692783355713, + "step": 207410 + }, + { + "epoch": 0.8904974111949718, + "grad_norm": 21.311800003051758, + "learning_rate": 1.0997904504022835e-05, + "loss": 0.2652838706970215, + "step": 207420 + }, + { + "epoch": 0.8905403432849918, + "grad_norm": 0.28604063391685486, + "learning_rate": 1.0993592783905212e-05, + "loss": 0.0822974681854248, + "step": 207430 + }, + { + "epoch": 0.8905832753750118, + "grad_norm": 0.0694635659456253, + "learning_rate": 1.0989281063787588e-05, + "loss": 0.05964369773864746, + "step": 207440 + }, + { + "epoch": 0.8906262074650318, + "grad_norm": 0.04713095724582672, + "learning_rate": 1.0984969343669963e-05, + "loss": 0.20178651809692383, + "step": 207450 + }, + { + "epoch": 0.8906691395550518, + "grad_norm": 0.023525338619947433, + "learning_rate": 1.098065762355234e-05, + "loss": 0.19469608068466188, + "step": 207460 + }, + { + "epoch": 0.8907120716450718, + "grad_norm": 6.417396545410156, + "learning_rate": 1.0976345903434716e-05, + "loss": 0.24268784523010253, + "step": 207470 + }, + { + "epoch": 0.8907550037350919, + "grad_norm": 0.015842819586396217, + "learning_rate": 1.0972034183317092e-05, + "loss": 0.09640743136405945, + "step": 207480 + }, + { + "epoch": 0.8907979358251118, + "grad_norm": 0.0036308271810412407, + "learning_rate": 1.096772246319947e-05, + "loss": 0.028143587708473205, + "step": 207490 + }, + { + "epoch": 0.8908408679151318, + "grad_norm": 0.8678433895111084, + "learning_rate": 1.0963410743081845e-05, + "loss": 0.03296833634376526, + "step": 207500 + }, + { + "epoch": 0.8908838000051519, + "grad_norm": 0.0009780797408893704, + "learning_rate": 1.095909902296422e-05, + "loss": 0.04464678466320038, + "step": 207510 + }, + { + "epoch": 0.8909267320951718, + "grad_norm": 3.87165904045105, + "learning_rate": 1.0954787302846598e-05, + "loss": 0.3635662317276001, + "step": 207520 + }, + { + "epoch": 0.8909696641851919, + "grad_norm": 0.0022383269388228655, + "learning_rate": 1.0950475582728975e-05, + "loss": 0.001991683803498745, + "step": 207530 + }, + { + "epoch": 0.8910125962752119, + "grad_norm": 0.037035245448350906, + "learning_rate": 1.0946163862611351e-05, + "loss": 0.16891145706176758, + "step": 207540 + }, + { + "epoch": 0.8910555283652318, + "grad_norm": 1.5228081941604614, + "learning_rate": 1.0941852142493728e-05, + "loss": 0.15557491779327393, + "step": 207550 + }, + { + "epoch": 0.8910984604552519, + "grad_norm": 0.03345203027129173, + "learning_rate": 1.0937540422376104e-05, + "loss": 0.18587608337402345, + "step": 207560 + }, + { + "epoch": 0.8911413925452719, + "grad_norm": 0.016381222754716873, + "learning_rate": 1.093322870225848e-05, + "loss": 0.03826359212398529, + "step": 207570 + }, + { + "epoch": 0.8911843246352918, + "grad_norm": 7.473484516143799, + "learning_rate": 1.0928916982140857e-05, + "loss": 0.09259976148605346, + "step": 207580 + }, + { + "epoch": 0.8912272567253119, + "grad_norm": 0.07392115145921707, + "learning_rate": 1.0924605262023233e-05, + "loss": 0.21983919143676758, + "step": 207590 + }, + { + "epoch": 0.8912701888153319, + "grad_norm": 0.028260618448257446, + "learning_rate": 1.0920293541905608e-05, + "loss": 0.1875922679901123, + "step": 207600 + }, + { + "epoch": 0.8913131209053519, + "grad_norm": 1.9657236337661743, + "learning_rate": 1.0915981821787986e-05, + "loss": 0.30432312488555907, + "step": 207610 + }, + { + "epoch": 0.8913560529953719, + "grad_norm": 0.05618586018681526, + "learning_rate": 1.0911670101670361e-05, + "loss": 0.025335219502449036, + "step": 207620 + }, + { + "epoch": 0.891398985085392, + "grad_norm": 1.6123510599136353, + "learning_rate": 1.0907358381552737e-05, + "loss": 0.15341012477874755, + "step": 207630 + }, + { + "epoch": 0.8914419171754119, + "grad_norm": 4.691909313201904, + "learning_rate": 1.0903046661435114e-05, + "loss": 0.24398250579833985, + "step": 207640 + }, + { + "epoch": 0.8914848492654319, + "grad_norm": 2.1337225437164307, + "learning_rate": 1.089873494131749e-05, + "loss": 0.5305446147918701, + "step": 207650 + }, + { + "epoch": 0.891527781355452, + "grad_norm": 0.0023214505054056644, + "learning_rate": 1.0894423221199866e-05, + "loss": 0.05925256609916687, + "step": 207660 + }, + { + "epoch": 0.891570713445472, + "grad_norm": 2.3543307781219482, + "learning_rate": 1.0890111501082243e-05, + "loss": 0.350689959526062, + "step": 207670 + }, + { + "epoch": 0.8916136455354919, + "grad_norm": 0.0017880657687783241, + "learning_rate": 1.0885799780964619e-05, + "loss": 0.16659703254699706, + "step": 207680 + }, + { + "epoch": 0.891656577625512, + "grad_norm": 2.1300582885742188, + "learning_rate": 1.0881488060846994e-05, + "loss": 0.38850128650665283, + "step": 207690 + }, + { + "epoch": 0.891699509715532, + "grad_norm": 0.002209410071372986, + "learning_rate": 1.0877176340729372e-05, + "loss": 0.18450218439102173, + "step": 207700 + }, + { + "epoch": 0.891742441805552, + "grad_norm": 1.4890061616897583, + "learning_rate": 1.0872864620611747e-05, + "loss": 0.13996822834014894, + "step": 207710 + }, + { + "epoch": 0.891785373895572, + "grad_norm": 0.8673306107521057, + "learning_rate": 1.0868552900494123e-05, + "loss": 0.2520700454711914, + "step": 207720 + }, + { + "epoch": 0.891828305985592, + "grad_norm": 0.0014024653937667608, + "learning_rate": 1.08642411803765e-05, + "loss": 0.1759459137916565, + "step": 207730 + }, + { + "epoch": 0.891871238075612, + "grad_norm": 0.02599116787314415, + "learning_rate": 1.0859929460258876e-05, + "loss": 0.059044384956359865, + "step": 207740 + }, + { + "epoch": 0.891914170165632, + "grad_norm": 0.46189165115356445, + "learning_rate": 1.0855617740141251e-05, + "loss": 0.09625746607780457, + "step": 207750 + }, + { + "epoch": 0.8919571022556521, + "grad_norm": 6.523098468780518, + "learning_rate": 1.0851306020023629e-05, + "loss": 0.20586519241333007, + "step": 207760 + }, + { + "epoch": 0.892000034345672, + "grad_norm": 0.0002252467820653692, + "learning_rate": 1.0846994299906004e-05, + "loss": 0.14474284648895264, + "step": 207770 + }, + { + "epoch": 0.892042966435692, + "grad_norm": 1.4413135051727295, + "learning_rate": 1.0842682579788382e-05, + "loss": 0.18990509510040282, + "step": 207780 + }, + { + "epoch": 0.8920858985257121, + "grad_norm": 0.4706697165966034, + "learning_rate": 1.0838370859670757e-05, + "loss": 0.1626267671585083, + "step": 207790 + }, + { + "epoch": 0.892128830615732, + "grad_norm": 1.6966557502746582, + "learning_rate": 1.0834059139553133e-05, + "loss": 0.03528638184070587, + "step": 207800 + }, + { + "epoch": 0.892171762705752, + "grad_norm": 0.001013890141621232, + "learning_rate": 1.082974741943551e-05, + "loss": 0.3139493942260742, + "step": 207810 + }, + { + "epoch": 0.8922146947957721, + "grad_norm": 0.04221909120678902, + "learning_rate": 1.0825435699317886e-05, + "loss": 0.21665351390838622, + "step": 207820 + }, + { + "epoch": 0.892257626885792, + "grad_norm": 0.04389188066124916, + "learning_rate": 1.0821123979200262e-05, + "loss": 0.12443130016326905, + "step": 207830 + }, + { + "epoch": 0.8923005589758121, + "grad_norm": 0.0007254289230331779, + "learning_rate": 1.0816812259082639e-05, + "loss": 0.07733795642852784, + "step": 207840 + }, + { + "epoch": 0.8923434910658321, + "grad_norm": 0.002695313189178705, + "learning_rate": 1.0812500538965015e-05, + "loss": 0.20181338787078856, + "step": 207850 + }, + { + "epoch": 0.892386423155852, + "grad_norm": 1.518416166305542, + "learning_rate": 1.080818881884739e-05, + "loss": 0.35289008617401124, + "step": 207860 + }, + { + "epoch": 0.8924293552458721, + "grad_norm": 0.09882375597953796, + "learning_rate": 1.0803877098729768e-05, + "loss": 0.084866863489151, + "step": 207870 + }, + { + "epoch": 0.8924722873358921, + "grad_norm": 0.07617539167404175, + "learning_rate": 1.0799565378612143e-05, + "loss": 0.25410733222961424, + "step": 207880 + }, + { + "epoch": 0.892515219425912, + "grad_norm": 0.005790808238089085, + "learning_rate": 1.0795253658494519e-05, + "loss": 0.09027189016342163, + "step": 207890 + }, + { + "epoch": 0.8925581515159321, + "grad_norm": 0.0614863745868206, + "learning_rate": 1.0790941938376898e-05, + "loss": 0.02990849018096924, + "step": 207900 + }, + { + "epoch": 0.8926010836059521, + "grad_norm": 1.8789814710617065, + "learning_rate": 1.0786630218259274e-05, + "loss": 0.2544694423675537, + "step": 207910 + }, + { + "epoch": 0.8926440156959721, + "grad_norm": 1.8106611967086792, + "learning_rate": 1.078231849814165e-05, + "loss": 0.1708831787109375, + "step": 207920 + }, + { + "epoch": 0.8926869477859921, + "grad_norm": 0.44066721200942993, + "learning_rate": 1.0778006778024027e-05, + "loss": 0.3115999698638916, + "step": 207930 + }, + { + "epoch": 0.8927298798760122, + "grad_norm": 0.7485963702201843, + "learning_rate": 1.0773695057906402e-05, + "loss": 0.10244852304458618, + "step": 207940 + }, + { + "epoch": 0.8927728119660321, + "grad_norm": 0.002917962847277522, + "learning_rate": 1.0769383337788778e-05, + "loss": 0.08186596035957336, + "step": 207950 + }, + { + "epoch": 0.8928157440560521, + "grad_norm": 0.25023436546325684, + "learning_rate": 1.0765071617671155e-05, + "loss": 0.2952109336853027, + "step": 207960 + }, + { + "epoch": 0.8928586761460722, + "grad_norm": 0.0062995050102472305, + "learning_rate": 1.0760759897553531e-05, + "loss": 0.13027873039245605, + "step": 207970 + }, + { + "epoch": 0.8929016082360921, + "grad_norm": 0.4849872589111328, + "learning_rate": 1.0756448177435907e-05, + "loss": 0.08536691069602967, + "step": 207980 + }, + { + "epoch": 0.8929445403261121, + "grad_norm": 0.04339861497282982, + "learning_rate": 1.0752136457318284e-05, + "loss": 0.1852457642555237, + "step": 207990 + }, + { + "epoch": 0.8929874724161322, + "grad_norm": 0.7880849838256836, + "learning_rate": 1.074782473720066e-05, + "loss": 0.15559797286987304, + "step": 208000 + }, + { + "epoch": 0.8929874724161322, + "eval_loss": 0.3726086914539337, + "eval_runtime": 27.5821, + "eval_samples_per_second": 3.626, + "eval_steps_per_second": 3.626, + "step": 208000 + }, + { + "epoch": 0.8930304045061521, + "grad_norm": 0.05981164053082466, + "learning_rate": 1.0743513017083035e-05, + "loss": 0.1855257749557495, + "step": 208010 + }, + { + "epoch": 0.8930733365961722, + "grad_norm": 2.3654754161834717, + "learning_rate": 1.0739201296965413e-05, + "loss": 0.2869313478469849, + "step": 208020 + }, + { + "epoch": 0.8931162686861922, + "grad_norm": 0.8840840458869934, + "learning_rate": 1.0734889576847788e-05, + "loss": 0.18838484287261964, + "step": 208030 + }, + { + "epoch": 0.8931592007762121, + "grad_norm": 0.018801458179950714, + "learning_rate": 1.0730577856730164e-05, + "loss": 0.0698580026626587, + "step": 208040 + }, + { + "epoch": 0.8932021328662322, + "grad_norm": 0.7063366770744324, + "learning_rate": 1.0726266136612541e-05, + "loss": 0.03508914709091186, + "step": 208050 + }, + { + "epoch": 0.8932450649562522, + "grad_norm": 0.012687929905951023, + "learning_rate": 1.0721954416494917e-05, + "loss": 0.3084988594055176, + "step": 208060 + }, + { + "epoch": 0.8932879970462722, + "grad_norm": 0.000954024086240679, + "learning_rate": 1.0717642696377293e-05, + "loss": 0.4633500576019287, + "step": 208070 + }, + { + "epoch": 0.8933309291362922, + "grad_norm": 0.08769218623638153, + "learning_rate": 1.071333097625967e-05, + "loss": 0.05249186158180237, + "step": 208080 + }, + { + "epoch": 0.8933738612263122, + "grad_norm": 0.07643234729766846, + "learning_rate": 1.0709019256142046e-05, + "loss": 0.12548816204071045, + "step": 208090 + }, + { + "epoch": 0.8934167933163323, + "grad_norm": 0.32027769088745117, + "learning_rate": 1.0704707536024421e-05, + "loss": 0.10189043283462525, + "step": 208100 + }, + { + "epoch": 0.8934597254063522, + "grad_norm": 0.014366092160344124, + "learning_rate": 1.0700395815906799e-05, + "loss": 0.04180750846862793, + "step": 208110 + }, + { + "epoch": 0.8935026574963723, + "grad_norm": 0.028544161468744278, + "learning_rate": 1.0696084095789174e-05, + "loss": 0.025064852833747864, + "step": 208120 + }, + { + "epoch": 0.8935455895863923, + "grad_norm": 0.4280235171318054, + "learning_rate": 1.069177237567155e-05, + "loss": 0.0633910834789276, + "step": 208130 + }, + { + "epoch": 0.8935885216764122, + "grad_norm": 0.09054075181484222, + "learning_rate": 1.0687460655553927e-05, + "loss": 0.1530647873878479, + "step": 208140 + }, + { + "epoch": 0.8936314537664323, + "grad_norm": 0.03635436296463013, + "learning_rate": 1.0683148935436303e-05, + "loss": 0.32849485874176027, + "step": 208150 + }, + { + "epoch": 0.8936743858564523, + "grad_norm": 0.02411261945962906, + "learning_rate": 1.067883721531868e-05, + "loss": 0.2045769453048706, + "step": 208160 + }, + { + "epoch": 0.8937173179464722, + "grad_norm": 0.014103651978075504, + "learning_rate": 1.0674525495201056e-05, + "loss": 0.288296103477478, + "step": 208170 + }, + { + "epoch": 0.8937602500364923, + "grad_norm": 4.653670787811279, + "learning_rate": 1.0670213775083431e-05, + "loss": 0.28550784587860106, + "step": 208180 + }, + { + "epoch": 0.8938031821265123, + "grad_norm": 0.015572491101920605, + "learning_rate": 1.0665902054965809e-05, + "loss": 0.08331742286682128, + "step": 208190 + }, + { + "epoch": 0.8938461142165323, + "grad_norm": 0.33414509892463684, + "learning_rate": 1.0661590334848184e-05, + "loss": 0.12296531200408936, + "step": 208200 + }, + { + "epoch": 0.8938890463065523, + "grad_norm": 3.6274185180664062, + "learning_rate": 1.065727861473056e-05, + "loss": 0.28098833560943604, + "step": 208210 + }, + { + "epoch": 0.8939319783965723, + "grad_norm": 0.025328239426016808, + "learning_rate": 1.0652966894612937e-05, + "loss": 0.1296234130859375, + "step": 208220 + }, + { + "epoch": 0.8939749104865923, + "grad_norm": 1.3595960140228271, + "learning_rate": 1.0648655174495313e-05, + "loss": 0.323833966255188, + "step": 208230 + }, + { + "epoch": 0.8940178425766123, + "grad_norm": 3.208756685256958, + "learning_rate": 1.0644343454377689e-05, + "loss": 0.33398821353912356, + "step": 208240 + }, + { + "epoch": 0.8940607746666324, + "grad_norm": 1.5382039546966553, + "learning_rate": 1.0640031734260066e-05, + "loss": 0.17398445606231688, + "step": 208250 + }, + { + "epoch": 0.8941037067566523, + "grad_norm": 0.04903976246714592, + "learning_rate": 1.0635720014142443e-05, + "loss": 0.044875967502594, + "step": 208260 + }, + { + "epoch": 0.8941466388466723, + "grad_norm": 22.92563247680664, + "learning_rate": 1.0631408294024819e-05, + "loss": 0.321004581451416, + "step": 208270 + }, + { + "epoch": 0.8941895709366924, + "grad_norm": 0.141753152012825, + "learning_rate": 1.0627096573907196e-05, + "loss": 0.2542982578277588, + "step": 208280 + }, + { + "epoch": 0.8942325030267123, + "grad_norm": 0.01915988139808178, + "learning_rate": 1.0622784853789572e-05, + "loss": 0.19424819946289062, + "step": 208290 + }, + { + "epoch": 0.8942754351167324, + "grad_norm": 0.2076554298400879, + "learning_rate": 1.0618473133671948e-05, + "loss": 0.23011200428009032, + "step": 208300 + }, + { + "epoch": 0.8943183672067524, + "grad_norm": 1.0068254470825195, + "learning_rate": 1.0614161413554325e-05, + "loss": 0.3052649974822998, + "step": 208310 + }, + { + "epoch": 0.8943612992967723, + "grad_norm": 0.005863682366907597, + "learning_rate": 1.06098496934367e-05, + "loss": 0.06528003215789795, + "step": 208320 + }, + { + "epoch": 0.8944042313867924, + "grad_norm": 1.8667118549346924, + "learning_rate": 1.0605537973319076e-05, + "loss": 0.3344250679016113, + "step": 208330 + }, + { + "epoch": 0.8944471634768124, + "grad_norm": 0.011455858126282692, + "learning_rate": 1.0601226253201454e-05, + "loss": 0.29726667404174806, + "step": 208340 + }, + { + "epoch": 0.8944900955668323, + "grad_norm": 1.6951905488967896, + "learning_rate": 1.059691453308383e-05, + "loss": 0.12936344146728515, + "step": 208350 + }, + { + "epoch": 0.8945330276568524, + "grad_norm": 2.5541203022003174, + "learning_rate": 1.0592602812966205e-05, + "loss": 0.12269244194030762, + "step": 208360 + }, + { + "epoch": 0.8945759597468724, + "grad_norm": 0.7697427272796631, + "learning_rate": 1.0588291092848582e-05, + "loss": 0.07301286458969117, + "step": 208370 + }, + { + "epoch": 0.8946188918368924, + "grad_norm": 0.27730637788772583, + "learning_rate": 1.0583979372730958e-05, + "loss": 0.0558599591255188, + "step": 208380 + }, + { + "epoch": 0.8946618239269124, + "grad_norm": 0.004843627102673054, + "learning_rate": 1.0579667652613334e-05, + "loss": 0.017573785781860352, + "step": 208390 + }, + { + "epoch": 0.8947047560169324, + "grad_norm": 0.006611781660467386, + "learning_rate": 1.0575355932495711e-05, + "loss": 0.13027034997940062, + "step": 208400 + }, + { + "epoch": 0.8947476881069524, + "grad_norm": 0.008200961165130138, + "learning_rate": 1.0571044212378087e-05, + "loss": 0.4753425121307373, + "step": 208410 + }, + { + "epoch": 0.8947906201969724, + "grad_norm": 0.02584204636514187, + "learning_rate": 1.0566732492260462e-05, + "loss": 0.09846428036689758, + "step": 208420 + }, + { + "epoch": 0.8948335522869925, + "grad_norm": 0.9147371649742126, + "learning_rate": 1.056242077214284e-05, + "loss": 0.18001638650894164, + "step": 208430 + }, + { + "epoch": 0.8948764843770124, + "grad_norm": 1.8289741277694702, + "learning_rate": 1.0558109052025215e-05, + "loss": 0.24366312026977538, + "step": 208440 + }, + { + "epoch": 0.8949194164670324, + "grad_norm": 5.123477458953857, + "learning_rate": 1.0553797331907591e-05, + "loss": 0.2094179391860962, + "step": 208450 + }, + { + "epoch": 0.8949623485570525, + "grad_norm": 0.017321955412626266, + "learning_rate": 1.0549485611789968e-05, + "loss": 0.09925681352615356, + "step": 208460 + }, + { + "epoch": 0.8950052806470724, + "grad_norm": 0.020616520196199417, + "learning_rate": 1.0545173891672344e-05, + "loss": 0.28967747688293455, + "step": 208470 + }, + { + "epoch": 0.8950482127370925, + "grad_norm": 1.7180979251861572, + "learning_rate": 1.054086217155472e-05, + "loss": 0.15444715023040773, + "step": 208480 + }, + { + "epoch": 0.8950911448271125, + "grad_norm": 0.021595356985926628, + "learning_rate": 1.0536550451437097e-05, + "loss": 0.14605026245117186, + "step": 208490 + }, + { + "epoch": 0.8951340769171324, + "grad_norm": 0.1364668309688568, + "learning_rate": 1.0532238731319473e-05, + "loss": 0.16470075845718385, + "step": 208500 + }, + { + "epoch": 0.8951770090071525, + "grad_norm": 0.002942474326118827, + "learning_rate": 1.052792701120185e-05, + "loss": 0.21846382617950438, + "step": 208510 + }, + { + "epoch": 0.8952199410971725, + "grad_norm": 0.008422262966632843, + "learning_rate": 1.0523615291084226e-05, + "loss": 0.002869504317641258, + "step": 208520 + }, + { + "epoch": 0.8952628731871926, + "grad_norm": 5.913967132568359, + "learning_rate": 1.0519303570966601e-05, + "loss": 0.43233413696289064, + "step": 208530 + }, + { + "epoch": 0.8953058052772125, + "grad_norm": 0.0015407500322908163, + "learning_rate": 1.0514991850848979e-05, + "loss": 0.17136805057525634, + "step": 208540 + }, + { + "epoch": 0.8953487373672325, + "grad_norm": 1.3527334928512573, + "learning_rate": 1.0510680130731354e-05, + "loss": 0.2805386781692505, + "step": 208550 + }, + { + "epoch": 0.8953916694572526, + "grad_norm": 7.946166038513184, + "learning_rate": 1.050636841061373e-05, + "loss": 0.1672977924346924, + "step": 208560 + }, + { + "epoch": 0.8954346015472725, + "grad_norm": 0.03738050535321236, + "learning_rate": 1.0502056690496107e-05, + "loss": 0.22332425117492677, + "step": 208570 + }, + { + "epoch": 0.8954775336372925, + "grad_norm": 0.008645240217447281, + "learning_rate": 1.0497744970378483e-05, + "loss": 0.25367834568023684, + "step": 208580 + }, + { + "epoch": 0.8955204657273126, + "grad_norm": 1.5730016231536865, + "learning_rate": 1.0493433250260859e-05, + "loss": 0.20938484668731688, + "step": 208590 + }, + { + "epoch": 0.8955633978173325, + "grad_norm": 4.809794902801514, + "learning_rate": 1.0489121530143236e-05, + "loss": 0.0948636293411255, + "step": 208600 + }, + { + "epoch": 0.8956063299073526, + "grad_norm": 0.0076020825654268265, + "learning_rate": 1.0484809810025612e-05, + "loss": 0.11845968961715699, + "step": 208610 + }, + { + "epoch": 0.8956492619973726, + "grad_norm": 0.049854811280965805, + "learning_rate": 1.0480498089907987e-05, + "loss": 0.070538729429245, + "step": 208620 + }, + { + "epoch": 0.8956921940873925, + "grad_norm": 0.04433738440275192, + "learning_rate": 1.0476186369790364e-05, + "loss": 0.13895654678344727, + "step": 208630 + }, + { + "epoch": 0.8957351261774126, + "grad_norm": 0.3433583676815033, + "learning_rate": 1.0471874649672742e-05, + "loss": 0.10408983230590821, + "step": 208640 + }, + { + "epoch": 0.8957780582674326, + "grad_norm": 1.599429965019226, + "learning_rate": 1.0467562929555117e-05, + "loss": 0.2278376340866089, + "step": 208650 + }, + { + "epoch": 0.8958209903574526, + "grad_norm": 0.6149447560310364, + "learning_rate": 1.0463251209437495e-05, + "loss": 0.1790841341018677, + "step": 208660 + }, + { + "epoch": 0.8958639224474726, + "grad_norm": 0.06281206011772156, + "learning_rate": 1.045893948931987e-05, + "loss": 0.1051477074623108, + "step": 208670 + }, + { + "epoch": 0.8959068545374926, + "grad_norm": 6.033206462860107, + "learning_rate": 1.0454627769202246e-05, + "loss": 0.17541825771331787, + "step": 208680 + }, + { + "epoch": 0.8959497866275126, + "grad_norm": 0.015856314450502396, + "learning_rate": 1.0450316049084623e-05, + "loss": 0.07973178029060364, + "step": 208690 + }, + { + "epoch": 0.8959927187175326, + "grad_norm": 0.0016035563312470913, + "learning_rate": 1.0446004328966999e-05, + "loss": 0.13870831727981567, + "step": 208700 + }, + { + "epoch": 0.8960356508075527, + "grad_norm": 8.894113540649414, + "learning_rate": 1.0441692608849375e-05, + "loss": 0.3145705223083496, + "step": 208710 + }, + { + "epoch": 0.8960785828975726, + "grad_norm": 0.20434823632240295, + "learning_rate": 1.0437380888731752e-05, + "loss": 0.17712016105651857, + "step": 208720 + }, + { + "epoch": 0.8961215149875926, + "grad_norm": 0.5490210056304932, + "learning_rate": 1.0433069168614128e-05, + "loss": 0.22801218032836915, + "step": 208730 + }, + { + "epoch": 0.8961644470776127, + "grad_norm": 1.5750205516815186, + "learning_rate": 1.0428757448496503e-05, + "loss": 0.16703424453735352, + "step": 208740 + }, + { + "epoch": 0.8962073791676326, + "grad_norm": 1.4231799840927124, + "learning_rate": 1.042444572837888e-05, + "loss": 0.22915937900543212, + "step": 208750 + }, + { + "epoch": 0.8962503112576526, + "grad_norm": 0.0203181691467762, + "learning_rate": 1.0420134008261256e-05, + "loss": 0.1963837265968323, + "step": 208760 + }, + { + "epoch": 0.8962932433476727, + "grad_norm": 0.01275878306478262, + "learning_rate": 1.0415822288143632e-05, + "loss": 0.14313185214996338, + "step": 208770 + }, + { + "epoch": 0.8963361754376926, + "grad_norm": 4.891313076019287, + "learning_rate": 1.041151056802601e-05, + "loss": 0.2985722064971924, + "step": 208780 + }, + { + "epoch": 0.8963791075277127, + "grad_norm": 11.108732223510742, + "learning_rate": 1.0407198847908385e-05, + "loss": 0.07313599586486816, + "step": 208790 + }, + { + "epoch": 0.8964220396177327, + "grad_norm": 0.8342699408531189, + "learning_rate": 1.040288712779076e-05, + "loss": 0.19693690538406372, + "step": 208800 + }, + { + "epoch": 0.8964649717077526, + "grad_norm": 0.0011004661209881306, + "learning_rate": 1.0398575407673138e-05, + "loss": 0.091287761926651, + "step": 208810 + }, + { + "epoch": 0.8965079037977727, + "grad_norm": 0.12642458081245422, + "learning_rate": 1.0394263687555514e-05, + "loss": 0.2455592632293701, + "step": 208820 + }, + { + "epoch": 0.8965508358877927, + "grad_norm": 1.3348584175109863, + "learning_rate": 1.038995196743789e-05, + "loss": 0.1055836796760559, + "step": 208830 + }, + { + "epoch": 0.8965937679778127, + "grad_norm": 0.014487615786492825, + "learning_rate": 1.0385640247320267e-05, + "loss": 0.10463262796401977, + "step": 208840 + }, + { + "epoch": 0.8966367000678327, + "grad_norm": 0.07032379508018494, + "learning_rate": 1.0381328527202642e-05, + "loss": 0.13996758460998535, + "step": 208850 + }, + { + "epoch": 0.8966796321578527, + "grad_norm": 2.5771610736846924, + "learning_rate": 1.0377016807085018e-05, + "loss": 0.287884259223938, + "step": 208860 + }, + { + "epoch": 0.8967225642478727, + "grad_norm": 0.010654770769178867, + "learning_rate": 1.0372705086967395e-05, + "loss": 0.19098736047744752, + "step": 208870 + }, + { + "epoch": 0.8967654963378927, + "grad_norm": 0.006161325611174107, + "learning_rate": 1.0368393366849771e-05, + "loss": 0.15580826997756958, + "step": 208880 + }, + { + "epoch": 0.8968084284279128, + "grad_norm": 1.6130363941192627, + "learning_rate": 1.0364081646732148e-05, + "loss": 0.10374679565429687, + "step": 208890 + }, + { + "epoch": 0.8968513605179327, + "grad_norm": 0.014442683197557926, + "learning_rate": 1.0359769926614524e-05, + "loss": 0.26409647464752195, + "step": 208900 + }, + { + "epoch": 0.8968942926079527, + "grad_norm": 0.0012823556317016482, + "learning_rate": 1.03554582064969e-05, + "loss": 0.16642024517059326, + "step": 208910 + }, + { + "epoch": 0.8969372246979728, + "grad_norm": 0.2990255057811737, + "learning_rate": 1.0351146486379277e-05, + "loss": 0.31660277843475343, + "step": 208920 + }, + { + "epoch": 0.8969801567879927, + "grad_norm": 0.025890527293086052, + "learning_rate": 1.0346834766261653e-05, + "loss": 0.27528440952301025, + "step": 208930 + }, + { + "epoch": 0.8970230888780127, + "grad_norm": 1.0845426321029663, + "learning_rate": 1.0342523046144028e-05, + "loss": 0.32809431552886964, + "step": 208940 + }, + { + "epoch": 0.8970660209680328, + "grad_norm": 0.003917692694813013, + "learning_rate": 1.0338211326026406e-05, + "loss": 0.2939892768859863, + "step": 208950 + }, + { + "epoch": 0.8971089530580528, + "grad_norm": 0.002184345619753003, + "learning_rate": 1.0333899605908781e-05, + "loss": 0.21517295837402345, + "step": 208960 + }, + { + "epoch": 0.8971518851480728, + "grad_norm": 2.200176239013672, + "learning_rate": 1.0329587885791157e-05, + "loss": 0.45783376693725586, + "step": 208970 + }, + { + "epoch": 0.8971948172380928, + "grad_norm": 0.03883425518870354, + "learning_rate": 1.0325276165673534e-05, + "loss": 0.12506805658340453, + "step": 208980 + }, + { + "epoch": 0.8972377493281128, + "grad_norm": 0.08557221293449402, + "learning_rate": 1.0320964445555912e-05, + "loss": 0.2575039863586426, + "step": 208990 + }, + { + "epoch": 0.8972806814181328, + "grad_norm": 0.6455543637275696, + "learning_rate": 1.0316652725438287e-05, + "loss": 0.22111158370971679, + "step": 209000 + }, + { + "epoch": 0.8972806814181328, + "eval_loss": 0.37228667736053467, + "eval_runtime": 27.3584, + "eval_samples_per_second": 3.655, + "eval_steps_per_second": 3.655, + "step": 209000 + }, + { + "epoch": 0.8973236135081528, + "grad_norm": 0.00818126555532217, + "learning_rate": 1.0312341005320663e-05, + "loss": 0.13509939908981322, + "step": 209010 + }, + { + "epoch": 0.8973665455981729, + "grad_norm": 0.07581299543380737, + "learning_rate": 1.030802928520304e-05, + "loss": 0.25842645168304446, + "step": 209020 + }, + { + "epoch": 0.8974094776881928, + "grad_norm": 0.009542779996991158, + "learning_rate": 1.0303717565085416e-05, + "loss": 0.24384644031524658, + "step": 209030 + }, + { + "epoch": 0.8974524097782128, + "grad_norm": 1.313149333000183, + "learning_rate": 1.0299405844967793e-05, + "loss": 0.12051091194152833, + "step": 209040 + }, + { + "epoch": 0.8974953418682329, + "grad_norm": 0.1645997017621994, + "learning_rate": 1.0295094124850169e-05, + "loss": 0.24603877067565919, + "step": 209050 + }, + { + "epoch": 0.8975382739582528, + "grad_norm": 1.1862980127334595, + "learning_rate": 1.0290782404732545e-05, + "loss": 0.18507742881774902, + "step": 209060 + }, + { + "epoch": 0.8975812060482729, + "grad_norm": 0.012851928360760212, + "learning_rate": 1.0286470684614922e-05, + "loss": 0.16939222812652588, + "step": 209070 + }, + { + "epoch": 0.8976241381382929, + "grad_norm": 0.10280074924230576, + "learning_rate": 1.0282158964497297e-05, + "loss": 0.1931609630584717, + "step": 209080 + }, + { + "epoch": 0.8976670702283128, + "grad_norm": 0.09403973817825317, + "learning_rate": 1.0277847244379673e-05, + "loss": 0.31662075519561766, + "step": 209090 + }, + { + "epoch": 0.8977100023183329, + "grad_norm": 0.09060845524072647, + "learning_rate": 1.027353552426205e-05, + "loss": 0.2866966724395752, + "step": 209100 + }, + { + "epoch": 0.8977529344083529, + "grad_norm": 2.3493640422821045, + "learning_rate": 1.0269223804144426e-05, + "loss": 0.2028360605239868, + "step": 209110 + }, + { + "epoch": 0.8977958664983728, + "grad_norm": 2.533294439315796, + "learning_rate": 1.0264912084026802e-05, + "loss": 0.2530220985412598, + "step": 209120 + }, + { + "epoch": 0.8978387985883929, + "grad_norm": 0.041833195835351944, + "learning_rate": 1.0260600363909179e-05, + "loss": 0.3378680944442749, + "step": 209130 + }, + { + "epoch": 0.8978817306784129, + "grad_norm": 2.1199188232421875, + "learning_rate": 1.0256288643791555e-05, + "loss": 0.20430638790130615, + "step": 209140 + }, + { + "epoch": 0.8979246627684329, + "grad_norm": 0.4007624387741089, + "learning_rate": 1.025197692367393e-05, + "loss": 0.10560801029205322, + "step": 209150 + }, + { + "epoch": 0.8979675948584529, + "grad_norm": 3.4828829765319824, + "learning_rate": 1.0247665203556308e-05, + "loss": 0.1327006459236145, + "step": 209160 + }, + { + "epoch": 0.8980105269484729, + "grad_norm": 0.8840770721435547, + "learning_rate": 1.0243353483438683e-05, + "loss": 0.2786088466644287, + "step": 209170 + }, + { + "epoch": 0.8980534590384929, + "grad_norm": 3.045027494430542, + "learning_rate": 1.0239041763321059e-05, + "loss": 0.33854291439056394, + "step": 209180 + }, + { + "epoch": 0.8980963911285129, + "grad_norm": 0.05625593662261963, + "learning_rate": 1.0234730043203436e-05, + "loss": 0.2884523868560791, + "step": 209190 + }, + { + "epoch": 0.898139323218533, + "grad_norm": 0.054473888128995895, + "learning_rate": 1.0230418323085812e-05, + "loss": 0.16954405307769777, + "step": 209200 + }, + { + "epoch": 0.8981822553085529, + "grad_norm": 0.9436574578285217, + "learning_rate": 1.0226106602968188e-05, + "loss": 0.17064402103424073, + "step": 209210 + }, + { + "epoch": 0.8982251873985729, + "grad_norm": 0.5394967198371887, + "learning_rate": 1.0221794882850565e-05, + "loss": 0.2750392913818359, + "step": 209220 + }, + { + "epoch": 0.898268119488593, + "grad_norm": 0.009204737842082977, + "learning_rate": 1.021748316273294e-05, + "loss": 0.07998819351196289, + "step": 209230 + }, + { + "epoch": 0.8983110515786129, + "grad_norm": 1.3306227922439575, + "learning_rate": 1.0213171442615316e-05, + "loss": 0.1779198169708252, + "step": 209240 + }, + { + "epoch": 0.898353983668633, + "grad_norm": 0.01629238948225975, + "learning_rate": 1.0208859722497694e-05, + "loss": 0.39297688007354736, + "step": 209250 + }, + { + "epoch": 0.898396915758653, + "grad_norm": 0.01804766058921814, + "learning_rate": 1.020454800238007e-05, + "loss": 0.18484385013580323, + "step": 209260 + }, + { + "epoch": 0.8984398478486729, + "grad_norm": 0.0010492533911019564, + "learning_rate": 1.0200236282262447e-05, + "loss": 0.36370325088500977, + "step": 209270 + }, + { + "epoch": 0.898482779938693, + "grad_norm": 1.7016007900238037, + "learning_rate": 1.0195924562144822e-05, + "loss": 0.33094918727874756, + "step": 209280 + }, + { + "epoch": 0.898525712028713, + "grad_norm": 0.005374387372285128, + "learning_rate": 1.0191612842027198e-05, + "loss": 0.009152711182832719, + "step": 209290 + }, + { + "epoch": 0.8985686441187329, + "grad_norm": 0.2994307279586792, + "learning_rate": 1.0187301121909575e-05, + "loss": 0.32453336715698244, + "step": 209300 + }, + { + "epoch": 0.898611576208753, + "grad_norm": 0.13801302015781403, + "learning_rate": 1.0182989401791951e-05, + "loss": 0.19398313760757446, + "step": 209310 + }, + { + "epoch": 0.898654508298773, + "grad_norm": 2.9953441619873047, + "learning_rate": 1.0178677681674327e-05, + "loss": 0.18454222679138182, + "step": 209320 + }, + { + "epoch": 0.898697440388793, + "grad_norm": 0.0009266235865652561, + "learning_rate": 1.0174365961556704e-05, + "loss": 0.056571030616760255, + "step": 209330 + }, + { + "epoch": 0.898740372478813, + "grad_norm": 0.0011994290398433805, + "learning_rate": 1.017005424143908e-05, + "loss": 0.1038630723953247, + "step": 209340 + }, + { + "epoch": 0.898783304568833, + "grad_norm": 0.3412676155567169, + "learning_rate": 1.0165742521321457e-05, + "loss": 0.1812652587890625, + "step": 209350 + }, + { + "epoch": 0.898826236658853, + "grad_norm": 0.03324000537395477, + "learning_rate": 1.0161430801203833e-05, + "loss": 0.24029359817504883, + "step": 209360 + }, + { + "epoch": 0.898869168748873, + "grad_norm": 0.03261468559503555, + "learning_rate": 1.015711908108621e-05, + "loss": 0.057292830944061277, + "step": 209370 + }, + { + "epoch": 0.8989121008388931, + "grad_norm": 0.714746356010437, + "learning_rate": 1.0152807360968586e-05, + "loss": 0.026148182153701783, + "step": 209380 + }, + { + "epoch": 0.8989550329289131, + "grad_norm": 0.6475493311882019, + "learning_rate": 1.0148495640850963e-05, + "loss": 0.23983364105224608, + "step": 209390 + }, + { + "epoch": 0.898997965018933, + "grad_norm": 0.5558874607086182, + "learning_rate": 1.0144183920733339e-05, + "loss": 0.2596889972686768, + "step": 209400 + }, + { + "epoch": 0.8990408971089531, + "grad_norm": 0.0003981745394412428, + "learning_rate": 1.0139872200615714e-05, + "loss": 0.09359182119369507, + "step": 209410 + }, + { + "epoch": 0.8990838291989731, + "grad_norm": 1.4816875457763672, + "learning_rate": 1.0135560480498092e-05, + "loss": 0.3855120658874512, + "step": 209420 + }, + { + "epoch": 0.899126761288993, + "grad_norm": 2.404386281967163, + "learning_rate": 1.0131248760380467e-05, + "loss": 0.21844077110290527, + "step": 209430 + }, + { + "epoch": 0.8991696933790131, + "grad_norm": 5.189345359802246, + "learning_rate": 1.0126937040262843e-05, + "loss": 0.3697656154632568, + "step": 209440 + }, + { + "epoch": 0.8992126254690331, + "grad_norm": 6.110927581787109, + "learning_rate": 1.012262532014522e-05, + "loss": 0.415781831741333, + "step": 209450 + }, + { + "epoch": 0.8992555575590531, + "grad_norm": 2.683704376220703, + "learning_rate": 1.0118313600027596e-05, + "loss": 0.2659297943115234, + "step": 209460 + }, + { + "epoch": 0.8992984896490731, + "grad_norm": 1.1903349161148071, + "learning_rate": 1.0114001879909972e-05, + "loss": 0.17215741872787477, + "step": 209470 + }, + { + "epoch": 0.8993414217390931, + "grad_norm": 0.04750831425189972, + "learning_rate": 1.0109690159792349e-05, + "loss": 0.25689048767089845, + "step": 209480 + }, + { + "epoch": 0.8993843538291131, + "grad_norm": 2.1213433742523193, + "learning_rate": 1.0105378439674725e-05, + "loss": 0.3482259511947632, + "step": 209490 + }, + { + "epoch": 0.8994272859191331, + "grad_norm": 5.7372565269470215, + "learning_rate": 1.01010667195571e-05, + "loss": 0.16127365827560425, + "step": 209500 + }, + { + "epoch": 0.8994702180091532, + "grad_norm": 0.12518486380577087, + "learning_rate": 1.0096754999439478e-05, + "loss": 0.11983962059020996, + "step": 209510 + }, + { + "epoch": 0.8995131500991731, + "grad_norm": 8.179588317871094, + "learning_rate": 1.0092443279321853e-05, + "loss": 0.2647406578063965, + "step": 209520 + }, + { + "epoch": 0.8995560821891931, + "grad_norm": 0.024799851700663567, + "learning_rate": 1.0088131559204229e-05, + "loss": 0.0050085954368114475, + "step": 209530 + }, + { + "epoch": 0.8995990142792132, + "grad_norm": 0.0018341508693993092, + "learning_rate": 1.0083819839086606e-05, + "loss": 0.22504839897155762, + "step": 209540 + }, + { + "epoch": 0.8996419463692331, + "grad_norm": 1.1719921827316284, + "learning_rate": 1.0079508118968982e-05, + "loss": 0.20294253826141356, + "step": 209550 + }, + { + "epoch": 0.8996848784592532, + "grad_norm": 0.005458078347146511, + "learning_rate": 1.0075196398851357e-05, + "loss": 0.1725464105606079, + "step": 209560 + }, + { + "epoch": 0.8997278105492732, + "grad_norm": 0.2729476988315582, + "learning_rate": 1.0070884678733735e-05, + "loss": 0.1303679823875427, + "step": 209570 + }, + { + "epoch": 0.8997707426392931, + "grad_norm": 0.017656253650784492, + "learning_rate": 1.006657295861611e-05, + "loss": 0.31265428066253664, + "step": 209580 + }, + { + "epoch": 0.8998136747293132, + "grad_norm": 1.6056455373764038, + "learning_rate": 1.0062261238498486e-05, + "loss": 0.26979079246521, + "step": 209590 + }, + { + "epoch": 0.8998566068193332, + "grad_norm": 1.8567376136779785, + "learning_rate": 1.0057949518380863e-05, + "loss": 0.40013885498046875, + "step": 209600 + }, + { + "epoch": 0.8998995389093531, + "grad_norm": 0.03454438969492912, + "learning_rate": 1.0053637798263239e-05, + "loss": 0.18960126638412475, + "step": 209610 + }, + { + "epoch": 0.8999424709993732, + "grad_norm": 0.03088500164449215, + "learning_rate": 1.0049326078145615e-05, + "loss": 0.06449623107910156, + "step": 209620 + }, + { + "epoch": 0.8999854030893932, + "grad_norm": 0.0003505227214191109, + "learning_rate": 1.0045014358027992e-05, + "loss": 0.1564359188079834, + "step": 209630 + }, + { + "epoch": 0.9000283351794132, + "grad_norm": 0.01360271405428648, + "learning_rate": 1.0040702637910368e-05, + "loss": 0.21048970222473146, + "step": 209640 + }, + { + "epoch": 0.9000712672694332, + "grad_norm": 0.07835225015878677, + "learning_rate": 1.0036390917792745e-05, + "loss": 0.1049225926399231, + "step": 209650 + }, + { + "epoch": 0.9001141993594532, + "grad_norm": 2.60602068901062, + "learning_rate": 1.003207919767512e-05, + "loss": 0.10012807846069335, + "step": 209660 + }, + { + "epoch": 0.9001571314494732, + "grad_norm": 0.1185954362154007, + "learning_rate": 1.0027767477557496e-05, + "loss": 0.2436140775680542, + "step": 209670 + }, + { + "epoch": 0.9002000635394932, + "grad_norm": 0.005014302209019661, + "learning_rate": 1.0023455757439874e-05, + "loss": 0.14921751022338867, + "step": 209680 + }, + { + "epoch": 0.9002429956295133, + "grad_norm": 3.485819101333618, + "learning_rate": 1.001914403732225e-05, + "loss": 0.26135404109954835, + "step": 209690 + }, + { + "epoch": 0.9002859277195332, + "grad_norm": 0.08411333709955215, + "learning_rate": 1.0014832317204625e-05, + "loss": 0.14293501377105713, + "step": 209700 + }, + { + "epoch": 0.9003288598095532, + "grad_norm": 0.004507078789174557, + "learning_rate": 1.0010520597087002e-05, + "loss": 0.33165478706359863, + "step": 209710 + }, + { + "epoch": 0.9003717918995733, + "grad_norm": 0.027253407984972, + "learning_rate": 1.000620887696938e-05, + "loss": 0.3246056079864502, + "step": 209720 + }, + { + "epoch": 0.9004147239895932, + "grad_norm": 0.01824076473712921, + "learning_rate": 1.0001897156851755e-05, + "loss": 0.42398953437805176, + "step": 209730 + }, + { + "epoch": 0.9004576560796133, + "grad_norm": 5.792724132537842, + "learning_rate": 9.997585436734131e-06, + "loss": 0.418946361541748, + "step": 209740 + }, + { + "epoch": 0.9005005881696333, + "grad_norm": 1.4856534004211426, + "learning_rate": 9.993273716616508e-06, + "loss": 0.3223897457122803, + "step": 209750 + }, + { + "epoch": 0.9005435202596532, + "grad_norm": 0.013289397582411766, + "learning_rate": 9.988961996498884e-06, + "loss": 0.3965138912200928, + "step": 209760 + }, + { + "epoch": 0.9005864523496733, + "grad_norm": 0.04032348468899727, + "learning_rate": 9.984650276381261e-06, + "loss": 0.034949111938476565, + "step": 209770 + }, + { + "epoch": 0.9006293844396933, + "grad_norm": 0.0031058243475854397, + "learning_rate": 9.980338556263637e-06, + "loss": 0.0654286801815033, + "step": 209780 + }, + { + "epoch": 0.9006723165297132, + "grad_norm": 2.0327398777008057, + "learning_rate": 9.976026836146013e-06, + "loss": 0.0602993369102478, + "step": 209790 + }, + { + "epoch": 0.9007152486197333, + "grad_norm": 0.06954237818717957, + "learning_rate": 9.97171511602839e-06, + "loss": 0.11823323965072632, + "step": 209800 + }, + { + "epoch": 0.9007581807097533, + "grad_norm": 1.5096991062164307, + "learning_rate": 9.967403395910766e-06, + "loss": 0.0629551887512207, + "step": 209810 + }, + { + "epoch": 0.9008011127997734, + "grad_norm": 0.5546233654022217, + "learning_rate": 9.963091675793141e-06, + "loss": 0.12162181138992309, + "step": 209820 + }, + { + "epoch": 0.9008440448897933, + "grad_norm": 0.009273167699575424, + "learning_rate": 9.958779955675519e-06, + "loss": 0.1455420136451721, + "step": 209830 + }, + { + "epoch": 0.9008869769798133, + "grad_norm": 1.0391302108764648, + "learning_rate": 9.954468235557894e-06, + "loss": 0.04995992183685303, + "step": 209840 + }, + { + "epoch": 0.9009299090698334, + "grad_norm": 0.0062095304019749165, + "learning_rate": 9.95015651544027e-06, + "loss": 0.05889610648155212, + "step": 209850 + }, + { + "epoch": 0.9009728411598533, + "grad_norm": 0.008527030237019062, + "learning_rate": 9.945844795322647e-06, + "loss": 0.15333669185638427, + "step": 209860 + }, + { + "epoch": 0.9010157732498734, + "grad_norm": 0.010419109836220741, + "learning_rate": 9.941533075205023e-06, + "loss": 0.18245283365249634, + "step": 209870 + }, + { + "epoch": 0.9010587053398934, + "grad_norm": 1.6431654691696167, + "learning_rate": 9.937221355087399e-06, + "loss": 0.2896125793457031, + "step": 209880 + }, + { + "epoch": 0.9011016374299133, + "grad_norm": 0.1526016741991043, + "learning_rate": 9.932909634969776e-06, + "loss": 0.002722269482910633, + "step": 209890 + }, + { + "epoch": 0.9011445695199334, + "grad_norm": 0.023740194737911224, + "learning_rate": 9.928597914852152e-06, + "loss": 0.09291549921035766, + "step": 209900 + }, + { + "epoch": 0.9011875016099534, + "grad_norm": 0.02442866750061512, + "learning_rate": 9.924286194734527e-06, + "loss": 0.3072220325469971, + "step": 209910 + }, + { + "epoch": 0.9012304336999734, + "grad_norm": 0.11724057048559189, + "learning_rate": 9.919974474616905e-06, + "loss": 0.10614768266677857, + "step": 209920 + }, + { + "epoch": 0.9012733657899934, + "grad_norm": 0.6770175695419312, + "learning_rate": 9.91566275449928e-06, + "loss": 0.09750730395317078, + "step": 209930 + }, + { + "epoch": 0.9013162978800134, + "grad_norm": 9.686930656433105, + "learning_rate": 9.911351034381656e-06, + "loss": 0.3479560136795044, + "step": 209940 + }, + { + "epoch": 0.9013592299700334, + "grad_norm": 1.8891156911849976, + "learning_rate": 9.907039314264033e-06, + "loss": 0.1551816463470459, + "step": 209950 + }, + { + "epoch": 0.9014021620600534, + "grad_norm": 0.15253959596157074, + "learning_rate": 9.902727594146409e-06, + "loss": 0.11488509178161621, + "step": 209960 + }, + { + "epoch": 0.9014450941500735, + "grad_norm": 1.0364890098571777, + "learning_rate": 9.898415874028784e-06, + "loss": 0.26020739078521726, + "step": 209970 + }, + { + "epoch": 0.9014880262400934, + "grad_norm": 0.0004387570661492646, + "learning_rate": 9.894104153911162e-06, + "loss": 0.333538818359375, + "step": 209980 + }, + { + "epoch": 0.9015309583301134, + "grad_norm": 3.0880167484283447, + "learning_rate": 9.889792433793537e-06, + "loss": 0.2603978872299194, + "step": 209990 + }, + { + "epoch": 0.9015738904201335, + "grad_norm": 1.891642451286316, + "learning_rate": 9.885480713675913e-06, + "loss": 0.2281651735305786, + "step": 210000 + }, + { + "epoch": 0.9015738904201335, + "eval_loss": 0.3739822506904602, + "eval_runtime": 27.5102, + "eval_samples_per_second": 3.635, + "eval_steps_per_second": 3.635, + "step": 210000 + }, + { + "epoch": 0.9016168225101534, + "grad_norm": 1.4768375158309937, + "learning_rate": 9.88116899355829e-06, + "loss": 0.15798479318618774, + "step": 210010 + }, + { + "epoch": 0.9016597546001734, + "grad_norm": 0.018229328095912933, + "learning_rate": 9.876857273440666e-06, + "loss": 0.0977150857448578, + "step": 210020 + }, + { + "epoch": 0.9017026866901935, + "grad_norm": 0.001907564583234489, + "learning_rate": 9.872545553323043e-06, + "loss": 0.04974755644798279, + "step": 210030 + }, + { + "epoch": 0.9017456187802134, + "grad_norm": 0.035649724304676056, + "learning_rate": 9.868233833205419e-06, + "loss": 0.1306628942489624, + "step": 210040 + }, + { + "epoch": 0.9017885508702335, + "grad_norm": 3.1833529472351074, + "learning_rate": 9.863922113087795e-06, + "loss": 0.14224931001663207, + "step": 210050 + }, + { + "epoch": 0.9018314829602535, + "grad_norm": 2.376718282699585, + "learning_rate": 9.859610392970172e-06, + "loss": 0.17341439723968505, + "step": 210060 + }, + { + "epoch": 0.9018744150502734, + "grad_norm": 2.405601739883423, + "learning_rate": 9.855298672852548e-06, + "loss": 0.1730472445487976, + "step": 210070 + }, + { + "epoch": 0.9019173471402935, + "grad_norm": 3.975780487060547, + "learning_rate": 9.850986952734925e-06, + "loss": 0.18226372003555297, + "step": 210080 + }, + { + "epoch": 0.9019602792303135, + "grad_norm": 0.012735238298773766, + "learning_rate": 9.8466752326173e-06, + "loss": 0.2455826759338379, + "step": 210090 + }, + { + "epoch": 0.9020032113203335, + "grad_norm": 3.667410135269165, + "learning_rate": 9.842363512499678e-06, + "loss": 0.20534975528717042, + "step": 210100 + }, + { + "epoch": 0.9020461434103535, + "grad_norm": 0.0019849713426083326, + "learning_rate": 9.838051792382054e-06, + "loss": 0.18063912391662598, + "step": 210110 + }, + { + "epoch": 0.9020890755003735, + "grad_norm": 0.21961329877376556, + "learning_rate": 9.83374007226443e-06, + "loss": 0.23487327098846436, + "step": 210120 + }, + { + "epoch": 0.9021320075903935, + "grad_norm": 3.4133858680725098, + "learning_rate": 9.829428352146807e-06, + "loss": 0.20193076133728027, + "step": 210130 + }, + { + "epoch": 0.9021749396804135, + "grad_norm": 0.020273273810744286, + "learning_rate": 9.825116632029182e-06, + "loss": 0.18437005281448365, + "step": 210140 + }, + { + "epoch": 0.9022178717704336, + "grad_norm": 0.00016728635819163173, + "learning_rate": 9.82080491191156e-06, + "loss": 0.29014551639556885, + "step": 210150 + }, + { + "epoch": 0.9022608038604535, + "grad_norm": 0.03587735816836357, + "learning_rate": 9.816493191793935e-06, + "loss": 0.3731184959411621, + "step": 210160 + }, + { + "epoch": 0.9023037359504735, + "grad_norm": 1.1249128580093384, + "learning_rate": 9.812181471676311e-06, + "loss": 0.18152229785919188, + "step": 210170 + }, + { + "epoch": 0.9023466680404936, + "grad_norm": 0.5035809278488159, + "learning_rate": 9.807869751558688e-06, + "loss": 0.20793728828430175, + "step": 210180 + }, + { + "epoch": 0.9023896001305135, + "grad_norm": 17.78410530090332, + "learning_rate": 9.803558031441064e-06, + "loss": 0.21700966358184814, + "step": 210190 + }, + { + "epoch": 0.9024325322205335, + "grad_norm": 2.60648775100708, + "learning_rate": 9.79924631132344e-06, + "loss": 0.3313996076583862, + "step": 210200 + }, + { + "epoch": 0.9024754643105536, + "grad_norm": 0.9339752197265625, + "learning_rate": 9.794934591205817e-06, + "loss": 0.23482167720794678, + "step": 210210 + }, + { + "epoch": 0.9025183964005735, + "grad_norm": 0.0010642610723152757, + "learning_rate": 9.790622871088193e-06, + "loss": 0.12294025421142578, + "step": 210220 + }, + { + "epoch": 0.9025613284905936, + "grad_norm": 0.041032496839761734, + "learning_rate": 9.786311150970568e-06, + "loss": 0.06782339811325074, + "step": 210230 + }, + { + "epoch": 0.9026042605806136, + "grad_norm": 9.69448184967041, + "learning_rate": 9.781999430852946e-06, + "loss": 0.19246318340301513, + "step": 210240 + }, + { + "epoch": 0.9026471926706336, + "grad_norm": 0.236909881234169, + "learning_rate": 9.777687710735321e-06, + "loss": 0.004842896386981011, + "step": 210250 + }, + { + "epoch": 0.9026901247606536, + "grad_norm": 1.3408589363098145, + "learning_rate": 9.773375990617697e-06, + "loss": 0.13109498023986815, + "step": 210260 + }, + { + "epoch": 0.9027330568506736, + "grad_norm": 0.00923833530396223, + "learning_rate": 9.769064270500074e-06, + "loss": 0.17234526872634887, + "step": 210270 + }, + { + "epoch": 0.9027759889406937, + "grad_norm": 0.0002061129198409617, + "learning_rate": 9.76475255038245e-06, + "loss": 0.12138881683349609, + "step": 210280 + }, + { + "epoch": 0.9028189210307136, + "grad_norm": 0.00944637693464756, + "learning_rate": 9.760440830264826e-06, + "loss": 0.2013624429702759, + "step": 210290 + }, + { + "epoch": 0.9028618531207336, + "grad_norm": 1.6908912658691406, + "learning_rate": 9.756129110147203e-06, + "loss": 0.40554208755493165, + "step": 210300 + }, + { + "epoch": 0.9029047852107537, + "grad_norm": 0.00027729306020773947, + "learning_rate": 9.751817390029579e-06, + "loss": 0.25098981857299807, + "step": 210310 + }, + { + "epoch": 0.9029477173007736, + "grad_norm": 2.2863595485687256, + "learning_rate": 9.747505669911954e-06, + "loss": 0.36655559539794924, + "step": 210320 + }, + { + "epoch": 0.9029906493907937, + "grad_norm": 0.005614591762423515, + "learning_rate": 9.743193949794332e-06, + "loss": 0.18383264541625977, + "step": 210330 + }, + { + "epoch": 0.9030335814808137, + "grad_norm": 0.0846930667757988, + "learning_rate": 9.738882229676707e-06, + "loss": 0.17577136754989625, + "step": 210340 + }, + { + "epoch": 0.9030765135708336, + "grad_norm": 0.006774316541850567, + "learning_rate": 9.734570509559083e-06, + "loss": 0.10552970170974732, + "step": 210350 + }, + { + "epoch": 0.9031194456608537, + "grad_norm": 4.395589828491211, + "learning_rate": 9.73025878944146e-06, + "loss": 0.22056665420532226, + "step": 210360 + }, + { + "epoch": 0.9031623777508737, + "grad_norm": 0.7510838508605957, + "learning_rate": 9.725947069323836e-06, + "loss": 0.3048895835876465, + "step": 210370 + }, + { + "epoch": 0.9032053098408936, + "grad_norm": 10.735414505004883, + "learning_rate": 9.721635349206213e-06, + "loss": 0.2514230728149414, + "step": 210380 + }, + { + "epoch": 0.9032482419309137, + "grad_norm": 9.217434883117676, + "learning_rate": 9.717323629088589e-06, + "loss": 0.20851621627807618, + "step": 210390 + }, + { + "epoch": 0.9032911740209337, + "grad_norm": 3.4000942707061768, + "learning_rate": 9.713011908970964e-06, + "loss": 0.10003012418746948, + "step": 210400 + }, + { + "epoch": 0.9033341061109537, + "grad_norm": 1.2130926847457886, + "learning_rate": 9.708700188853342e-06, + "loss": 0.1683989644050598, + "step": 210410 + }, + { + "epoch": 0.9033770382009737, + "grad_norm": 0.11750691384077072, + "learning_rate": 9.704388468735717e-06, + "loss": 0.06318738460540771, + "step": 210420 + }, + { + "epoch": 0.9034199702909937, + "grad_norm": 0.0006076234858483076, + "learning_rate": 9.700076748618093e-06, + "loss": 0.2571662187576294, + "step": 210430 + }, + { + "epoch": 0.9034629023810137, + "grad_norm": 1.1019152402877808, + "learning_rate": 9.69576502850047e-06, + "loss": 0.1616765022277832, + "step": 210440 + }, + { + "epoch": 0.9035058344710337, + "grad_norm": 4.9832892417907715, + "learning_rate": 9.691453308382848e-06, + "loss": 0.22566719055175782, + "step": 210450 + }, + { + "epoch": 0.9035487665610538, + "grad_norm": 41.49699783325195, + "learning_rate": 9.687141588265223e-06, + "loss": 0.3436073064804077, + "step": 210460 + }, + { + "epoch": 0.9035916986510737, + "grad_norm": 0.0022543699014931917, + "learning_rate": 9.682829868147599e-06, + "loss": 0.052913957834243776, + "step": 210470 + }, + { + "epoch": 0.9036346307410937, + "grad_norm": 0.04838306084275246, + "learning_rate": 9.678518148029976e-06, + "loss": 0.16422059535980224, + "step": 210480 + }, + { + "epoch": 0.9036775628311138, + "grad_norm": 2.2827861309051514, + "learning_rate": 9.674206427912352e-06, + "loss": 0.22558994293212892, + "step": 210490 + }, + { + "epoch": 0.9037204949211337, + "grad_norm": 0.0007248317124322057, + "learning_rate": 9.669894707794728e-06, + "loss": 0.16456080675125123, + "step": 210500 + }, + { + "epoch": 0.9037634270111538, + "grad_norm": 0.001483508967794478, + "learning_rate": 9.665582987677105e-06, + "loss": 0.05367158055305481, + "step": 210510 + }, + { + "epoch": 0.9038063591011738, + "grad_norm": 0.0016992673045024276, + "learning_rate": 9.66127126755948e-06, + "loss": 0.32336156368255614, + "step": 210520 + }, + { + "epoch": 0.9038492911911937, + "grad_norm": 0.013364183716475964, + "learning_rate": 9.656959547441858e-06, + "loss": 0.18762919902801514, + "step": 210530 + }, + { + "epoch": 0.9038922232812138, + "grad_norm": 0.0003909058286808431, + "learning_rate": 9.652647827324234e-06, + "loss": 0.0872824728488922, + "step": 210540 + }, + { + "epoch": 0.9039351553712338, + "grad_norm": 0.04263646900653839, + "learning_rate": 9.64833610720661e-06, + "loss": 0.23260953426361083, + "step": 210550 + }, + { + "epoch": 0.9039780874612537, + "grad_norm": 0.14210690557956696, + "learning_rate": 9.644024387088987e-06, + "loss": 0.16822166442871095, + "step": 210560 + }, + { + "epoch": 0.9040210195512738, + "grad_norm": 0.18358734250068665, + "learning_rate": 9.639712666971362e-06, + "loss": 0.05378770232200623, + "step": 210570 + }, + { + "epoch": 0.9040639516412938, + "grad_norm": 0.019276317209005356, + "learning_rate": 9.635400946853738e-06, + "loss": 0.16520183086395263, + "step": 210580 + }, + { + "epoch": 0.9041068837313138, + "grad_norm": 1.3517718315124512, + "learning_rate": 9.631089226736115e-06, + "loss": 0.22130036354064941, + "step": 210590 + }, + { + "epoch": 0.9041498158213338, + "grad_norm": 0.06858087331056595, + "learning_rate": 9.626777506618491e-06, + "loss": 0.20281364917755126, + "step": 210600 + }, + { + "epoch": 0.9041927479113538, + "grad_norm": 0.004181146156042814, + "learning_rate": 9.622465786500867e-06, + "loss": 0.33087265491485596, + "step": 210610 + }, + { + "epoch": 0.9042356800013738, + "grad_norm": 0.023237429559230804, + "learning_rate": 9.618154066383244e-06, + "loss": 0.043955230712890626, + "step": 210620 + }, + { + "epoch": 0.9042786120913938, + "grad_norm": 6.121235370635986, + "learning_rate": 9.61384234626562e-06, + "loss": 0.1775528073310852, + "step": 210630 + }, + { + "epoch": 0.9043215441814139, + "grad_norm": 0.0028020916506648064, + "learning_rate": 9.609530626147995e-06, + "loss": 0.22397143840789796, + "step": 210640 + }, + { + "epoch": 0.9043644762714338, + "grad_norm": 0.005821366794407368, + "learning_rate": 9.605218906030373e-06, + "loss": 0.19856249094009398, + "step": 210650 + }, + { + "epoch": 0.9044074083614538, + "grad_norm": 1.4921270608901978, + "learning_rate": 9.600907185912748e-06, + "loss": 0.31009862422943113, + "step": 210660 + }, + { + "epoch": 0.9044503404514739, + "grad_norm": 2.3841261863708496, + "learning_rate": 9.596595465795124e-06, + "loss": 0.17027231454849243, + "step": 210670 + }, + { + "epoch": 0.9044932725414939, + "grad_norm": 6.284855365753174, + "learning_rate": 9.592283745677501e-06, + "loss": 0.1498428463935852, + "step": 210680 + }, + { + "epoch": 0.9045362046315139, + "grad_norm": 8.141776925185695e-05, + "learning_rate": 9.587972025559877e-06, + "loss": 0.29921822547912597, + "step": 210690 + }, + { + "epoch": 0.9045791367215339, + "grad_norm": 1.557852029800415, + "learning_rate": 9.583660305442253e-06, + "loss": 0.11124341487884522, + "step": 210700 + }, + { + "epoch": 0.9046220688115539, + "grad_norm": 0.002186469966545701, + "learning_rate": 9.57934858532463e-06, + "loss": 0.057540792226791385, + "step": 210710 + }, + { + "epoch": 0.9046650009015739, + "grad_norm": 0.014679819345474243, + "learning_rate": 9.575036865207006e-06, + "loss": 0.07504054307937622, + "step": 210720 + }, + { + "epoch": 0.9047079329915939, + "grad_norm": 3.3283891677856445, + "learning_rate": 9.570725145089381e-06, + "loss": 0.3486778974533081, + "step": 210730 + }, + { + "epoch": 0.904750865081614, + "grad_norm": 3.6053216457366943, + "learning_rate": 9.566413424971759e-06, + "loss": 0.08856486082077027, + "step": 210740 + }, + { + "epoch": 0.9047937971716339, + "grad_norm": 0.024385172873735428, + "learning_rate": 9.562101704854134e-06, + "loss": 0.17349275350570678, + "step": 210750 + }, + { + "epoch": 0.9048367292616539, + "grad_norm": 2.2132294178009033, + "learning_rate": 9.557789984736512e-06, + "loss": 0.123107647895813, + "step": 210760 + }, + { + "epoch": 0.904879661351674, + "grad_norm": 0.9034229516983032, + "learning_rate": 9.553478264618887e-06, + "loss": 0.22174947261810302, + "step": 210770 + }, + { + "epoch": 0.9049225934416939, + "grad_norm": 0.03812728449702263, + "learning_rate": 9.549166544501263e-06, + "loss": 0.11002792119979858, + "step": 210780 + }, + { + "epoch": 0.9049655255317139, + "grad_norm": 0.6964645385742188, + "learning_rate": 9.54485482438364e-06, + "loss": 0.30134100914001466, + "step": 210790 + }, + { + "epoch": 0.905008457621734, + "grad_norm": 1.9698841571807861, + "learning_rate": 9.540543104266016e-06, + "loss": 0.11750224828720093, + "step": 210800 + }, + { + "epoch": 0.9050513897117539, + "grad_norm": 0.021683400496840477, + "learning_rate": 9.536231384148393e-06, + "loss": 0.157227087020874, + "step": 210810 + }, + { + "epoch": 0.905094321801774, + "grad_norm": 1.2105274200439453, + "learning_rate": 9.531919664030769e-06, + "loss": 0.11554534435272217, + "step": 210820 + }, + { + "epoch": 0.905137253891794, + "grad_norm": 0.0017459297087043524, + "learning_rate": 9.527607943913146e-06, + "loss": 0.08362597823143006, + "step": 210830 + }, + { + "epoch": 0.9051801859818139, + "grad_norm": 1.1244165897369385, + "learning_rate": 9.523296223795522e-06, + "loss": 0.2152784824371338, + "step": 210840 + }, + { + "epoch": 0.905223118071834, + "grad_norm": 0.003775811055675149, + "learning_rate": 9.518984503677897e-06, + "loss": 0.1833783507347107, + "step": 210850 + }, + { + "epoch": 0.905266050161854, + "grad_norm": 2.9542245864868164, + "learning_rate": 9.514672783560275e-06, + "loss": 0.30322258472442626, + "step": 210860 + }, + { + "epoch": 0.905308982251874, + "grad_norm": 0.00222378084436059, + "learning_rate": 9.51036106344265e-06, + "loss": 0.3002403020858765, + "step": 210870 + }, + { + "epoch": 0.905351914341894, + "grad_norm": 0.02239265665411949, + "learning_rate": 9.506049343325026e-06, + "loss": 0.027442681789398193, + "step": 210880 + }, + { + "epoch": 0.905394846431914, + "grad_norm": 0.09150674194097519, + "learning_rate": 9.501737623207403e-06, + "loss": 0.17352490425109862, + "step": 210890 + }, + { + "epoch": 0.905437778521934, + "grad_norm": 2.7677178382873535, + "learning_rate": 9.497425903089779e-06, + "loss": 0.22967958450317383, + "step": 210900 + }, + { + "epoch": 0.905480710611954, + "grad_norm": 0.7799781560897827, + "learning_rate": 9.493114182972156e-06, + "loss": 0.14056270122528075, + "step": 210910 + }, + { + "epoch": 0.905523642701974, + "grad_norm": 1.656139612197876, + "learning_rate": 9.488802462854532e-06, + "loss": 0.21149537563323975, + "step": 210920 + }, + { + "epoch": 0.905566574791994, + "grad_norm": 0.04829051345586777, + "learning_rate": 9.484490742736908e-06, + "loss": 0.2642256259918213, + "step": 210930 + }, + { + "epoch": 0.905609506882014, + "grad_norm": 0.02896782010793686, + "learning_rate": 9.480179022619285e-06, + "loss": 0.37795684337615965, + "step": 210940 + }, + { + "epoch": 0.9056524389720341, + "grad_norm": 0.1285865157842636, + "learning_rate": 9.47586730250166e-06, + "loss": 0.13206915855407714, + "step": 210950 + }, + { + "epoch": 0.905695371062054, + "grad_norm": 0.0013368797954171896, + "learning_rate": 9.471555582384036e-06, + "loss": 0.45220026969909666, + "step": 210960 + }, + { + "epoch": 0.905738303152074, + "grad_norm": 3.8433640003204346, + "learning_rate": 9.467243862266414e-06, + "loss": 0.2438734531402588, + "step": 210970 + }, + { + "epoch": 0.9057812352420941, + "grad_norm": 0.7117074728012085, + "learning_rate": 9.46293214214879e-06, + "loss": 0.15083757638931275, + "step": 210980 + }, + { + "epoch": 0.905824167332114, + "grad_norm": 1.0885868072509766, + "learning_rate": 9.458620422031165e-06, + "loss": 0.39199352264404297, + "step": 210990 + }, + { + "epoch": 0.9058670994221341, + "grad_norm": 0.20401152968406677, + "learning_rate": 9.454308701913542e-06, + "loss": 0.21293511390686035, + "step": 211000 + }, + { + "epoch": 0.9058670994221341, + "eval_loss": 0.37001845240592957, + "eval_runtime": 27.3722, + "eval_samples_per_second": 3.653, + "eval_steps_per_second": 3.653, + "step": 211000 + }, + { + "epoch": 0.9059100315121541, + "grad_norm": 1.95425546169281, + "learning_rate": 9.449996981795918e-06, + "loss": 0.20344960689544678, + "step": 211010 + }, + { + "epoch": 0.905952963602174, + "grad_norm": 0.04089081287384033, + "learning_rate": 9.445685261678294e-06, + "loss": 0.12624008655548097, + "step": 211020 + }, + { + "epoch": 0.9059958956921941, + "grad_norm": 0.2765030264854431, + "learning_rate": 9.441373541560671e-06, + "loss": 0.35709438323974607, + "step": 211030 + }, + { + "epoch": 0.9060388277822141, + "grad_norm": 0.0029109471943229437, + "learning_rate": 9.437061821443047e-06, + "loss": 0.086311936378479, + "step": 211040 + }, + { + "epoch": 0.906081759872234, + "grad_norm": 0.1786552518606186, + "learning_rate": 9.432750101325422e-06, + "loss": 0.19341384172439574, + "step": 211050 + }, + { + "epoch": 0.9061246919622541, + "grad_norm": 6.021338939666748, + "learning_rate": 9.4284383812078e-06, + "loss": 0.10097607374191284, + "step": 211060 + }, + { + "epoch": 0.9061676240522741, + "grad_norm": 0.04982906952500343, + "learning_rate": 9.424126661090175e-06, + "loss": 0.04032069146633148, + "step": 211070 + }, + { + "epoch": 0.9062105561422941, + "grad_norm": 0.0056654238142073154, + "learning_rate": 9.419814940972551e-06, + "loss": 0.07370581030845642, + "step": 211080 + }, + { + "epoch": 0.9062534882323141, + "grad_norm": 0.0005656041321344674, + "learning_rate": 9.415503220854928e-06, + "loss": 0.35269713401794434, + "step": 211090 + }, + { + "epoch": 0.9062964203223342, + "grad_norm": 0.04776056110858917, + "learning_rate": 9.411191500737304e-06, + "loss": 0.2374497890472412, + "step": 211100 + }, + { + "epoch": 0.9063393524123542, + "grad_norm": 0.15905514359474182, + "learning_rate": 9.40687978061968e-06, + "loss": 0.17020422220230103, + "step": 211110 + }, + { + "epoch": 0.9063822845023741, + "grad_norm": 0.006161512341350317, + "learning_rate": 9.402568060502057e-06, + "loss": 0.1747220277786255, + "step": 211120 + }, + { + "epoch": 0.9064252165923942, + "grad_norm": 6.532177925109863, + "learning_rate": 9.398256340384433e-06, + "loss": 0.2477626085281372, + "step": 211130 + }, + { + "epoch": 0.9064681486824142, + "grad_norm": 0.21067188680171967, + "learning_rate": 9.39394462026681e-06, + "loss": 0.07029619812965393, + "step": 211140 + }, + { + "epoch": 0.9065110807724341, + "grad_norm": 3.6878209114074707, + "learning_rate": 9.389632900149186e-06, + "loss": 0.22565534114837646, + "step": 211150 + }, + { + "epoch": 0.9065540128624542, + "grad_norm": 0.0373564250767231, + "learning_rate": 9.385321180031561e-06, + "loss": 0.38879761695861814, + "step": 211160 + }, + { + "epoch": 0.9065969449524742, + "grad_norm": 2.5509607791900635, + "learning_rate": 9.381009459913939e-06, + "loss": 0.28637146949768066, + "step": 211170 + }, + { + "epoch": 0.9066398770424942, + "grad_norm": 1.6686248779296875, + "learning_rate": 9.376697739796316e-06, + "loss": 0.28002395629882815, + "step": 211180 + }, + { + "epoch": 0.9066828091325142, + "grad_norm": 0.0021680134814232588, + "learning_rate": 9.372386019678692e-06, + "loss": 0.445535945892334, + "step": 211190 + }, + { + "epoch": 0.9067257412225342, + "grad_norm": 0.0014827148988842964, + "learning_rate": 9.368074299561067e-06, + "loss": 0.09994487166404724, + "step": 211200 + }, + { + "epoch": 0.9067686733125542, + "grad_norm": 0.12028845399618149, + "learning_rate": 9.363762579443445e-06, + "loss": 0.26585865020751953, + "step": 211210 + }, + { + "epoch": 0.9068116054025742, + "grad_norm": 0.941667914390564, + "learning_rate": 9.35945085932582e-06, + "loss": 0.39935662746429446, + "step": 211220 + }, + { + "epoch": 0.9068545374925943, + "grad_norm": 3.0790231227874756, + "learning_rate": 9.355139139208196e-06, + "loss": 0.1506575345993042, + "step": 211230 + }, + { + "epoch": 0.9068974695826142, + "grad_norm": 2.296433687210083, + "learning_rate": 9.350827419090573e-06, + "loss": 0.15209686756134033, + "step": 211240 + }, + { + "epoch": 0.9069404016726342, + "grad_norm": 0.02622704952955246, + "learning_rate": 9.346515698972949e-06, + "loss": 0.11668375730514527, + "step": 211250 + }, + { + "epoch": 0.9069833337626543, + "grad_norm": 0.0014148653717711568, + "learning_rate": 9.342203978855326e-06, + "loss": 0.27220356464385986, + "step": 211260 + }, + { + "epoch": 0.9070262658526742, + "grad_norm": 0.12697049975395203, + "learning_rate": 9.337892258737702e-06, + "loss": 0.23106029033660888, + "step": 211270 + }, + { + "epoch": 0.9070691979426942, + "grad_norm": 0.04394913837313652, + "learning_rate": 9.333580538620077e-06, + "loss": 0.15961995124816894, + "step": 211280 + }, + { + "epoch": 0.9071121300327143, + "grad_norm": 3.6546521186828613, + "learning_rate": 9.329268818502455e-06, + "loss": 0.2648077249526978, + "step": 211290 + }, + { + "epoch": 0.9071550621227342, + "grad_norm": 2.3274571895599365, + "learning_rate": 9.32495709838483e-06, + "loss": 0.2588292837142944, + "step": 211300 + }, + { + "epoch": 0.9071979942127543, + "grad_norm": 0.2122945338487625, + "learning_rate": 9.320645378267206e-06, + "loss": 0.2704058885574341, + "step": 211310 + }, + { + "epoch": 0.9072409263027743, + "grad_norm": 0.0172574520111084, + "learning_rate": 9.316333658149583e-06, + "loss": 0.34654572010040285, + "step": 211320 + }, + { + "epoch": 0.9072838583927942, + "grad_norm": 0.005942110437899828, + "learning_rate": 9.312021938031959e-06, + "loss": 0.14186539649963378, + "step": 211330 + }, + { + "epoch": 0.9073267904828143, + "grad_norm": 0.016240037977695465, + "learning_rate": 9.307710217914335e-06, + "loss": 0.07858587503433227, + "step": 211340 + }, + { + "epoch": 0.9073697225728343, + "grad_norm": 0.010405509732663631, + "learning_rate": 9.303398497796712e-06, + "loss": 0.18252902030944823, + "step": 211350 + }, + { + "epoch": 0.9074126546628543, + "grad_norm": 0.0003283452242612839, + "learning_rate": 9.299086777679088e-06, + "loss": 0.21496164798736572, + "step": 211360 + }, + { + "epoch": 0.9074555867528743, + "grad_norm": 0.1149737760424614, + "learning_rate": 9.294775057561463e-06, + "loss": 0.21276600360870362, + "step": 211370 + }, + { + "epoch": 0.9074985188428943, + "grad_norm": 1.8674908876419067, + "learning_rate": 9.29046333744384e-06, + "loss": 0.2720707178115845, + "step": 211380 + }, + { + "epoch": 0.9075414509329143, + "grad_norm": 0.16619011759757996, + "learning_rate": 9.286151617326216e-06, + "loss": 0.14984419345855712, + "step": 211390 + }, + { + "epoch": 0.9075843830229343, + "grad_norm": 2.006049156188965, + "learning_rate": 9.281839897208592e-06, + "loss": 0.11598966121673585, + "step": 211400 + }, + { + "epoch": 0.9076273151129544, + "grad_norm": 0.18155886232852936, + "learning_rate": 9.27752817709097e-06, + "loss": 0.24428968429565429, + "step": 211410 + }, + { + "epoch": 0.9076702472029743, + "grad_norm": 0.004149468149989843, + "learning_rate": 9.273216456973345e-06, + "loss": 0.16037646532058716, + "step": 211420 + }, + { + "epoch": 0.9077131792929943, + "grad_norm": 0.004715626128017902, + "learning_rate": 9.26890473685572e-06, + "loss": 0.13179171085357666, + "step": 211430 + }, + { + "epoch": 0.9077561113830144, + "grad_norm": 0.010400134138762951, + "learning_rate": 9.264593016738098e-06, + "loss": 0.03884969651699066, + "step": 211440 + }, + { + "epoch": 0.9077990434730343, + "grad_norm": 0.022108979523181915, + "learning_rate": 9.260281296620474e-06, + "loss": 0.3082466125488281, + "step": 211450 + }, + { + "epoch": 0.9078419755630543, + "grad_norm": 3.668081045150757, + "learning_rate": 9.25596957650285e-06, + "loss": 0.19297831058502196, + "step": 211460 + }, + { + "epoch": 0.9078849076530744, + "grad_norm": 1.232636570930481, + "learning_rate": 9.251657856385227e-06, + "loss": 0.372821044921875, + "step": 211470 + }, + { + "epoch": 0.9079278397430943, + "grad_norm": 1.6673029661178589, + "learning_rate": 9.247346136267602e-06, + "loss": 0.19048078060150148, + "step": 211480 + }, + { + "epoch": 0.9079707718331144, + "grad_norm": 0.03407606855034828, + "learning_rate": 9.243034416149978e-06, + "loss": 0.17276411056518554, + "step": 211490 + }, + { + "epoch": 0.9080137039231344, + "grad_norm": 0.004653999116271734, + "learning_rate": 9.238722696032355e-06, + "loss": 0.18268084526062012, + "step": 211500 + }, + { + "epoch": 0.9080566360131543, + "grad_norm": 0.010216489434242249, + "learning_rate": 9.234410975914731e-06, + "loss": 0.2223001480102539, + "step": 211510 + }, + { + "epoch": 0.9080995681031744, + "grad_norm": 0.30999666452407837, + "learning_rate": 9.230099255797108e-06, + "loss": 0.3233329296112061, + "step": 211520 + }, + { + "epoch": 0.9081425001931944, + "grad_norm": 0.0008284636423923075, + "learning_rate": 9.225787535679484e-06, + "loss": 0.05378011465072632, + "step": 211530 + }, + { + "epoch": 0.9081854322832145, + "grad_norm": 0.007144168484956026, + "learning_rate": 9.221475815561861e-06, + "loss": 0.09841606616973878, + "step": 211540 + }, + { + "epoch": 0.9082283643732344, + "grad_norm": 0.050571274012327194, + "learning_rate": 9.217164095444237e-06, + "loss": 0.162974750995636, + "step": 211550 + }, + { + "epoch": 0.9082712964632544, + "grad_norm": 0.004725494422018528, + "learning_rate": 9.212852375326614e-06, + "loss": 0.2045898199081421, + "step": 211560 + }, + { + "epoch": 0.9083142285532745, + "grad_norm": 0.09760285913944244, + "learning_rate": 9.20854065520899e-06, + "loss": 0.1423335075378418, + "step": 211570 + }, + { + "epoch": 0.9083571606432944, + "grad_norm": 1.1642075777053833, + "learning_rate": 9.204228935091366e-06, + "loss": 0.06477875113487244, + "step": 211580 + }, + { + "epoch": 0.9084000927333145, + "grad_norm": 0.21785888075828552, + "learning_rate": 9.199917214973743e-06, + "loss": 0.22252657413482665, + "step": 211590 + }, + { + "epoch": 0.9084430248233345, + "grad_norm": 0.015162704512476921, + "learning_rate": 9.195605494856119e-06, + "loss": 0.2382740259170532, + "step": 211600 + }, + { + "epoch": 0.9084859569133544, + "grad_norm": 0.0008446506108157337, + "learning_rate": 9.191293774738494e-06, + "loss": 0.16340986490249634, + "step": 211610 + }, + { + "epoch": 0.9085288890033745, + "grad_norm": 0.006048992741852999, + "learning_rate": 9.186982054620872e-06, + "loss": 0.09929171800613404, + "step": 211620 + }, + { + "epoch": 0.9085718210933945, + "grad_norm": 0.021925954148173332, + "learning_rate": 9.182670334503247e-06, + "loss": 0.12175835371017456, + "step": 211630 + }, + { + "epoch": 0.9086147531834144, + "grad_norm": 0.0023411933798342943, + "learning_rate": 9.178358614385625e-06, + "loss": 0.1825516939163208, + "step": 211640 + }, + { + "epoch": 0.9086576852734345, + "grad_norm": 0.113210529088974, + "learning_rate": 9.174046894268e-06, + "loss": 0.23193886280059814, + "step": 211650 + }, + { + "epoch": 0.9087006173634545, + "grad_norm": 0.00030858899117447436, + "learning_rate": 9.169735174150376e-06, + "loss": 0.22229115962982177, + "step": 211660 + }, + { + "epoch": 0.9087435494534745, + "grad_norm": 0.0011662208708003163, + "learning_rate": 9.165423454032753e-06, + "loss": 0.15364946126937867, + "step": 211670 + }, + { + "epoch": 0.9087864815434945, + "grad_norm": 4.0690460205078125, + "learning_rate": 9.161111733915129e-06, + "loss": 0.18665337562561035, + "step": 211680 + }, + { + "epoch": 0.9088294136335145, + "grad_norm": 0.00017444766126573086, + "learning_rate": 9.156800013797504e-06, + "loss": 0.1485775351524353, + "step": 211690 + }, + { + "epoch": 0.9088723457235345, + "grad_norm": 0.768441379070282, + "learning_rate": 9.152488293679882e-06, + "loss": 0.14072189331054688, + "step": 211700 + }, + { + "epoch": 0.9089152778135545, + "grad_norm": 0.07776512950658798, + "learning_rate": 9.148176573562257e-06, + "loss": 0.25699448585510254, + "step": 211710 + }, + { + "epoch": 0.9089582099035746, + "grad_norm": 0.013703049160540104, + "learning_rate": 9.143864853444633e-06, + "loss": 0.21622865200042723, + "step": 211720 + }, + { + "epoch": 0.9090011419935945, + "grad_norm": 0.17358940839767456, + "learning_rate": 9.13955313332701e-06, + "loss": 0.10401270389556885, + "step": 211730 + }, + { + "epoch": 0.9090440740836145, + "grad_norm": 0.0002648585650604218, + "learning_rate": 9.135241413209386e-06, + "loss": 0.1151541829109192, + "step": 211740 + }, + { + "epoch": 0.9090870061736346, + "grad_norm": 4.667215824127197, + "learning_rate": 9.130929693091762e-06, + "loss": 0.23117105960845946, + "step": 211750 + }, + { + "epoch": 0.9091299382636545, + "grad_norm": 1.013177514076233, + "learning_rate": 9.126617972974139e-06, + "loss": 0.23333847522735596, + "step": 211760 + }, + { + "epoch": 0.9091728703536746, + "grad_norm": 1.5600218772888184, + "learning_rate": 9.122306252856515e-06, + "loss": 0.15483250617980956, + "step": 211770 + }, + { + "epoch": 0.9092158024436946, + "grad_norm": 0.0004753020766656846, + "learning_rate": 9.11799453273889e-06, + "loss": 0.2777076244354248, + "step": 211780 + }, + { + "epoch": 0.9092587345337145, + "grad_norm": 0.014906748197972775, + "learning_rate": 9.113682812621268e-06, + "loss": 0.14254034757614137, + "step": 211790 + }, + { + "epoch": 0.9093016666237346, + "grad_norm": 0.22179877758026123, + "learning_rate": 9.109371092503643e-06, + "loss": 0.14603278636932374, + "step": 211800 + }, + { + "epoch": 0.9093445987137546, + "grad_norm": 0.06097375229001045, + "learning_rate": 9.105059372386019e-06, + "loss": 0.18807802200317383, + "step": 211810 + }, + { + "epoch": 0.9093875308037745, + "grad_norm": 0.002239939058199525, + "learning_rate": 9.100747652268396e-06, + "loss": 0.15530016422271728, + "step": 211820 + }, + { + "epoch": 0.9094304628937946, + "grad_norm": 51.984317779541016, + "learning_rate": 9.096435932150772e-06, + "loss": 0.12407512664794922, + "step": 211830 + }, + { + "epoch": 0.9094733949838146, + "grad_norm": 7.28548526763916, + "learning_rate": 9.092124212033148e-06, + "loss": 0.21014230251312255, + "step": 211840 + }, + { + "epoch": 0.9095163270738346, + "grad_norm": 0.004259404726326466, + "learning_rate": 9.087812491915525e-06, + "loss": 0.1882183074951172, + "step": 211850 + }, + { + "epoch": 0.9095592591638546, + "grad_norm": 0.014772419817745686, + "learning_rate": 9.0835007717979e-06, + "loss": 0.147011661529541, + "step": 211860 + }, + { + "epoch": 0.9096021912538746, + "grad_norm": 0.49753573536872864, + "learning_rate": 9.079189051680276e-06, + "loss": 0.29521842002868653, + "step": 211870 + }, + { + "epoch": 0.9096451233438946, + "grad_norm": 0.015458498150110245, + "learning_rate": 9.074877331562654e-06, + "loss": 0.1639685869216919, + "step": 211880 + }, + { + "epoch": 0.9096880554339146, + "grad_norm": 2.9252068996429443, + "learning_rate": 9.07056561144503e-06, + "loss": 0.18361132144927977, + "step": 211890 + }, + { + "epoch": 0.9097309875239347, + "grad_norm": 0.1760420799255371, + "learning_rate": 9.066253891327407e-06, + "loss": 0.11042402982711792, + "step": 211900 + }, + { + "epoch": 0.9097739196139546, + "grad_norm": 0.00045009804307483137, + "learning_rate": 9.061942171209784e-06, + "loss": 0.2616447925567627, + "step": 211910 + }, + { + "epoch": 0.9098168517039746, + "grad_norm": 0.006046078633517027, + "learning_rate": 9.05763045109216e-06, + "loss": 0.06336274743080139, + "step": 211920 + }, + { + "epoch": 0.9098597837939947, + "grad_norm": 0.051460232585668564, + "learning_rate": 9.053318730974535e-06, + "loss": 0.18316928148269654, + "step": 211930 + }, + { + "epoch": 0.9099027158840146, + "grad_norm": 2.8884639739990234, + "learning_rate": 9.049007010856913e-06, + "loss": 0.13370351791381835, + "step": 211940 + }, + { + "epoch": 0.9099456479740347, + "grad_norm": 0.01390179991722107, + "learning_rate": 9.044695290739288e-06, + "loss": 0.3180847644805908, + "step": 211950 + }, + { + "epoch": 0.9099885800640547, + "grad_norm": 1.4461475610733032, + "learning_rate": 9.040383570621664e-06, + "loss": 0.12614543437957765, + "step": 211960 + }, + { + "epoch": 0.9100315121540747, + "grad_norm": 0.0034605904947966337, + "learning_rate": 9.036071850504041e-06, + "loss": 0.2389918804168701, + "step": 211970 + }, + { + "epoch": 0.9100744442440947, + "grad_norm": 0.00030550433439202607, + "learning_rate": 9.031760130386417e-06, + "loss": 0.04597398638725281, + "step": 211980 + }, + { + "epoch": 0.9101173763341147, + "grad_norm": 0.091501384973526, + "learning_rate": 9.027448410268793e-06, + "loss": 0.023420873284339904, + "step": 211990 + }, + { + "epoch": 0.9101603084241348, + "grad_norm": 0.005600926466286182, + "learning_rate": 9.02313669015117e-06, + "loss": 0.07838650941848754, + "step": 212000 + }, + { + "epoch": 0.9101603084241348, + "eval_loss": 0.37726593017578125, + "eval_runtime": 27.4014, + "eval_samples_per_second": 3.649, + "eval_steps_per_second": 3.649, + "step": 212000 + }, + { + "epoch": 0.9102032405141547, + "grad_norm": 0.11374276131391525, + "learning_rate": 9.018824970033546e-06, + "loss": 0.15132025480270386, + "step": 212010 + }, + { + "epoch": 0.9102461726041747, + "grad_norm": 2.1611249446868896, + "learning_rate": 9.014513249915923e-06, + "loss": 0.21653666496276855, + "step": 212020 + }, + { + "epoch": 0.9102891046941948, + "grad_norm": 0.1119178906083107, + "learning_rate": 9.010201529798299e-06, + "loss": 0.19596610069274903, + "step": 212030 + }, + { + "epoch": 0.9103320367842147, + "grad_norm": 0.03157566860318184, + "learning_rate": 9.005889809680674e-06, + "loss": 0.17952189445495606, + "step": 212040 + }, + { + "epoch": 0.9103749688742347, + "grad_norm": 0.029103923588991165, + "learning_rate": 9.001578089563052e-06, + "loss": 0.17300734519958497, + "step": 212050 + }, + { + "epoch": 0.9104179009642548, + "grad_norm": 0.058678023517131805, + "learning_rate": 8.997266369445427e-06, + "loss": 0.05254848003387451, + "step": 212060 + }, + { + "epoch": 0.9104608330542747, + "grad_norm": 2.070000648498535, + "learning_rate": 8.992954649327803e-06, + "loss": 0.2808130502700806, + "step": 212070 + }, + { + "epoch": 0.9105037651442948, + "grad_norm": 0.09030576795339584, + "learning_rate": 8.98864292921018e-06, + "loss": 0.17629364728927613, + "step": 212080 + }, + { + "epoch": 0.9105466972343148, + "grad_norm": 1.5905768871307373, + "learning_rate": 8.984331209092556e-06, + "loss": 0.3039500951766968, + "step": 212090 + }, + { + "epoch": 0.9105896293243347, + "grad_norm": 0.016770660877227783, + "learning_rate": 8.980019488974931e-06, + "loss": 0.12196837663650513, + "step": 212100 + }, + { + "epoch": 0.9106325614143548, + "grad_norm": 0.0024727019481360912, + "learning_rate": 8.975707768857309e-06, + "loss": 0.18381850719451903, + "step": 212110 + }, + { + "epoch": 0.9106754935043748, + "grad_norm": 0.027349425479769707, + "learning_rate": 8.971396048739684e-06, + "loss": 0.16867611408233643, + "step": 212120 + }, + { + "epoch": 0.9107184255943948, + "grad_norm": 0.14424623548984528, + "learning_rate": 8.96708432862206e-06, + "loss": 0.18001662492752074, + "step": 212130 + }, + { + "epoch": 0.9107613576844148, + "grad_norm": 0.02970905415713787, + "learning_rate": 8.962772608504437e-06, + "loss": 0.21140286922454835, + "step": 212140 + }, + { + "epoch": 0.9108042897744348, + "grad_norm": 1.0737354755401611, + "learning_rate": 8.958460888386813e-06, + "loss": 0.2667649269104004, + "step": 212150 + }, + { + "epoch": 0.9108472218644548, + "grad_norm": 0.9957414269447327, + "learning_rate": 8.954149168269189e-06, + "loss": 0.2296832323074341, + "step": 212160 + }, + { + "epoch": 0.9108901539544748, + "grad_norm": 0.07176550477743149, + "learning_rate": 8.949837448151566e-06, + "loss": 0.07566173076629638, + "step": 212170 + }, + { + "epoch": 0.9109330860444949, + "grad_norm": 28.415855407714844, + "learning_rate": 8.945525728033942e-06, + "loss": 0.10729323625564575, + "step": 212180 + }, + { + "epoch": 0.9109760181345148, + "grad_norm": 0.0053515927866101265, + "learning_rate": 8.941214007916317e-06, + "loss": 0.031461399793624875, + "step": 212190 + }, + { + "epoch": 0.9110189502245348, + "grad_norm": 3.31968092918396, + "learning_rate": 8.936902287798695e-06, + "loss": 0.0983917772769928, + "step": 212200 + }, + { + "epoch": 0.9110618823145549, + "grad_norm": 0.0015259032370522618, + "learning_rate": 8.93259056768107e-06, + "loss": 0.049478965997695926, + "step": 212210 + }, + { + "epoch": 0.9111048144045748, + "grad_norm": 7.493229866027832, + "learning_rate": 8.928278847563446e-06, + "loss": 0.11486443281173705, + "step": 212220 + }, + { + "epoch": 0.9111477464945948, + "grad_norm": 0.7252117991447449, + "learning_rate": 8.923967127445823e-06, + "loss": 0.25599918365478513, + "step": 212230 + }, + { + "epoch": 0.9111906785846149, + "grad_norm": 0.6110628843307495, + "learning_rate": 8.919655407328199e-06, + "loss": 0.2323148250579834, + "step": 212240 + }, + { + "epoch": 0.9112336106746348, + "grad_norm": 15.844795227050781, + "learning_rate": 8.915343687210576e-06, + "loss": 0.20794005393981935, + "step": 212250 + }, + { + "epoch": 0.9112765427646549, + "grad_norm": 1.656082034111023, + "learning_rate": 8.911031967092954e-06, + "loss": 0.14436640739440917, + "step": 212260 + }, + { + "epoch": 0.9113194748546749, + "grad_norm": 0.0013086560647934675, + "learning_rate": 8.90672024697533e-06, + "loss": 0.35257253646850584, + "step": 212270 + }, + { + "epoch": 0.9113624069446948, + "grad_norm": 0.011829572729766369, + "learning_rate": 8.902408526857705e-06, + "loss": 0.13194645643234254, + "step": 212280 + }, + { + "epoch": 0.9114053390347149, + "grad_norm": 3.3504648208618164, + "learning_rate": 8.898096806740082e-06, + "loss": 0.27801511287689207, + "step": 212290 + }, + { + "epoch": 0.9114482711247349, + "grad_norm": 0.08066578209400177, + "learning_rate": 8.893785086622458e-06, + "loss": 0.1585480809211731, + "step": 212300 + }, + { + "epoch": 0.9114912032147549, + "grad_norm": 0.1084263026714325, + "learning_rate": 8.889473366504834e-06, + "loss": 0.1131742000579834, + "step": 212310 + }, + { + "epoch": 0.9115341353047749, + "grad_norm": 0.308685302734375, + "learning_rate": 8.885161646387211e-06, + "loss": 0.1919371724128723, + "step": 212320 + }, + { + "epoch": 0.9115770673947949, + "grad_norm": 0.10141505300998688, + "learning_rate": 8.880849926269587e-06, + "loss": 0.11400158405303955, + "step": 212330 + }, + { + "epoch": 0.9116199994848149, + "grad_norm": 0.0039230696856975555, + "learning_rate": 8.876538206151962e-06, + "loss": 0.05479676723480224, + "step": 212340 + }, + { + "epoch": 0.9116629315748349, + "grad_norm": 0.024995839223265648, + "learning_rate": 8.87222648603434e-06, + "loss": 0.15745952129364013, + "step": 212350 + }, + { + "epoch": 0.911705863664855, + "grad_norm": 0.0010867923265323043, + "learning_rate": 8.867914765916715e-06, + "loss": 0.3819491624832153, + "step": 212360 + }, + { + "epoch": 0.9117487957548749, + "grad_norm": 1.1967419385910034, + "learning_rate": 8.863603045799091e-06, + "loss": 0.475917911529541, + "step": 212370 + }, + { + "epoch": 0.9117917278448949, + "grad_norm": 1.2156577110290527, + "learning_rate": 8.859291325681468e-06, + "loss": 0.24176313877105712, + "step": 212380 + }, + { + "epoch": 0.911834659934915, + "grad_norm": 0.0021465690806508064, + "learning_rate": 8.854979605563844e-06, + "loss": 0.23308084011077881, + "step": 212390 + }, + { + "epoch": 0.911877592024935, + "grad_norm": 2.2919228076934814, + "learning_rate": 8.850667885446221e-06, + "loss": 0.11274752616882325, + "step": 212400 + }, + { + "epoch": 0.911920524114955, + "grad_norm": 3.1902475357055664, + "learning_rate": 8.846356165328597e-06, + "loss": 0.16625409126281737, + "step": 212410 + }, + { + "epoch": 0.911963456204975, + "grad_norm": 0.0027463510632514954, + "learning_rate": 8.842044445210973e-06, + "loss": 0.1264907717704773, + "step": 212420 + }, + { + "epoch": 0.912006388294995, + "grad_norm": 1.7046873569488525, + "learning_rate": 8.83773272509335e-06, + "loss": 0.22326078414916992, + "step": 212430 + }, + { + "epoch": 0.912049320385015, + "grad_norm": 0.041218217462301254, + "learning_rate": 8.833421004975726e-06, + "loss": 0.26831223964691164, + "step": 212440 + }, + { + "epoch": 0.912092252475035, + "grad_norm": 6.816025733947754, + "learning_rate": 8.829109284858101e-06, + "loss": 0.36359567642211915, + "step": 212450 + }, + { + "epoch": 0.912135184565055, + "grad_norm": 0.5391436815261841, + "learning_rate": 8.824797564740479e-06, + "loss": 0.19195263385772704, + "step": 212460 + }, + { + "epoch": 0.912178116655075, + "grad_norm": 1.583741307258606, + "learning_rate": 8.820485844622854e-06, + "loss": 0.35937676429748533, + "step": 212470 + }, + { + "epoch": 0.912221048745095, + "grad_norm": 2.529594898223877, + "learning_rate": 8.81617412450523e-06, + "loss": 0.18012168407440185, + "step": 212480 + }, + { + "epoch": 0.9122639808351151, + "grad_norm": 3.7486445903778076, + "learning_rate": 8.811862404387607e-06, + "loss": 0.21492357254028321, + "step": 212490 + }, + { + "epoch": 0.912306912925135, + "grad_norm": 0.006154716946184635, + "learning_rate": 8.807550684269983e-06, + "loss": 0.16618551015853883, + "step": 212500 + }, + { + "epoch": 0.912349845015155, + "grad_norm": 0.2932658791542053, + "learning_rate": 8.803238964152359e-06, + "loss": 0.08020783066749573, + "step": 212510 + }, + { + "epoch": 0.9123927771051751, + "grad_norm": 0.908278226852417, + "learning_rate": 8.798927244034736e-06, + "loss": 0.0777955710887909, + "step": 212520 + }, + { + "epoch": 0.912435709195195, + "grad_norm": 0.0029801647178828716, + "learning_rate": 8.794615523917112e-06, + "loss": 0.02414112240076065, + "step": 212530 + }, + { + "epoch": 0.912478641285215, + "grad_norm": 0.0029607617761939764, + "learning_rate": 8.790303803799487e-06, + "loss": 0.24954428672790527, + "step": 212540 + }, + { + "epoch": 0.9125215733752351, + "grad_norm": 1.418501377105713, + "learning_rate": 8.785992083681864e-06, + "loss": 0.06055132150650024, + "step": 212550 + }, + { + "epoch": 0.912564505465255, + "grad_norm": 1.9168803691864014, + "learning_rate": 8.78168036356424e-06, + "loss": 0.17470144033432006, + "step": 212560 + }, + { + "epoch": 0.9126074375552751, + "grad_norm": 0.02751356177031994, + "learning_rate": 8.777368643446616e-06, + "loss": 0.0252609521150589, + "step": 212570 + }, + { + "epoch": 0.9126503696452951, + "grad_norm": 4.568458080291748, + "learning_rate": 8.773056923328993e-06, + "loss": 0.14840620756149292, + "step": 212580 + }, + { + "epoch": 0.912693301735315, + "grad_norm": 2.3897361755371094, + "learning_rate": 8.768745203211369e-06, + "loss": 0.18637797832489014, + "step": 212590 + }, + { + "epoch": 0.9127362338253351, + "grad_norm": 0.04232500493526459, + "learning_rate": 8.764433483093744e-06, + "loss": 0.16503369808197021, + "step": 212600 + }, + { + "epoch": 0.9127791659153551, + "grad_norm": 1.7149666547775269, + "learning_rate": 8.760121762976122e-06, + "loss": 0.13525526523590087, + "step": 212610 + }, + { + "epoch": 0.9128220980053751, + "grad_norm": 2.110562324523926, + "learning_rate": 8.755810042858497e-06, + "loss": 0.13837047815322875, + "step": 212620 + }, + { + "epoch": 0.9128650300953951, + "grad_norm": 9.753104209899902, + "learning_rate": 8.751498322740875e-06, + "loss": 0.30054826736450196, + "step": 212630 + }, + { + "epoch": 0.9129079621854151, + "grad_norm": 3.0043880939483643, + "learning_rate": 8.747186602623252e-06, + "loss": 0.11299515962600708, + "step": 212640 + }, + { + "epoch": 0.9129508942754351, + "grad_norm": 0.38159698247909546, + "learning_rate": 8.742874882505628e-06, + "loss": 0.06615483164787292, + "step": 212650 + }, + { + "epoch": 0.9129938263654551, + "grad_norm": 0.0005800298531539738, + "learning_rate": 8.738563162388003e-06, + "loss": 0.18061435222625732, + "step": 212660 + }, + { + "epoch": 0.9130367584554752, + "grad_norm": 0.24667766690254211, + "learning_rate": 8.73425144227038e-06, + "loss": 0.1562572717666626, + "step": 212670 + }, + { + "epoch": 0.9130796905454951, + "grad_norm": 0.057288434356451035, + "learning_rate": 8.729939722152756e-06, + "loss": 0.33345324993133546, + "step": 212680 + }, + { + "epoch": 0.9131226226355151, + "grad_norm": 1.1795369386672974, + "learning_rate": 8.725628002035132e-06, + "loss": 0.3631131172180176, + "step": 212690 + }, + { + "epoch": 0.9131655547255352, + "grad_norm": 0.0925176739692688, + "learning_rate": 8.72131628191751e-06, + "loss": 0.06134541034698486, + "step": 212700 + }, + { + "epoch": 0.9132084868155551, + "grad_norm": 2.638174057006836, + "learning_rate": 8.717004561799885e-06, + "loss": 0.14020242691040039, + "step": 212710 + }, + { + "epoch": 0.9132514189055752, + "grad_norm": 0.9038133025169373, + "learning_rate": 8.71269284168226e-06, + "loss": 0.08528336286544799, + "step": 212720 + }, + { + "epoch": 0.9132943509955952, + "grad_norm": 0.0008694896241649985, + "learning_rate": 8.708381121564638e-06, + "loss": 0.053201431035995485, + "step": 212730 + }, + { + "epoch": 0.9133372830856151, + "grad_norm": 2.8368773460388184, + "learning_rate": 8.704069401447014e-06, + "loss": 0.29605350494384763, + "step": 212740 + }, + { + "epoch": 0.9133802151756352, + "grad_norm": 0.001284956349991262, + "learning_rate": 8.69975768132939e-06, + "loss": 0.17406622171401978, + "step": 212750 + }, + { + "epoch": 0.9134231472656552, + "grad_norm": 0.0011331437854096293, + "learning_rate": 8.695445961211767e-06, + "loss": 0.20874719619750975, + "step": 212760 + }, + { + "epoch": 0.9134660793556751, + "grad_norm": 0.1996951401233673, + "learning_rate": 8.691134241094142e-06, + "loss": 0.13332525491714478, + "step": 212770 + }, + { + "epoch": 0.9135090114456952, + "grad_norm": 0.0020294704008847475, + "learning_rate": 8.68682252097652e-06, + "loss": 0.10547182559967042, + "step": 212780 + }, + { + "epoch": 0.9135519435357152, + "grad_norm": 0.0018016091780737042, + "learning_rate": 8.682510800858895e-06, + "loss": 0.2505123376846313, + "step": 212790 + }, + { + "epoch": 0.9135948756257352, + "grad_norm": 1.4915944337844849, + "learning_rate": 8.678199080741271e-06, + "loss": 0.20534911155700683, + "step": 212800 + }, + { + "epoch": 0.9136378077157552, + "grad_norm": 0.02371162362396717, + "learning_rate": 8.673887360623648e-06, + "loss": 0.0034074489027261733, + "step": 212810 + }, + { + "epoch": 0.9136807398057752, + "grad_norm": 0.0006841256399638951, + "learning_rate": 8.669575640506024e-06, + "loss": 0.19988794326782228, + "step": 212820 + }, + { + "epoch": 0.9137236718957953, + "grad_norm": 0.1977357715368271, + "learning_rate": 8.6652639203884e-06, + "loss": 0.1504676103591919, + "step": 212830 + }, + { + "epoch": 0.9137666039858152, + "grad_norm": 1.7436408996582031, + "learning_rate": 8.660952200270777e-06, + "loss": 0.2765189170837402, + "step": 212840 + }, + { + "epoch": 0.9138095360758353, + "grad_norm": 0.013714958913624287, + "learning_rate": 8.656640480153153e-06, + "loss": 0.12662038803100586, + "step": 212850 + }, + { + "epoch": 0.9138524681658553, + "grad_norm": 5.23076868057251, + "learning_rate": 8.652328760035528e-06, + "loss": 0.1594509482383728, + "step": 212860 + }, + { + "epoch": 0.9138954002558752, + "grad_norm": 0.027277108281850815, + "learning_rate": 8.648017039917906e-06, + "loss": 0.1075689435005188, + "step": 212870 + }, + { + "epoch": 0.9139383323458953, + "grad_norm": 1.4043325185775757, + "learning_rate": 8.643705319800281e-06, + "loss": 0.25166323184967043, + "step": 212880 + }, + { + "epoch": 0.9139812644359153, + "grad_norm": 0.04628121107816696, + "learning_rate": 8.639393599682657e-06, + "loss": 0.004422901198267937, + "step": 212890 + }, + { + "epoch": 0.9140241965259353, + "grad_norm": 0.02353578992187977, + "learning_rate": 8.635081879565034e-06, + "loss": 0.2293551445007324, + "step": 212900 + }, + { + "epoch": 0.9140671286159553, + "grad_norm": 3.4110593795776367, + "learning_rate": 8.63077015944741e-06, + "loss": 0.29389586448669436, + "step": 212910 + }, + { + "epoch": 0.9141100607059753, + "grad_norm": 0.4436527192592621, + "learning_rate": 8.626458439329786e-06, + "loss": 0.06104323863983154, + "step": 212920 + }, + { + "epoch": 0.9141529927959953, + "grad_norm": 0.06245763599872589, + "learning_rate": 8.622146719212163e-06, + "loss": 0.18613338470458984, + "step": 212930 + }, + { + "epoch": 0.9141959248860153, + "grad_norm": 1.623071551322937, + "learning_rate": 8.617834999094539e-06, + "loss": 0.1850024938583374, + "step": 212940 + }, + { + "epoch": 0.9142388569760354, + "grad_norm": 0.0883312001824379, + "learning_rate": 8.613523278976914e-06, + "loss": 0.1974207043647766, + "step": 212950 + }, + { + "epoch": 0.9142817890660553, + "grad_norm": 0.5963457822799683, + "learning_rate": 8.609211558859292e-06, + "loss": 0.24655821323394775, + "step": 212960 + }, + { + "epoch": 0.9143247211560753, + "grad_norm": 1.2836028337478638, + "learning_rate": 8.604899838741667e-06, + "loss": 0.23074700832366943, + "step": 212970 + }, + { + "epoch": 0.9143676532460954, + "grad_norm": 1.9236855506896973, + "learning_rate": 8.600588118624043e-06, + "loss": 0.16037309169769287, + "step": 212980 + }, + { + "epoch": 0.9144105853361153, + "grad_norm": 0.0057564787566661835, + "learning_rate": 8.596276398506422e-06, + "loss": 0.31025793552398684, + "step": 212990 + }, + { + "epoch": 0.9144535174261353, + "grad_norm": 2.740952968597412, + "learning_rate": 8.591964678388797e-06, + "loss": 0.2665108680725098, + "step": 213000 + }, + { + "epoch": 0.9144535174261353, + "eval_loss": 0.37361475825309753, + "eval_runtime": 27.4134, + "eval_samples_per_second": 3.648, + "eval_steps_per_second": 3.648, + "step": 213000 + }, + { + "epoch": 0.9144964495161554, + "grad_norm": 1.782199501991272, + "learning_rate": 8.587652958271173e-06, + "loss": 0.2488037109375, + "step": 213010 + }, + { + "epoch": 0.9145393816061753, + "grad_norm": 1.5992531776428223, + "learning_rate": 8.58334123815355e-06, + "loss": 0.34330708980560304, + "step": 213020 + }, + { + "epoch": 0.9145823136961954, + "grad_norm": 0.001826804713346064, + "learning_rate": 8.579029518035926e-06, + "loss": 0.2511431217193604, + "step": 213030 + }, + { + "epoch": 0.9146252457862154, + "grad_norm": 0.2493457943201065, + "learning_rate": 8.574717797918302e-06, + "loss": 0.2527246713638306, + "step": 213040 + }, + { + "epoch": 0.9146681778762353, + "grad_norm": 0.4726060628890991, + "learning_rate": 8.570406077800679e-06, + "loss": 0.12096805572509765, + "step": 213050 + }, + { + "epoch": 0.9147111099662554, + "grad_norm": 0.033890970051288605, + "learning_rate": 8.566094357683055e-06, + "loss": 0.3322614192962646, + "step": 213060 + }, + { + "epoch": 0.9147540420562754, + "grad_norm": 0.0006106910877861083, + "learning_rate": 8.56178263756543e-06, + "loss": 0.1534043788909912, + "step": 213070 + }, + { + "epoch": 0.9147969741462953, + "grad_norm": 0.5903043746948242, + "learning_rate": 8.557470917447808e-06, + "loss": 0.12563472986221313, + "step": 213080 + }, + { + "epoch": 0.9148399062363154, + "grad_norm": 0.07464814186096191, + "learning_rate": 8.553159197330183e-06, + "loss": 0.08452232480049134, + "step": 213090 + }, + { + "epoch": 0.9148828383263354, + "grad_norm": 0.003437488107010722, + "learning_rate": 8.548847477212559e-06, + "loss": 0.2705679416656494, + "step": 213100 + }, + { + "epoch": 0.9149257704163554, + "grad_norm": 0.009765159338712692, + "learning_rate": 8.544535757094936e-06, + "loss": 0.28933393955230713, + "step": 213110 + }, + { + "epoch": 0.9149687025063754, + "grad_norm": 0.007637848611921072, + "learning_rate": 8.540224036977312e-06, + "loss": 0.19045697450637816, + "step": 213120 + }, + { + "epoch": 0.9150116345963955, + "grad_norm": 0.000521529174875468, + "learning_rate": 8.53591231685969e-06, + "loss": 0.1336083769798279, + "step": 213130 + }, + { + "epoch": 0.9150545666864154, + "grad_norm": 0.30744844675064087, + "learning_rate": 8.531600596742065e-06, + "loss": 0.14940725564956664, + "step": 213140 + }, + { + "epoch": 0.9150974987764354, + "grad_norm": 0.007774029858410358, + "learning_rate": 8.52728887662444e-06, + "loss": 0.01716921925544739, + "step": 213150 + }, + { + "epoch": 0.9151404308664555, + "grad_norm": 0.33261045813560486, + "learning_rate": 8.522977156506818e-06, + "loss": 0.10560462474822999, + "step": 213160 + }, + { + "epoch": 0.9151833629564754, + "grad_norm": 2.7580161094665527, + "learning_rate": 8.518665436389194e-06, + "loss": 0.22219655513763428, + "step": 213170 + }, + { + "epoch": 0.9152262950464954, + "grad_norm": 0.004856944549828768, + "learning_rate": 8.51435371627157e-06, + "loss": 0.22760391235351562, + "step": 213180 + }, + { + "epoch": 0.9152692271365155, + "grad_norm": 0.029608484357595444, + "learning_rate": 8.510041996153947e-06, + "loss": 0.11115431785583496, + "step": 213190 + }, + { + "epoch": 0.9153121592265354, + "grad_norm": 0.2841481566429138, + "learning_rate": 8.505730276036322e-06, + "loss": 0.07824562788009644, + "step": 213200 + }, + { + "epoch": 0.9153550913165555, + "grad_norm": 1.3372067213058472, + "learning_rate": 8.501418555918698e-06, + "loss": 0.2558911323547363, + "step": 213210 + }, + { + "epoch": 0.9153980234065755, + "grad_norm": 0.07914111018180847, + "learning_rate": 8.497106835801075e-06, + "loss": 0.09136658310890197, + "step": 213220 + }, + { + "epoch": 0.9154409554965954, + "grad_norm": 1.8780505657196045, + "learning_rate": 8.492795115683451e-06, + "loss": 0.22362115383148193, + "step": 213230 + }, + { + "epoch": 0.9154838875866155, + "grad_norm": 2.8011960983276367, + "learning_rate": 8.488483395565827e-06, + "loss": 0.1213125467300415, + "step": 213240 + }, + { + "epoch": 0.9155268196766355, + "grad_norm": 0.8596304655075073, + "learning_rate": 8.484171675448204e-06, + "loss": 0.2691220998764038, + "step": 213250 + }, + { + "epoch": 0.9155697517666556, + "grad_norm": 0.019585467875003815, + "learning_rate": 8.47985995533058e-06, + "loss": 0.36429500579833984, + "step": 213260 + }, + { + "epoch": 0.9156126838566755, + "grad_norm": 0.10184433311223984, + "learning_rate": 8.475548235212955e-06, + "loss": 0.1306537866592407, + "step": 213270 + }, + { + "epoch": 0.9156556159466955, + "grad_norm": 1.1802695989608765, + "learning_rate": 8.471236515095333e-06, + "loss": 0.26048238277435304, + "step": 213280 + }, + { + "epoch": 0.9156985480367156, + "grad_norm": 0.00236449739895761, + "learning_rate": 8.466924794977708e-06, + "loss": 0.23241877555847168, + "step": 213290 + }, + { + "epoch": 0.9157414801267355, + "grad_norm": 5.898766040802002, + "learning_rate": 8.462613074860084e-06, + "loss": 0.2835911989212036, + "step": 213300 + }, + { + "epoch": 0.9157844122167556, + "grad_norm": 0.004197239875793457, + "learning_rate": 8.458301354742461e-06, + "loss": 0.25936548709869384, + "step": 213310 + }, + { + "epoch": 0.9158273443067756, + "grad_norm": 2.1925132274627686, + "learning_rate": 8.453989634624837e-06, + "loss": 0.3117018699645996, + "step": 213320 + }, + { + "epoch": 0.9158702763967955, + "grad_norm": 0.0032696540001779795, + "learning_rate": 8.449677914507213e-06, + "loss": 0.18839519023895263, + "step": 213330 + }, + { + "epoch": 0.9159132084868156, + "grad_norm": 0.3618191182613373, + "learning_rate": 8.44536619438959e-06, + "loss": 0.4135741233825684, + "step": 213340 + }, + { + "epoch": 0.9159561405768356, + "grad_norm": 0.6629675626754761, + "learning_rate": 8.441054474271966e-06, + "loss": 0.19146194458007812, + "step": 213350 + }, + { + "epoch": 0.9159990726668555, + "grad_norm": 0.027104495093226433, + "learning_rate": 8.436742754154343e-06, + "loss": 0.1926203966140747, + "step": 213360 + }, + { + "epoch": 0.9160420047568756, + "grad_norm": 1.6082515716552734, + "learning_rate": 8.43243103403672e-06, + "loss": 0.29182188510894774, + "step": 213370 + }, + { + "epoch": 0.9160849368468956, + "grad_norm": 2.3720266819000244, + "learning_rate": 8.428119313919096e-06, + "loss": 0.16708863973617555, + "step": 213380 + }, + { + "epoch": 0.9161278689369156, + "grad_norm": 0.0009071618551388383, + "learning_rate": 8.423807593801472e-06, + "loss": 0.04892894625663757, + "step": 213390 + }, + { + "epoch": 0.9161708010269356, + "grad_norm": 0.02186976745724678, + "learning_rate": 8.419495873683849e-06, + "loss": 0.07931110262870789, + "step": 213400 + }, + { + "epoch": 0.9162137331169556, + "grad_norm": 0.006246891804039478, + "learning_rate": 8.415184153566225e-06, + "loss": 0.1462794780731201, + "step": 213410 + }, + { + "epoch": 0.9162566652069756, + "grad_norm": 1.6635063886642456, + "learning_rate": 8.4108724334486e-06, + "loss": 0.11009447574615479, + "step": 213420 + }, + { + "epoch": 0.9162995972969956, + "grad_norm": 2.642993927001953, + "learning_rate": 8.406560713330977e-06, + "loss": 0.1289622187614441, + "step": 213430 + }, + { + "epoch": 0.9163425293870157, + "grad_norm": 0.02965669333934784, + "learning_rate": 8.402248993213353e-06, + "loss": 0.15729275941848755, + "step": 213440 + }, + { + "epoch": 0.9163854614770356, + "grad_norm": 0.612890362739563, + "learning_rate": 8.397937273095729e-06, + "loss": 0.3066643476486206, + "step": 213450 + }, + { + "epoch": 0.9164283935670556, + "grad_norm": 0.0016727737383916974, + "learning_rate": 8.393625552978106e-06, + "loss": 0.08483893871307373, + "step": 213460 + }, + { + "epoch": 0.9164713256570757, + "grad_norm": 0.00839283224195242, + "learning_rate": 8.389313832860482e-06, + "loss": 0.254620099067688, + "step": 213470 + }, + { + "epoch": 0.9165142577470956, + "grad_norm": 1.7350776195526123, + "learning_rate": 8.385002112742857e-06, + "loss": 0.15120675563812255, + "step": 213480 + }, + { + "epoch": 0.9165571898371156, + "grad_norm": 0.0003176078025717288, + "learning_rate": 8.380690392625235e-06, + "loss": 0.13280483484268188, + "step": 213490 + }, + { + "epoch": 0.9166001219271357, + "grad_norm": 0.0025800876319408417, + "learning_rate": 8.37637867250761e-06, + "loss": 0.17722119092941285, + "step": 213500 + }, + { + "epoch": 0.9166430540171556, + "grad_norm": 5.274077892303467, + "learning_rate": 8.372066952389988e-06, + "loss": 0.2644465208053589, + "step": 213510 + }, + { + "epoch": 0.9166859861071757, + "grad_norm": 0.2625701129436493, + "learning_rate": 8.367755232272363e-06, + "loss": 0.14736897945404054, + "step": 213520 + }, + { + "epoch": 0.9167289181971957, + "grad_norm": 0.0007383037591353059, + "learning_rate": 8.363443512154739e-06, + "loss": 0.1682689070701599, + "step": 213530 + }, + { + "epoch": 0.9167718502872156, + "grad_norm": 0.07749518007040024, + "learning_rate": 8.359131792037116e-06, + "loss": 0.07301062941551209, + "step": 213540 + }, + { + "epoch": 0.9168147823772357, + "grad_norm": 0.08707591146230698, + "learning_rate": 8.354820071919492e-06, + "loss": 0.35566532611846924, + "step": 213550 + }, + { + "epoch": 0.9168577144672557, + "grad_norm": 0.023812185972929, + "learning_rate": 8.350508351801868e-06, + "loss": 0.09307337403297425, + "step": 213560 + }, + { + "epoch": 0.9169006465572757, + "grad_norm": 2.4213881492614746, + "learning_rate": 8.346196631684245e-06, + "loss": 0.27511775493621826, + "step": 213570 + }, + { + "epoch": 0.9169435786472957, + "grad_norm": 0.014306395314633846, + "learning_rate": 8.34188491156662e-06, + "loss": 0.28723247051239015, + "step": 213580 + }, + { + "epoch": 0.9169865107373157, + "grad_norm": 0.6624493598937988, + "learning_rate": 8.337573191448996e-06, + "loss": 0.19444373846054078, + "step": 213590 + }, + { + "epoch": 0.9170294428273357, + "grad_norm": 2.9697694778442383, + "learning_rate": 8.333261471331374e-06, + "loss": 0.43126649856567384, + "step": 213600 + }, + { + "epoch": 0.9170723749173557, + "grad_norm": 1.9498356580734253, + "learning_rate": 8.32894975121375e-06, + "loss": 0.12668828964233397, + "step": 213610 + }, + { + "epoch": 0.9171153070073758, + "grad_norm": 0.39234861731529236, + "learning_rate": 8.324638031096125e-06, + "loss": 0.05853158235549927, + "step": 213620 + }, + { + "epoch": 0.9171582390973957, + "grad_norm": 0.03262830153107643, + "learning_rate": 8.320326310978502e-06, + "loss": 0.13207764625549318, + "step": 213630 + }, + { + "epoch": 0.9172011711874157, + "grad_norm": 0.10223422944545746, + "learning_rate": 8.316014590860878e-06, + "loss": 0.20672743320465087, + "step": 213640 + }, + { + "epoch": 0.9172441032774358, + "grad_norm": 6.9421563148498535, + "learning_rate": 8.311702870743254e-06, + "loss": 0.41819748878479, + "step": 213650 + }, + { + "epoch": 0.9172870353674557, + "grad_norm": 0.09261999279260635, + "learning_rate": 8.307391150625631e-06, + "loss": 0.12242704629898071, + "step": 213660 + }, + { + "epoch": 0.9173299674574757, + "grad_norm": 0.21561941504478455, + "learning_rate": 8.303079430508007e-06, + "loss": 0.26964023113250735, + "step": 213670 + }, + { + "epoch": 0.9173728995474958, + "grad_norm": 0.001588730257935822, + "learning_rate": 8.298767710390382e-06, + "loss": 0.03283386826515198, + "step": 213680 + }, + { + "epoch": 0.9174158316375158, + "grad_norm": 2.3190999031066895, + "learning_rate": 8.29445599027276e-06, + "loss": 0.10395605564117431, + "step": 213690 + }, + { + "epoch": 0.9174587637275358, + "grad_norm": 0.015156721696257591, + "learning_rate": 8.290144270155135e-06, + "loss": 0.28372209072113036, + "step": 213700 + }, + { + "epoch": 0.9175016958175558, + "grad_norm": 44.24710464477539, + "learning_rate": 8.285832550037511e-06, + "loss": 0.07365514039993286, + "step": 213710 + }, + { + "epoch": 0.9175446279075758, + "grad_norm": 55.237640380859375, + "learning_rate": 8.28152082991989e-06, + "loss": 0.2189706325531006, + "step": 213720 + }, + { + "epoch": 0.9175875599975958, + "grad_norm": 6.306714057922363, + "learning_rate": 8.277209109802266e-06, + "loss": 0.30330231189727785, + "step": 213730 + }, + { + "epoch": 0.9176304920876158, + "grad_norm": 0.6346426010131836, + "learning_rate": 8.272897389684641e-06, + "loss": 0.2297675848007202, + "step": 213740 + }, + { + "epoch": 0.9176734241776359, + "grad_norm": 0.011375557631254196, + "learning_rate": 8.268585669567019e-06, + "loss": 0.13121119737625123, + "step": 213750 + }, + { + "epoch": 0.9177163562676558, + "grad_norm": 0.029359858483076096, + "learning_rate": 8.264273949449394e-06, + "loss": 0.410442590713501, + "step": 213760 + }, + { + "epoch": 0.9177592883576758, + "grad_norm": 0.004748243372887373, + "learning_rate": 8.25996222933177e-06, + "loss": 0.24686429500579835, + "step": 213770 + }, + { + "epoch": 0.9178022204476959, + "grad_norm": 2.400439977645874, + "learning_rate": 8.255650509214147e-06, + "loss": 0.04432217478752136, + "step": 213780 + }, + { + "epoch": 0.9178451525377158, + "grad_norm": 1.4817440509796143, + "learning_rate": 8.251338789096523e-06, + "loss": 0.24401164054870605, + "step": 213790 + }, + { + "epoch": 0.9178880846277359, + "grad_norm": 2.9294021129608154, + "learning_rate": 8.247027068978899e-06, + "loss": 0.1489182472229004, + "step": 213800 + }, + { + "epoch": 0.9179310167177559, + "grad_norm": 0.0016216447111219168, + "learning_rate": 8.242715348861276e-06, + "loss": 0.08440933227539063, + "step": 213810 + }, + { + "epoch": 0.9179739488077758, + "grad_norm": 1.4032232761383057, + "learning_rate": 8.238403628743652e-06, + "loss": 0.4191108226776123, + "step": 213820 + }, + { + "epoch": 0.9180168808977959, + "grad_norm": 0.0279683880507946, + "learning_rate": 8.234091908626027e-06, + "loss": 0.07173965573310852, + "step": 213830 + }, + { + "epoch": 0.9180598129878159, + "grad_norm": 4.024061679840088, + "learning_rate": 8.229780188508405e-06, + "loss": 0.2750051975250244, + "step": 213840 + }, + { + "epoch": 0.9181027450778358, + "grad_norm": 1.4007704257965088, + "learning_rate": 8.22546846839078e-06, + "loss": 0.2678218364715576, + "step": 213850 + }, + { + "epoch": 0.9181456771678559, + "grad_norm": 0.30023008584976196, + "learning_rate": 8.221156748273156e-06, + "loss": 0.06759833097457886, + "step": 213860 + }, + { + "epoch": 0.9181886092578759, + "grad_norm": 0.004864270333200693, + "learning_rate": 8.216845028155533e-06, + "loss": 0.11008800268173217, + "step": 213870 + }, + { + "epoch": 0.9182315413478959, + "grad_norm": 0.0029974612407386303, + "learning_rate": 8.212533308037909e-06, + "loss": 0.15078630447387695, + "step": 213880 + }, + { + "epoch": 0.9182744734379159, + "grad_norm": 0.035421110689640045, + "learning_rate": 8.208221587920286e-06, + "loss": 0.16177096366882324, + "step": 213890 + }, + { + "epoch": 0.918317405527936, + "grad_norm": 0.7298688292503357, + "learning_rate": 8.203909867802662e-06, + "loss": 0.256973123550415, + "step": 213900 + }, + { + "epoch": 0.9183603376179559, + "grad_norm": 0.05107526481151581, + "learning_rate": 8.199598147685037e-06, + "loss": 0.2563158988952637, + "step": 213910 + }, + { + "epoch": 0.9184032697079759, + "grad_norm": 2.8571629524230957, + "learning_rate": 8.195286427567415e-06, + "loss": 0.1808464288711548, + "step": 213920 + }, + { + "epoch": 0.918446201797996, + "grad_norm": 0.28787219524383545, + "learning_rate": 8.19097470744979e-06, + "loss": 0.20286407470703124, + "step": 213930 + }, + { + "epoch": 0.9184891338880159, + "grad_norm": 0.031048133969306946, + "learning_rate": 8.186662987332166e-06, + "loss": 0.14505958557128906, + "step": 213940 + }, + { + "epoch": 0.9185320659780359, + "grad_norm": 0.037190306931734085, + "learning_rate": 8.182351267214543e-06, + "loss": 0.3253729581832886, + "step": 213950 + }, + { + "epoch": 0.918574998068056, + "grad_norm": 0.3285282254219055, + "learning_rate": 8.178039547096919e-06, + "loss": 0.325065016746521, + "step": 213960 + }, + { + "epoch": 0.9186179301580759, + "grad_norm": 2.641339063644409, + "learning_rate": 8.173727826979295e-06, + "loss": 0.23930034637451172, + "step": 213970 + }, + { + "epoch": 0.918660862248096, + "grad_norm": 0.017421351745724678, + "learning_rate": 8.169416106861672e-06, + "loss": 0.1706032156944275, + "step": 213980 + }, + { + "epoch": 0.918703794338116, + "grad_norm": 0.0013588599395006895, + "learning_rate": 8.165104386744048e-06, + "loss": 0.301548171043396, + "step": 213990 + }, + { + "epoch": 0.9187467264281359, + "grad_norm": 0.003222766565158963, + "learning_rate": 8.160792666626423e-06, + "loss": 0.13970067501068115, + "step": 214000 + }, + { + "epoch": 0.9187467264281359, + "eval_loss": 0.3728339672088623, + "eval_runtime": 27.3708, + "eval_samples_per_second": 3.654, + "eval_steps_per_second": 3.654, + "step": 214000 + }, + { + "epoch": 0.918789658518156, + "grad_norm": 0.0069166203029453754, + "learning_rate": 8.1564809465088e-06, + "loss": 0.16746548414230347, + "step": 214010 + }, + { + "epoch": 0.918832590608176, + "grad_norm": 0.002220205496996641, + "learning_rate": 8.152169226391176e-06, + "loss": 0.19638227224349974, + "step": 214020 + }, + { + "epoch": 0.918875522698196, + "grad_norm": 0.009522872976958752, + "learning_rate": 8.147857506273552e-06, + "loss": 0.10982517004013062, + "step": 214030 + }, + { + "epoch": 0.918918454788216, + "grad_norm": 0.005404068157076836, + "learning_rate": 8.14354578615593e-06, + "loss": 0.11145150661468506, + "step": 214040 + }, + { + "epoch": 0.918961386878236, + "grad_norm": 1.6319386959075928, + "learning_rate": 8.139234066038305e-06, + "loss": 0.2857020854949951, + "step": 214050 + }, + { + "epoch": 0.919004318968256, + "grad_norm": 0.04213089123368263, + "learning_rate": 8.13492234592068e-06, + "loss": 0.0028133489191532134, + "step": 214060 + }, + { + "epoch": 0.919047251058276, + "grad_norm": 0.0002146908809663728, + "learning_rate": 8.130610625803058e-06, + "loss": 0.13747214078903197, + "step": 214070 + }, + { + "epoch": 0.919090183148296, + "grad_norm": 8.343812942504883, + "learning_rate": 8.126298905685435e-06, + "loss": 0.2809786081314087, + "step": 214080 + }, + { + "epoch": 0.919133115238316, + "grad_norm": 0.0025600444059818983, + "learning_rate": 8.121987185567811e-06, + "loss": 0.05838137865066528, + "step": 214090 + }, + { + "epoch": 0.919176047328336, + "grad_norm": 1.5793315172195435, + "learning_rate": 8.117675465450188e-06, + "loss": 0.02491525709629059, + "step": 214100 + }, + { + "epoch": 0.9192189794183561, + "grad_norm": 2.205817699432373, + "learning_rate": 8.113363745332564e-06, + "loss": 0.18225139379501343, + "step": 214110 + }, + { + "epoch": 0.9192619115083761, + "grad_norm": 0.12144787609577179, + "learning_rate": 8.10905202521494e-06, + "loss": 0.41785888671875, + "step": 214120 + }, + { + "epoch": 0.919304843598396, + "grad_norm": 0.005557660944759846, + "learning_rate": 8.104740305097317e-06, + "loss": 0.12414391040802002, + "step": 214130 + }, + { + "epoch": 0.9193477756884161, + "grad_norm": 0.032746944576501846, + "learning_rate": 8.100428584979693e-06, + "loss": 0.21280176639556886, + "step": 214140 + }, + { + "epoch": 0.9193907077784361, + "grad_norm": 2.1349384784698486, + "learning_rate": 8.096116864862068e-06, + "loss": 0.2097855806350708, + "step": 214150 + }, + { + "epoch": 0.919433639868456, + "grad_norm": 28.290878295898438, + "learning_rate": 8.091805144744446e-06, + "loss": 0.26921648979187013, + "step": 214160 + }, + { + "epoch": 0.9194765719584761, + "grad_norm": 1.0186740159988403, + "learning_rate": 8.087493424626821e-06, + "loss": 0.056221646070480344, + "step": 214170 + }, + { + "epoch": 0.9195195040484961, + "grad_norm": 0.0006930052768439054, + "learning_rate": 8.083181704509197e-06, + "loss": 0.1876317262649536, + "step": 214180 + }, + { + "epoch": 0.9195624361385161, + "grad_norm": 0.01069891732186079, + "learning_rate": 8.078869984391574e-06, + "loss": 0.1312105655670166, + "step": 214190 + }, + { + "epoch": 0.9196053682285361, + "grad_norm": 3.0067901611328125, + "learning_rate": 8.07455826427395e-06, + "loss": 0.2393186092376709, + "step": 214200 + }, + { + "epoch": 0.9196483003185562, + "grad_norm": 1.284667730331421, + "learning_rate": 8.070246544156326e-06, + "loss": 0.33020362854003904, + "step": 214210 + }, + { + "epoch": 0.9196912324085761, + "grad_norm": 1.096384882926941, + "learning_rate": 8.065934824038703e-06, + "loss": 0.2953479290008545, + "step": 214220 + }, + { + "epoch": 0.9197341644985961, + "grad_norm": 0.004542557522654533, + "learning_rate": 8.061623103921079e-06, + "loss": 0.24383957386016847, + "step": 214230 + }, + { + "epoch": 0.9197770965886162, + "grad_norm": 2.143012046813965, + "learning_rate": 8.057311383803454e-06, + "loss": 0.19743248224258422, + "step": 214240 + }, + { + "epoch": 0.9198200286786361, + "grad_norm": 0.0006490990635938942, + "learning_rate": 8.052999663685832e-06, + "loss": 0.04032123982906342, + "step": 214250 + }, + { + "epoch": 0.9198629607686561, + "grad_norm": 0.03700326010584831, + "learning_rate": 8.048687943568207e-06, + "loss": 0.06131017804145813, + "step": 214260 + }, + { + "epoch": 0.9199058928586762, + "grad_norm": 1.601779580116272, + "learning_rate": 8.044376223450585e-06, + "loss": 0.314433217048645, + "step": 214270 + }, + { + "epoch": 0.9199488249486961, + "grad_norm": 0.9620373845100403, + "learning_rate": 8.04006450333296e-06, + "loss": 0.17828409671783446, + "step": 214280 + }, + { + "epoch": 0.9199917570387162, + "grad_norm": 1.0853599309921265, + "learning_rate": 8.035752783215336e-06, + "loss": 0.24211184978485106, + "step": 214290 + }, + { + "epoch": 0.9200346891287362, + "grad_norm": 0.02577628381550312, + "learning_rate": 8.031441063097713e-06, + "loss": 0.13259075880050658, + "step": 214300 + }, + { + "epoch": 0.9200776212187561, + "grad_norm": 5.1638264656066895, + "learning_rate": 8.027129342980089e-06, + "loss": 0.12507407665252684, + "step": 214310 + }, + { + "epoch": 0.9201205533087762, + "grad_norm": 0.04012976959347725, + "learning_rate": 8.022817622862464e-06, + "loss": 0.2570314645767212, + "step": 214320 + }, + { + "epoch": 0.9201634853987962, + "grad_norm": 0.6823280453681946, + "learning_rate": 8.018505902744842e-06, + "loss": 0.16146708726882936, + "step": 214330 + }, + { + "epoch": 0.9202064174888162, + "grad_norm": 0.0014941110275685787, + "learning_rate": 8.014194182627217e-06, + "loss": 0.05990640521049499, + "step": 214340 + }, + { + "epoch": 0.9202493495788362, + "grad_norm": 1.3181439638137817, + "learning_rate": 8.009882462509593e-06, + "loss": 0.17725783586502075, + "step": 214350 + }, + { + "epoch": 0.9202922816688562, + "grad_norm": 22.59912872314453, + "learning_rate": 8.00557074239197e-06, + "loss": 0.11575099229812622, + "step": 214360 + }, + { + "epoch": 0.9203352137588762, + "grad_norm": 0.00027134406263940036, + "learning_rate": 8.001259022274346e-06, + "loss": 0.18448431491851808, + "step": 214370 + }, + { + "epoch": 0.9203781458488962, + "grad_norm": 1.813732385635376, + "learning_rate": 7.996947302156722e-06, + "loss": 0.3346815347671509, + "step": 214380 + }, + { + "epoch": 0.9204210779389163, + "grad_norm": 0.0324367992579937, + "learning_rate": 7.992635582039099e-06, + "loss": 0.4010632991790771, + "step": 214390 + }, + { + "epoch": 0.9204640100289362, + "grad_norm": 0.10355591773986816, + "learning_rate": 7.988323861921475e-06, + "loss": 0.3059864521026611, + "step": 214400 + }, + { + "epoch": 0.9205069421189562, + "grad_norm": 0.03525923192501068, + "learning_rate": 7.98401214180385e-06, + "loss": 0.25398414134979247, + "step": 214410 + }, + { + "epoch": 0.9205498742089763, + "grad_norm": 2.2439262866973877, + "learning_rate": 7.979700421686228e-06, + "loss": 0.1350804328918457, + "step": 214420 + }, + { + "epoch": 0.9205928062989962, + "grad_norm": 0.00023116929514799267, + "learning_rate": 7.975388701568603e-06, + "loss": 0.06914361715316772, + "step": 214430 + }, + { + "epoch": 0.9206357383890162, + "grad_norm": 0.04269478842616081, + "learning_rate": 7.971076981450979e-06, + "loss": 0.17834771871566774, + "step": 214440 + }, + { + "epoch": 0.9206786704790363, + "grad_norm": 0.006393686402589083, + "learning_rate": 7.966765261333358e-06, + "loss": 0.4067357063293457, + "step": 214450 + }, + { + "epoch": 0.9207216025690562, + "grad_norm": 0.16539409756660461, + "learning_rate": 7.962453541215734e-06, + "loss": 0.19677144289016724, + "step": 214460 + }, + { + "epoch": 0.9207645346590763, + "grad_norm": 0.006637761369347572, + "learning_rate": 7.95814182109811e-06, + "loss": 0.07061156630516052, + "step": 214470 + }, + { + "epoch": 0.9208074667490963, + "grad_norm": 1.6709380149841309, + "learning_rate": 7.953830100980487e-06, + "loss": 0.03651362359523773, + "step": 214480 + }, + { + "epoch": 0.9208503988391162, + "grad_norm": 0.00023092412448022515, + "learning_rate": 7.949518380862862e-06, + "loss": 0.1736156463623047, + "step": 214490 + }, + { + "epoch": 0.9208933309291363, + "grad_norm": 0.40267854928970337, + "learning_rate": 7.945206660745238e-06, + "loss": 0.19352984428405762, + "step": 214500 + }, + { + "epoch": 0.9209362630191563, + "grad_norm": 0.08892907202243805, + "learning_rate": 7.940894940627615e-06, + "loss": 0.0169476717710495, + "step": 214510 + }, + { + "epoch": 0.9209791951091763, + "grad_norm": 0.001397549407556653, + "learning_rate": 7.936583220509991e-06, + "loss": 0.08612239956855774, + "step": 214520 + }, + { + "epoch": 0.9210221271991963, + "grad_norm": 0.6343229413032532, + "learning_rate": 7.932271500392367e-06, + "loss": 0.2680309534072876, + "step": 214530 + }, + { + "epoch": 0.9210650592892163, + "grad_norm": 0.7043375372886658, + "learning_rate": 7.927959780274744e-06, + "loss": 0.29093358516693113, + "step": 214540 + }, + { + "epoch": 0.9211079913792364, + "grad_norm": 0.3087158501148224, + "learning_rate": 7.92364806015712e-06, + "loss": 0.06735055446624756, + "step": 214550 + }, + { + "epoch": 0.9211509234692563, + "grad_norm": 0.0004326922935433686, + "learning_rate": 7.919336340039495e-06, + "loss": 0.17999672889709473, + "step": 214560 + }, + { + "epoch": 0.9211938555592764, + "grad_norm": 0.15494950115680695, + "learning_rate": 7.915024619921873e-06, + "loss": 0.21485633850097657, + "step": 214570 + }, + { + "epoch": 0.9212367876492964, + "grad_norm": 2.401089668273926, + "learning_rate": 7.910712899804248e-06, + "loss": 0.4662174701690674, + "step": 214580 + }, + { + "epoch": 0.9212797197393163, + "grad_norm": 2.7063958644866943, + "learning_rate": 7.906401179686624e-06, + "loss": 0.2256403923034668, + "step": 214590 + }, + { + "epoch": 0.9213226518293364, + "grad_norm": 2.5228281021118164, + "learning_rate": 7.902089459569001e-06, + "loss": 0.17687809467315674, + "step": 214600 + }, + { + "epoch": 0.9213655839193564, + "grad_norm": 0.2116834670305252, + "learning_rate": 7.897777739451377e-06, + "loss": 0.17992725372314453, + "step": 214610 + }, + { + "epoch": 0.9214085160093763, + "grad_norm": 0.0003231070004403591, + "learning_rate": 7.893466019333753e-06, + "loss": 0.16799609661102294, + "step": 214620 + }, + { + "epoch": 0.9214514480993964, + "grad_norm": 0.004379116464406252, + "learning_rate": 7.88915429921613e-06, + "loss": 0.030523082613945006, + "step": 214630 + }, + { + "epoch": 0.9214943801894164, + "grad_norm": 0.00718501815572381, + "learning_rate": 7.884842579098506e-06, + "loss": 0.10228891372680664, + "step": 214640 + }, + { + "epoch": 0.9215373122794364, + "grad_norm": 0.015222213231027126, + "learning_rate": 7.880530858980883e-06, + "loss": 0.21992430686950684, + "step": 214650 + }, + { + "epoch": 0.9215802443694564, + "grad_norm": 0.0776410922408104, + "learning_rate": 7.876219138863259e-06, + "loss": 0.3099169969558716, + "step": 214660 + }, + { + "epoch": 0.9216231764594764, + "grad_norm": 1.4122614860534668, + "learning_rate": 7.871907418745634e-06, + "loss": 0.17398278713226317, + "step": 214670 + }, + { + "epoch": 0.9216661085494964, + "grad_norm": 7.875330924987793, + "learning_rate": 7.867595698628012e-06, + "loss": 0.12187422513961792, + "step": 214680 + }, + { + "epoch": 0.9217090406395164, + "grad_norm": 0.010840149596333504, + "learning_rate": 7.863283978510387e-06, + "loss": 0.1018634557723999, + "step": 214690 + }, + { + "epoch": 0.9217519727295365, + "grad_norm": 0.007989337667822838, + "learning_rate": 7.858972258392763e-06, + "loss": 0.0946840524673462, + "step": 214700 + }, + { + "epoch": 0.9217949048195564, + "grad_norm": 0.00837134663015604, + "learning_rate": 7.85466053827514e-06, + "loss": 0.1308300495147705, + "step": 214710 + }, + { + "epoch": 0.9218378369095764, + "grad_norm": 0.5216271877288818, + "learning_rate": 7.850348818157516e-06, + "loss": 0.08474286198616028, + "step": 214720 + }, + { + "epoch": 0.9218807689995965, + "grad_norm": 2.1394126415252686, + "learning_rate": 7.846037098039891e-06, + "loss": 0.32121779918670657, + "step": 214730 + }, + { + "epoch": 0.9219237010896164, + "grad_norm": 3.283982276916504, + "learning_rate": 7.841725377922269e-06, + "loss": 0.16682465076446534, + "step": 214740 + }, + { + "epoch": 0.9219666331796365, + "grad_norm": 0.014272511005401611, + "learning_rate": 7.837413657804644e-06, + "loss": 0.19908100366592407, + "step": 214750 + }, + { + "epoch": 0.9220095652696565, + "grad_norm": 0.00569815281778574, + "learning_rate": 7.83310193768702e-06, + "loss": 0.10329477787017823, + "step": 214760 + }, + { + "epoch": 0.9220524973596764, + "grad_norm": 1.111350178718567, + "learning_rate": 7.828790217569397e-06, + "loss": 0.24649744033813475, + "step": 214770 + }, + { + "epoch": 0.9220954294496965, + "grad_norm": 0.0014714060816913843, + "learning_rate": 7.824478497451773e-06, + "loss": 0.17052639722824098, + "step": 214780 + }, + { + "epoch": 0.9221383615397165, + "grad_norm": 2.4729766845703125, + "learning_rate": 7.820166777334149e-06, + "loss": 0.21881551742553712, + "step": 214790 + }, + { + "epoch": 0.9221812936297364, + "grad_norm": 4.458625316619873, + "learning_rate": 7.815855057216526e-06, + "loss": 0.19693403244018554, + "step": 214800 + }, + { + "epoch": 0.9222242257197565, + "grad_norm": 0.16447418928146362, + "learning_rate": 7.811543337098903e-06, + "loss": 0.3553196907043457, + "step": 214810 + }, + { + "epoch": 0.9222671578097765, + "grad_norm": 0.015636222437024117, + "learning_rate": 7.807231616981279e-06, + "loss": 0.003960480913519859, + "step": 214820 + }, + { + "epoch": 0.9223100898997965, + "grad_norm": 0.11809345334768295, + "learning_rate": 7.802919896863656e-06, + "loss": 0.2305279016494751, + "step": 214830 + }, + { + "epoch": 0.9223530219898165, + "grad_norm": 1.4532008171081543, + "learning_rate": 7.798608176746032e-06, + "loss": 0.26950161457061766, + "step": 214840 + }, + { + "epoch": 0.9223959540798365, + "grad_norm": 0.292232871055603, + "learning_rate": 7.794296456628408e-06, + "loss": 0.019086624681949615, + "step": 214850 + }, + { + "epoch": 0.9224388861698565, + "grad_norm": 0.0011943303979933262, + "learning_rate": 7.789984736510785e-06, + "loss": 0.21599533557891845, + "step": 214860 + }, + { + "epoch": 0.9224818182598765, + "grad_norm": 2.3715226650238037, + "learning_rate": 7.78567301639316e-06, + "loss": 0.17667001485824585, + "step": 214870 + }, + { + "epoch": 0.9225247503498966, + "grad_norm": 6.8032708168029785, + "learning_rate": 7.781361296275536e-06, + "loss": 0.16944377422332763, + "step": 214880 + }, + { + "epoch": 0.9225676824399165, + "grad_norm": 0.09619058668613434, + "learning_rate": 7.777049576157914e-06, + "loss": 0.060175222158432004, + "step": 214890 + }, + { + "epoch": 0.9226106145299365, + "grad_norm": 3.406337022781372, + "learning_rate": 7.77273785604029e-06, + "loss": 0.11395597457885742, + "step": 214900 + }, + { + "epoch": 0.9226535466199566, + "grad_norm": 0.000977289048023522, + "learning_rate": 7.768426135922665e-06, + "loss": 0.16869571208953857, + "step": 214910 + }, + { + "epoch": 0.9226964787099765, + "grad_norm": 0.4416428804397583, + "learning_rate": 7.764114415805042e-06, + "loss": 0.2359858751296997, + "step": 214920 + }, + { + "epoch": 0.9227394107999966, + "grad_norm": 2.120130777359009, + "learning_rate": 7.759802695687418e-06, + "loss": 0.3166823148727417, + "step": 214930 + }, + { + "epoch": 0.9227823428900166, + "grad_norm": 0.9974570870399475, + "learning_rate": 7.755490975569794e-06, + "loss": 0.2767601013183594, + "step": 214940 + }, + { + "epoch": 0.9228252749800365, + "grad_norm": 0.011383699253201485, + "learning_rate": 7.751179255452171e-06, + "loss": 0.14973424673080443, + "step": 214950 + }, + { + "epoch": 0.9228682070700566, + "grad_norm": 1.8270457983016968, + "learning_rate": 7.746867535334547e-06, + "loss": 0.16595523357391356, + "step": 214960 + }, + { + "epoch": 0.9229111391600766, + "grad_norm": 0.05330152064561844, + "learning_rate": 7.742555815216922e-06, + "loss": 0.36078336238861086, + "step": 214970 + }, + { + "epoch": 0.9229540712500967, + "grad_norm": 0.0003709446464199573, + "learning_rate": 7.7382440950993e-06, + "loss": 0.2640532493591309, + "step": 214980 + }, + { + "epoch": 0.9229970033401166, + "grad_norm": 0.1887034922838211, + "learning_rate": 7.733932374981675e-06, + "loss": 0.08925374746322631, + "step": 214990 + }, + { + "epoch": 0.9230399354301366, + "grad_norm": 0.0009779626270756125, + "learning_rate": 7.729620654864053e-06, + "loss": 0.19787448644638062, + "step": 215000 + }, + { + "epoch": 0.9230399354301366, + "eval_loss": 0.3707399368286133, + "eval_runtime": 27.4681, + "eval_samples_per_second": 3.641, + "eval_steps_per_second": 3.641, + "step": 215000 + }, + { + "epoch": 0.9230828675201567, + "grad_norm": 0.06447288393974304, + "learning_rate": 7.725308934746428e-06, + "loss": 0.16723594665527344, + "step": 215010 + }, + { + "epoch": 0.9231257996101766, + "grad_norm": 0.1130533218383789, + "learning_rate": 7.720997214628804e-06, + "loss": 0.13752557039260865, + "step": 215020 + }, + { + "epoch": 0.9231687317001966, + "grad_norm": 1.3311164379119873, + "learning_rate": 7.716685494511181e-06, + "loss": 0.24227960109710694, + "step": 215030 + }, + { + "epoch": 0.9232116637902167, + "grad_norm": 0.46791529655456543, + "learning_rate": 7.712373774393557e-06, + "loss": 0.011401008814573288, + "step": 215040 + }, + { + "epoch": 0.9232545958802366, + "grad_norm": 0.005901793949306011, + "learning_rate": 7.708062054275933e-06, + "loss": 0.2547999143600464, + "step": 215050 + }, + { + "epoch": 0.9232975279702567, + "grad_norm": 4.491530418395996, + "learning_rate": 7.70375033415831e-06, + "loss": 0.10067328214645385, + "step": 215060 + }, + { + "epoch": 0.9233404600602767, + "grad_norm": 5.371583938598633, + "learning_rate": 7.699438614040686e-06, + "loss": 0.1953263521194458, + "step": 215070 + }, + { + "epoch": 0.9233833921502966, + "grad_norm": 5.961281776428223, + "learning_rate": 7.695126893923061e-06, + "loss": 0.291353178024292, + "step": 215080 + }, + { + "epoch": 0.9234263242403167, + "grad_norm": 2.469714641571045, + "learning_rate": 7.690815173805439e-06, + "loss": 0.3257584571838379, + "step": 215090 + }, + { + "epoch": 0.9234692563303367, + "grad_norm": 0.29294028878211975, + "learning_rate": 7.686503453687814e-06, + "loss": 0.08499239087104797, + "step": 215100 + }, + { + "epoch": 0.9235121884203567, + "grad_norm": 0.0017936037620529532, + "learning_rate": 7.68219173357019e-06, + "loss": 0.08098435401916504, + "step": 215110 + }, + { + "epoch": 0.9235551205103767, + "grad_norm": 1.058728575706482, + "learning_rate": 7.677880013452567e-06, + "loss": 0.15216209888458251, + "step": 215120 + }, + { + "epoch": 0.9235980526003967, + "grad_norm": 0.08794575929641724, + "learning_rate": 7.673568293334943e-06, + "loss": 0.11532053947448731, + "step": 215130 + }, + { + "epoch": 0.9236409846904167, + "grad_norm": 0.0029542662668973207, + "learning_rate": 7.669256573217318e-06, + "loss": 0.403619384765625, + "step": 215140 + }, + { + "epoch": 0.9236839167804367, + "grad_norm": 1.159416913986206, + "learning_rate": 7.664944853099696e-06, + "loss": 0.3979302406311035, + "step": 215150 + }, + { + "epoch": 0.9237268488704568, + "grad_norm": 1.725462794303894, + "learning_rate": 7.660633132982071e-06, + "loss": 0.12816778421401978, + "step": 215160 + }, + { + "epoch": 0.9237697809604767, + "grad_norm": 4.982163429260254, + "learning_rate": 7.656321412864447e-06, + "loss": 0.2503523826599121, + "step": 215170 + }, + { + "epoch": 0.9238127130504967, + "grad_norm": 0.055415526032447815, + "learning_rate": 7.652009692746826e-06, + "loss": 0.18151148557662963, + "step": 215180 + }, + { + "epoch": 0.9238556451405168, + "grad_norm": 0.009110411629080772, + "learning_rate": 7.647697972629202e-06, + "loss": 0.22747561931610108, + "step": 215190 + }, + { + "epoch": 0.9238985772305367, + "grad_norm": 1.5314021110534668, + "learning_rate": 7.643386252511577e-06, + "loss": 0.19716296195983887, + "step": 215200 + }, + { + "epoch": 0.9239415093205567, + "grad_norm": 6.504369258880615, + "learning_rate": 7.639074532393955e-06, + "loss": 0.41256489753723147, + "step": 215210 + }, + { + "epoch": 0.9239844414105768, + "grad_norm": 0.0012986245565116405, + "learning_rate": 7.63476281227633e-06, + "loss": 0.05647019147872925, + "step": 215220 + }, + { + "epoch": 0.9240273735005967, + "grad_norm": 0.0007943433593027294, + "learning_rate": 7.630451092158706e-06, + "loss": 0.14794392585754396, + "step": 215230 + }, + { + "epoch": 0.9240703055906168, + "grad_norm": 0.0049696327187120914, + "learning_rate": 7.626139372041083e-06, + "loss": 0.32319116592407227, + "step": 215240 + }, + { + "epoch": 0.9241132376806368, + "grad_norm": 0.005273050162941217, + "learning_rate": 7.621827651923459e-06, + "loss": 0.24418747425079346, + "step": 215250 + }, + { + "epoch": 0.9241561697706567, + "grad_norm": 1.32347571849823, + "learning_rate": 7.617515931805836e-06, + "loss": 0.14303951263427733, + "step": 215260 + }, + { + "epoch": 0.9241991018606768, + "grad_norm": 0.21632139384746552, + "learning_rate": 7.613204211688211e-06, + "loss": 0.20365605354309083, + "step": 215270 + }, + { + "epoch": 0.9242420339506968, + "grad_norm": 0.5117451548576355, + "learning_rate": 7.608892491570588e-06, + "loss": 0.17990248203277587, + "step": 215280 + }, + { + "epoch": 0.9242849660407167, + "grad_norm": 0.011012107133865356, + "learning_rate": 7.604580771452964e-06, + "loss": 0.1717315912246704, + "step": 215290 + }, + { + "epoch": 0.9243278981307368, + "grad_norm": 0.00873053353279829, + "learning_rate": 7.60026905133534e-06, + "loss": 0.04388104975223541, + "step": 215300 + }, + { + "epoch": 0.9243708302207568, + "grad_norm": 0.017410963773727417, + "learning_rate": 7.595957331217716e-06, + "loss": 0.12251949310302734, + "step": 215310 + }, + { + "epoch": 0.9244137623107768, + "grad_norm": 0.015072675421833992, + "learning_rate": 7.591645611100093e-06, + "loss": 0.21252987384796143, + "step": 215320 + }, + { + "epoch": 0.9244566944007968, + "grad_norm": 0.0012947338400408626, + "learning_rate": 7.587333890982469e-06, + "loss": 0.2460631847381592, + "step": 215330 + }, + { + "epoch": 0.9244996264908169, + "grad_norm": 1.0608649253845215, + "learning_rate": 7.583022170864845e-06, + "loss": 0.22779808044433594, + "step": 215340 + }, + { + "epoch": 0.9245425585808368, + "grad_norm": 0.015276042744517326, + "learning_rate": 7.5787104507472215e-06, + "loss": 0.2625183343887329, + "step": 215350 + }, + { + "epoch": 0.9245854906708568, + "grad_norm": 1.360752820968628, + "learning_rate": 7.574398730629598e-06, + "loss": 0.4349686622619629, + "step": 215360 + }, + { + "epoch": 0.9246284227608769, + "grad_norm": 5.221940994262695, + "learning_rate": 7.570087010511974e-06, + "loss": 0.34817028045654297, + "step": 215370 + }, + { + "epoch": 0.9246713548508968, + "grad_norm": 7.083820819854736, + "learning_rate": 7.56577529039435e-06, + "loss": 0.17298378944396972, + "step": 215380 + }, + { + "epoch": 0.9247142869409168, + "grad_norm": 0.016590062528848648, + "learning_rate": 7.561463570276727e-06, + "loss": 0.0039948180317878725, + "step": 215390 + }, + { + "epoch": 0.9247572190309369, + "grad_norm": 0.0004913709126412868, + "learning_rate": 7.557151850159102e-06, + "loss": 0.2017059087753296, + "step": 215400 + }, + { + "epoch": 0.9248001511209569, + "grad_norm": 7.199446201324463, + "learning_rate": 7.552840130041479e-06, + "loss": 0.2697106599807739, + "step": 215410 + }, + { + "epoch": 0.9248430832109769, + "grad_norm": 0.28717929124832153, + "learning_rate": 7.548528409923855e-06, + "loss": 0.3328535556793213, + "step": 215420 + }, + { + "epoch": 0.9248860153009969, + "grad_norm": 0.053416259586811066, + "learning_rate": 7.544216689806232e-06, + "loss": 0.22627270221710205, + "step": 215430 + }, + { + "epoch": 0.9249289473910169, + "grad_norm": 0.006245059426873922, + "learning_rate": 7.5399049696886074e-06, + "loss": 0.10136998891830444, + "step": 215440 + }, + { + "epoch": 0.9249718794810369, + "grad_norm": 1.9855570793151855, + "learning_rate": 7.535593249570984e-06, + "loss": 0.1648913860321045, + "step": 215450 + }, + { + "epoch": 0.9250148115710569, + "grad_norm": 5.2693939208984375, + "learning_rate": 7.5312815294533604e-06, + "loss": 0.23091514110565187, + "step": 215460 + }, + { + "epoch": 0.925057743661077, + "grad_norm": 0.22706347703933716, + "learning_rate": 7.526969809335736e-06, + "loss": 0.27173595428466796, + "step": 215470 + }, + { + "epoch": 0.9251006757510969, + "grad_norm": 0.03872816637158394, + "learning_rate": 7.5226580892181126e-06, + "loss": 0.14449009895324708, + "step": 215480 + }, + { + "epoch": 0.9251436078411169, + "grad_norm": 0.07340923696756363, + "learning_rate": 7.518346369100489e-06, + "loss": 0.1235128402709961, + "step": 215490 + }, + { + "epoch": 0.925186539931137, + "grad_norm": 0.16422735154628754, + "learning_rate": 7.514034648982865e-06, + "loss": 0.04489589631557465, + "step": 215500 + }, + { + "epoch": 0.9252294720211569, + "grad_norm": 5.788188457489014, + "learning_rate": 7.509722928865241e-06, + "loss": 0.2109375, + "step": 215510 + }, + { + "epoch": 0.925272404111177, + "grad_norm": 0.025441322475671768, + "learning_rate": 7.505411208747618e-06, + "loss": 0.1279160737991333, + "step": 215520 + }, + { + "epoch": 0.925315336201197, + "grad_norm": 1.7489529848098755, + "learning_rate": 7.501099488629993e-06, + "loss": 0.3018485546112061, + "step": 215530 + }, + { + "epoch": 0.9253582682912169, + "grad_norm": 0.13211052119731903, + "learning_rate": 7.4967877685123715e-06, + "loss": 0.15862523317337035, + "step": 215540 + }, + { + "epoch": 0.925401200381237, + "grad_norm": 0.043895423412323, + "learning_rate": 7.492476048394747e-06, + "loss": 0.17331376075744628, + "step": 215550 + }, + { + "epoch": 0.925444132471257, + "grad_norm": 1.541989803314209, + "learning_rate": 7.488164328277124e-06, + "loss": 0.2534844636917114, + "step": 215560 + }, + { + "epoch": 0.9254870645612769, + "grad_norm": 1.3819447755813599, + "learning_rate": 7.4838526081595e-06, + "loss": 0.0953163206577301, + "step": 215570 + }, + { + "epoch": 0.925529996651297, + "grad_norm": 0.17048867046833038, + "learning_rate": 7.479540888041877e-06, + "loss": 0.20968384742736818, + "step": 215580 + }, + { + "epoch": 0.925572928741317, + "grad_norm": 0.2957654297351837, + "learning_rate": 7.475229167924252e-06, + "loss": 0.1413059115409851, + "step": 215590 + }, + { + "epoch": 0.925615860831337, + "grad_norm": 1.288763165473938, + "learning_rate": 7.470917447806629e-06, + "loss": 0.180072557926178, + "step": 215600 + }, + { + "epoch": 0.925658792921357, + "grad_norm": 0.014453163370490074, + "learning_rate": 7.466605727689005e-06, + "loss": 0.37809994220733645, + "step": 215610 + }, + { + "epoch": 0.925701725011377, + "grad_norm": 8.156563758850098, + "learning_rate": 7.462294007571381e-06, + "loss": 0.23681471347808838, + "step": 215620 + }, + { + "epoch": 0.925744657101397, + "grad_norm": 0.010253574699163437, + "learning_rate": 7.4579822874537575e-06, + "loss": 0.24417309761047362, + "step": 215630 + }, + { + "epoch": 0.925787589191417, + "grad_norm": 6.712070941925049, + "learning_rate": 7.453670567336134e-06, + "loss": 0.2375593900680542, + "step": 215640 + }, + { + "epoch": 0.9258305212814371, + "grad_norm": 0.011438658460974693, + "learning_rate": 7.44935884721851e-06, + "loss": 0.15096044540405273, + "step": 215650 + }, + { + "epoch": 0.925873453371457, + "grad_norm": 0.025541523471474648, + "learning_rate": 7.445047127100886e-06, + "loss": 0.11792056560516358, + "step": 215660 + }, + { + "epoch": 0.925916385461477, + "grad_norm": 2.1013917922973633, + "learning_rate": 7.440735406983263e-06, + "loss": 0.24362952709198, + "step": 215670 + }, + { + "epoch": 0.9259593175514971, + "grad_norm": 0.01850118488073349, + "learning_rate": 7.436423686865638e-06, + "loss": 0.1713258981704712, + "step": 215680 + }, + { + "epoch": 0.926002249641517, + "grad_norm": 0.047367800027132034, + "learning_rate": 7.432111966748015e-06, + "loss": 0.22293555736541748, + "step": 215690 + }, + { + "epoch": 0.926045181731537, + "grad_norm": 3.175168037414551, + "learning_rate": 7.427800246630391e-06, + "loss": 0.10906111001968384, + "step": 215700 + }, + { + "epoch": 0.9260881138215571, + "grad_norm": 0.006182890385389328, + "learning_rate": 7.423488526512768e-06, + "loss": 0.24358677864074707, + "step": 215710 + }, + { + "epoch": 0.926131045911577, + "grad_norm": 0.00020513370691332966, + "learning_rate": 7.419176806395143e-06, + "loss": 0.03684086203575134, + "step": 215720 + }, + { + "epoch": 0.9261739780015971, + "grad_norm": 0.010843836702406406, + "learning_rate": 7.41486508627752e-06, + "loss": 0.024738329648971557, + "step": 215730 + }, + { + "epoch": 0.9262169100916171, + "grad_norm": 0.07120528817176819, + "learning_rate": 7.410553366159896e-06, + "loss": 0.07912518382072449, + "step": 215740 + }, + { + "epoch": 0.926259842181637, + "grad_norm": 0.012355622835457325, + "learning_rate": 7.406241646042272e-06, + "loss": 0.1800381064414978, + "step": 215750 + }, + { + "epoch": 0.9263027742716571, + "grad_norm": 0.040920548141002655, + "learning_rate": 7.4019299259246485e-06, + "loss": 0.3121651649475098, + "step": 215760 + }, + { + "epoch": 0.9263457063616771, + "grad_norm": 0.15170609951019287, + "learning_rate": 7.397618205807025e-06, + "loss": 0.3069912433624268, + "step": 215770 + }, + { + "epoch": 0.926388638451697, + "grad_norm": 1.5117013454437256, + "learning_rate": 7.393306485689401e-06, + "loss": 0.4376046180725098, + "step": 215780 + }, + { + "epoch": 0.9264315705417171, + "grad_norm": 6.3107733726501465, + "learning_rate": 7.388994765571777e-06, + "loss": 0.19311146736145018, + "step": 215790 + }, + { + "epoch": 0.9264745026317371, + "grad_norm": 0.010845409706234932, + "learning_rate": 7.384683045454154e-06, + "loss": 0.11981412172317504, + "step": 215800 + }, + { + "epoch": 0.9265174347217571, + "grad_norm": 0.029103947803378105, + "learning_rate": 7.38037132533653e-06, + "loss": 0.17413549423217772, + "step": 215810 + }, + { + "epoch": 0.9265603668117771, + "grad_norm": 6.210660457611084, + "learning_rate": 7.376059605218906e-06, + "loss": 0.37948288917541506, + "step": 215820 + }, + { + "epoch": 0.9266032989017972, + "grad_norm": 3.099215269088745, + "learning_rate": 7.371747885101282e-06, + "loss": 0.2719104766845703, + "step": 215830 + }, + { + "epoch": 0.9266462309918172, + "grad_norm": 0.1082659363746643, + "learning_rate": 7.367436164983659e-06, + "loss": 0.13931306600570678, + "step": 215840 + }, + { + "epoch": 0.9266891630818371, + "grad_norm": 0.8399651050567627, + "learning_rate": 7.3631244448660345e-06, + "loss": 0.5135842323303222, + "step": 215850 + }, + { + "epoch": 0.9267320951718572, + "grad_norm": 0.02767796255648136, + "learning_rate": 7.358812724748411e-06, + "loss": 0.35433938503265383, + "step": 215860 + }, + { + "epoch": 0.9267750272618772, + "grad_norm": 0.08824355900287628, + "learning_rate": 7.3545010046307874e-06, + "loss": 0.14087847471237183, + "step": 215870 + }, + { + "epoch": 0.9268179593518971, + "grad_norm": 0.05289234593510628, + "learning_rate": 7.350189284513163e-06, + "loss": 0.09670596718788146, + "step": 215880 + }, + { + "epoch": 0.9268608914419172, + "grad_norm": 0.4825235605239868, + "learning_rate": 7.34587756439554e-06, + "loss": 0.23260304927825928, + "step": 215890 + }, + { + "epoch": 0.9269038235319372, + "grad_norm": 1.406697154045105, + "learning_rate": 7.341565844277917e-06, + "loss": 0.13096822500228883, + "step": 215900 + }, + { + "epoch": 0.9269467556219572, + "grad_norm": 0.16242437064647675, + "learning_rate": 7.3372541241602934e-06, + "loss": 0.17526451349258423, + "step": 215910 + }, + { + "epoch": 0.9269896877119772, + "grad_norm": 1.7749894857406616, + "learning_rate": 7.33294240404267e-06, + "loss": 0.19636834859848024, + "step": 215920 + }, + { + "epoch": 0.9270326198019972, + "grad_norm": 0.00374089227989316, + "learning_rate": 7.3286306839250456e-06, + "loss": 0.07796565294265748, + "step": 215930 + }, + { + "epoch": 0.9270755518920172, + "grad_norm": 5.736209869384766, + "learning_rate": 7.324318963807422e-06, + "loss": 0.4099263668060303, + "step": 215940 + }, + { + "epoch": 0.9271184839820372, + "grad_norm": 0.009525381959974766, + "learning_rate": 7.3200072436897986e-06, + "loss": 0.09380664825439453, + "step": 215950 + }, + { + "epoch": 0.9271614160720573, + "grad_norm": 0.018462950363755226, + "learning_rate": 7.315695523572175e-06, + "loss": 0.17773728370666503, + "step": 215960 + }, + { + "epoch": 0.9272043481620772, + "grad_norm": 2.080399751663208, + "learning_rate": 7.311383803454551e-06, + "loss": 0.15385993719100952, + "step": 215970 + }, + { + "epoch": 0.9272472802520972, + "grad_norm": 0.0018877952825278044, + "learning_rate": 7.307072083336927e-06, + "loss": 0.1597719669342041, + "step": 215980 + }, + { + "epoch": 0.9272902123421173, + "grad_norm": 0.12085002660751343, + "learning_rate": 7.302760363219304e-06, + "loss": 0.14481079578399658, + "step": 215990 + }, + { + "epoch": 0.9273331444321372, + "grad_norm": 0.006337421014904976, + "learning_rate": 7.298448643101679e-06, + "loss": 0.24477388858795165, + "step": 216000 + }, + { + "epoch": 0.9273331444321372, + "eval_loss": 0.37125617265701294, + "eval_runtime": 27.3992, + "eval_samples_per_second": 3.65, + "eval_steps_per_second": 3.65, + "step": 216000 + }, + { + "epoch": 0.9273760765221573, + "grad_norm": 0.004524619318544865, + "learning_rate": 7.294136922984056e-06, + "loss": 0.3099964618682861, + "step": 216010 + }, + { + "epoch": 0.9274190086121773, + "grad_norm": 0.6417093276977539, + "learning_rate": 7.289825202866432e-06, + "loss": 0.1063919186592102, + "step": 216020 + }, + { + "epoch": 0.9274619407021972, + "grad_norm": 0.003405363531783223, + "learning_rate": 7.285513482748808e-06, + "loss": 0.08869165778160096, + "step": 216030 + }, + { + "epoch": 0.9275048727922173, + "grad_norm": 1.994133710861206, + "learning_rate": 7.2812017626311845e-06, + "loss": 0.09733704328536988, + "step": 216040 + }, + { + "epoch": 0.9275478048822373, + "grad_norm": 1.5306503772735596, + "learning_rate": 7.276890042513561e-06, + "loss": 0.4169147491455078, + "step": 216050 + }, + { + "epoch": 0.9275907369722572, + "grad_norm": 0.00030547488131560385, + "learning_rate": 7.2725783223959375e-06, + "loss": 0.38906643390655515, + "step": 216060 + }, + { + "epoch": 0.9276336690622773, + "grad_norm": 0.1737918257713318, + "learning_rate": 7.268266602278313e-06, + "loss": 0.01735979914665222, + "step": 216070 + }, + { + "epoch": 0.9276766011522973, + "grad_norm": 0.006991616450250149, + "learning_rate": 7.26395488216069e-06, + "loss": 0.10484539270401001, + "step": 216080 + }, + { + "epoch": 0.9277195332423173, + "grad_norm": 1.4587398767471313, + "learning_rate": 7.259643162043066e-06, + "loss": 0.16874505281448365, + "step": 216090 + }, + { + "epoch": 0.9277624653323373, + "grad_norm": 0.03317570313811302, + "learning_rate": 7.255331441925442e-06, + "loss": 0.19244474172592163, + "step": 216100 + }, + { + "epoch": 0.9278053974223573, + "grad_norm": 0.0003741345426533371, + "learning_rate": 7.251019721807818e-06, + "loss": 0.14304815530776976, + "step": 216110 + }, + { + "epoch": 0.9278483295123773, + "grad_norm": 0.0023389640264213085, + "learning_rate": 7.246708001690195e-06, + "loss": 0.1601564407348633, + "step": 216120 + }, + { + "epoch": 0.9278912616023973, + "grad_norm": 0.7748071551322937, + "learning_rate": 7.24239628157257e-06, + "loss": 0.24553565979003905, + "step": 216130 + }, + { + "epoch": 0.9279341936924174, + "grad_norm": 1.804882287979126, + "learning_rate": 7.238084561454947e-06, + "loss": 0.08244321942329406, + "step": 216140 + }, + { + "epoch": 0.9279771257824373, + "grad_norm": 0.0013793292455375195, + "learning_rate": 7.233772841337323e-06, + "loss": 0.3597370386123657, + "step": 216150 + }, + { + "epoch": 0.9280200578724573, + "grad_norm": 0.009140110574662685, + "learning_rate": 7.229461121219699e-06, + "loss": 0.1765925645828247, + "step": 216160 + }, + { + "epoch": 0.9280629899624774, + "grad_norm": 0.007054975721985102, + "learning_rate": 7.2251494011020755e-06, + "loss": 0.26095626354217527, + "step": 216170 + }, + { + "epoch": 0.9281059220524973, + "grad_norm": 0.9550222158432007, + "learning_rate": 7.220837680984452e-06, + "loss": 0.127244234085083, + "step": 216180 + }, + { + "epoch": 0.9281488541425174, + "grad_norm": 5.590497970581055, + "learning_rate": 7.2165259608668285e-06, + "loss": 0.250270938873291, + "step": 216190 + }, + { + "epoch": 0.9281917862325374, + "grad_norm": 2.962364912033081, + "learning_rate": 7.212214240749204e-06, + "loss": 0.4075340270996094, + "step": 216200 + }, + { + "epoch": 0.9282347183225573, + "grad_norm": 0.0032614811789244413, + "learning_rate": 7.207902520631581e-06, + "loss": 0.16378134489059448, + "step": 216210 + }, + { + "epoch": 0.9282776504125774, + "grad_norm": 1.4757379293441772, + "learning_rate": 7.203590800513957e-06, + "loss": 0.27907600402832033, + "step": 216220 + }, + { + "epoch": 0.9283205825025974, + "grad_norm": 0.00793127715587616, + "learning_rate": 7.199279080396333e-06, + "loss": 0.22693383693695068, + "step": 216230 + }, + { + "epoch": 0.9283635145926173, + "grad_norm": 0.08194581419229507, + "learning_rate": 7.194967360278709e-06, + "loss": 0.06598881483078003, + "step": 216240 + }, + { + "epoch": 0.9284064466826374, + "grad_norm": 0.02816123329102993, + "learning_rate": 7.190655640161086e-06, + "loss": 0.027814167737960815, + "step": 216250 + }, + { + "epoch": 0.9284493787726574, + "grad_norm": 3.995386838912964, + "learning_rate": 7.1863439200434615e-06, + "loss": 0.3064185857772827, + "step": 216260 + }, + { + "epoch": 0.9284923108626775, + "grad_norm": 0.08040156215429306, + "learning_rate": 7.18203219992584e-06, + "loss": 0.32654943466186526, + "step": 216270 + }, + { + "epoch": 0.9285352429526974, + "grad_norm": 0.0028185301925987005, + "learning_rate": 7.177720479808215e-06, + "loss": 0.22416894435882567, + "step": 216280 + }, + { + "epoch": 0.9285781750427174, + "grad_norm": 0.0008616048726253211, + "learning_rate": 7.173408759690592e-06, + "loss": 0.336730432510376, + "step": 216290 + }, + { + "epoch": 0.9286211071327375, + "grad_norm": 0.0030813466291874647, + "learning_rate": 7.169097039572968e-06, + "loss": 0.07731318473815918, + "step": 216300 + }, + { + "epoch": 0.9286640392227574, + "grad_norm": 0.003057356458157301, + "learning_rate": 7.164785319455345e-06, + "loss": 0.08529521822929383, + "step": 216310 + }, + { + "epoch": 0.9287069713127775, + "grad_norm": 0.0021866008173674345, + "learning_rate": 7.1604735993377204e-06, + "loss": 0.13397653102874757, + "step": 216320 + }, + { + "epoch": 0.9287499034027975, + "grad_norm": 10.670136451721191, + "learning_rate": 7.156161879220097e-06, + "loss": 0.11311818361282348, + "step": 216330 + }, + { + "epoch": 0.9287928354928174, + "grad_norm": 0.053423043340444565, + "learning_rate": 7.1518501591024734e-06, + "loss": 0.1762098789215088, + "step": 216340 + }, + { + "epoch": 0.9288357675828375, + "grad_norm": 0.12246271222829819, + "learning_rate": 7.147538438984849e-06, + "loss": 0.15174304246902465, + "step": 216350 + }, + { + "epoch": 0.9288786996728575, + "grad_norm": 3.375256061553955, + "learning_rate": 7.143226718867226e-06, + "loss": 0.21079769134521484, + "step": 216360 + }, + { + "epoch": 0.9289216317628775, + "grad_norm": 0.113634392619133, + "learning_rate": 7.138914998749602e-06, + "loss": 0.2766488313674927, + "step": 216370 + }, + { + "epoch": 0.9289645638528975, + "grad_norm": 0.8795444965362549, + "learning_rate": 7.134603278631978e-06, + "loss": 0.3603955268859863, + "step": 216380 + }, + { + "epoch": 0.9290074959429175, + "grad_norm": 1.135464072227478, + "learning_rate": 7.130291558514354e-06, + "loss": 0.22582101821899414, + "step": 216390 + }, + { + "epoch": 0.9290504280329375, + "grad_norm": 0.5949857831001282, + "learning_rate": 7.125979838396731e-06, + "loss": 0.16929233074188232, + "step": 216400 + }, + { + "epoch": 0.9290933601229575, + "grad_norm": 0.012552267871797085, + "learning_rate": 7.121668118279106e-06, + "loss": 0.16296029090881348, + "step": 216410 + }, + { + "epoch": 0.9291362922129776, + "grad_norm": 0.01894237846136093, + "learning_rate": 7.117356398161483e-06, + "loss": 0.017162826657295228, + "step": 216420 + }, + { + "epoch": 0.9291792243029975, + "grad_norm": 0.01409218367189169, + "learning_rate": 7.113044678043859e-06, + "loss": 0.6158824443817139, + "step": 216430 + }, + { + "epoch": 0.9292221563930175, + "grad_norm": 0.5395675897598267, + "learning_rate": 7.108732957926236e-06, + "loss": 0.20108966827392577, + "step": 216440 + }, + { + "epoch": 0.9292650884830376, + "grad_norm": 0.006730252411216497, + "learning_rate": 7.1044212378086115e-06, + "loss": 0.1388014554977417, + "step": 216450 + }, + { + "epoch": 0.9293080205730575, + "grad_norm": 0.005381924100220203, + "learning_rate": 7.100109517690988e-06, + "loss": 0.3470167875289917, + "step": 216460 + }, + { + "epoch": 0.9293509526630775, + "grad_norm": 0.001024061581119895, + "learning_rate": 7.0957977975733645e-06, + "loss": 0.207639741897583, + "step": 216470 + }, + { + "epoch": 0.9293938847530976, + "grad_norm": 0.5139259696006775, + "learning_rate": 7.09148607745574e-06, + "loss": 0.3020778179168701, + "step": 216480 + }, + { + "epoch": 0.9294368168431175, + "grad_norm": 0.3843024969100952, + "learning_rate": 7.087174357338117e-06, + "loss": 0.2944885015487671, + "step": 216490 + }, + { + "epoch": 0.9294797489331376, + "grad_norm": 1.2467830181121826, + "learning_rate": 7.082862637220493e-06, + "loss": 0.15704236030578614, + "step": 216500 + }, + { + "epoch": 0.9295226810231576, + "grad_norm": 0.022638993337750435, + "learning_rate": 7.078550917102869e-06, + "loss": 0.12156269550323487, + "step": 216510 + }, + { + "epoch": 0.9295656131131775, + "grad_norm": 0.007260969839990139, + "learning_rate": 7.074239196985245e-06, + "loss": 0.20155694484710693, + "step": 216520 + }, + { + "epoch": 0.9296085452031976, + "grad_norm": 6.707982063293457, + "learning_rate": 7.069927476867622e-06, + "loss": 0.4008821964263916, + "step": 216530 + }, + { + "epoch": 0.9296514772932176, + "grad_norm": 0.00419988576322794, + "learning_rate": 7.065615756749997e-06, + "loss": 0.10957802534103393, + "step": 216540 + }, + { + "epoch": 0.9296944093832376, + "grad_norm": 7.6871209144592285, + "learning_rate": 7.061304036632374e-06, + "loss": 0.22321274280548095, + "step": 216550 + }, + { + "epoch": 0.9297373414732576, + "grad_norm": 0.021390387788414955, + "learning_rate": 7.05699231651475e-06, + "loss": 0.07745559811592102, + "step": 216560 + }, + { + "epoch": 0.9297802735632776, + "grad_norm": 0.015582584775984287, + "learning_rate": 7.052680596397127e-06, + "loss": 0.12427722215652466, + "step": 216570 + }, + { + "epoch": 0.9298232056532976, + "grad_norm": 0.43360817432403564, + "learning_rate": 7.0483688762795026e-06, + "loss": 0.01620929390192032, + "step": 216580 + }, + { + "epoch": 0.9298661377433176, + "grad_norm": 0.48537853360176086, + "learning_rate": 7.044057156161879e-06, + "loss": 0.20732901096343995, + "step": 216590 + }, + { + "epoch": 0.9299090698333377, + "grad_norm": 2.50187611579895, + "learning_rate": 7.0397454360442555e-06, + "loss": 0.3742201805114746, + "step": 216600 + }, + { + "epoch": 0.9299520019233576, + "grad_norm": 0.06702170521020889, + "learning_rate": 7.035433715926631e-06, + "loss": 0.23646984100341797, + "step": 216610 + }, + { + "epoch": 0.9299949340133776, + "grad_norm": 0.4540206789970398, + "learning_rate": 7.031121995809008e-06, + "loss": 0.40461249351501466, + "step": 216620 + }, + { + "epoch": 0.9300378661033977, + "grad_norm": 0.0032786328811198473, + "learning_rate": 7.026810275691385e-06, + "loss": 0.25916366577148436, + "step": 216630 + }, + { + "epoch": 0.9300807981934176, + "grad_norm": 0.002468528226017952, + "learning_rate": 7.0224985555737615e-06, + "loss": 0.19205652475357055, + "step": 216640 + }, + { + "epoch": 0.9301237302834376, + "grad_norm": 0.09234610944986343, + "learning_rate": 7.018186835456138e-06, + "loss": 0.16778699159622193, + "step": 216650 + }, + { + "epoch": 0.9301666623734577, + "grad_norm": 0.00125981867313385, + "learning_rate": 7.013875115338514e-06, + "loss": 0.10867637395858765, + "step": 216660 + }, + { + "epoch": 0.9302095944634776, + "grad_norm": 0.0003518553567118943, + "learning_rate": 7.00956339522089e-06, + "loss": 0.2450389862060547, + "step": 216670 + }, + { + "epoch": 0.9302525265534977, + "grad_norm": 0.21603882312774658, + "learning_rate": 7.005251675103267e-06, + "loss": 0.15238502025604247, + "step": 216680 + }, + { + "epoch": 0.9302954586435177, + "grad_norm": 0.00265440228395164, + "learning_rate": 7.000939954985643e-06, + "loss": 0.0186956986784935, + "step": 216690 + }, + { + "epoch": 0.9303383907335376, + "grad_norm": 0.00983706209808588, + "learning_rate": 6.996628234868019e-06, + "loss": 0.13357884883880616, + "step": 216700 + }, + { + "epoch": 0.9303813228235577, + "grad_norm": 0.0024289621505886316, + "learning_rate": 6.992316514750395e-06, + "loss": 0.21640503406524658, + "step": 216710 + }, + { + "epoch": 0.9304242549135777, + "grad_norm": 0.15975762903690338, + "learning_rate": 6.988004794632772e-06, + "loss": 0.10567178726196289, + "step": 216720 + }, + { + "epoch": 0.9304671870035978, + "grad_norm": 0.03177304193377495, + "learning_rate": 6.9836930745151475e-06, + "loss": 0.16201092004776002, + "step": 216730 + }, + { + "epoch": 0.9305101190936177, + "grad_norm": 0.022674480453133583, + "learning_rate": 6.979381354397524e-06, + "loss": 0.26315600872039796, + "step": 216740 + }, + { + "epoch": 0.9305530511836377, + "grad_norm": 0.019809991121292114, + "learning_rate": 6.9750696342799004e-06, + "loss": 0.1809520125389099, + "step": 216750 + }, + { + "epoch": 0.9305959832736578, + "grad_norm": 1.3690739870071411, + "learning_rate": 6.970757914162276e-06, + "loss": 0.21216487884521484, + "step": 216760 + }, + { + "epoch": 0.9306389153636777, + "grad_norm": 0.0006519387243315578, + "learning_rate": 6.966446194044653e-06, + "loss": 0.23284423351287842, + "step": 216770 + }, + { + "epoch": 0.9306818474536978, + "grad_norm": 1.3061208724975586, + "learning_rate": 6.962134473927029e-06, + "loss": 0.04781084656715393, + "step": 216780 + }, + { + "epoch": 0.9307247795437178, + "grad_norm": 0.007215921767055988, + "learning_rate": 6.957822753809405e-06, + "loss": 0.16829094886779786, + "step": 216790 + }, + { + "epoch": 0.9307677116337377, + "grad_norm": 4.176065921783447, + "learning_rate": 6.953511033691781e-06, + "loss": 0.22234597206115722, + "step": 216800 + }, + { + "epoch": 0.9308106437237578, + "grad_norm": 0.19154906272888184, + "learning_rate": 6.949199313574158e-06, + "loss": 0.12716169357299806, + "step": 216810 + }, + { + "epoch": 0.9308535758137778, + "grad_norm": 5.86802339553833, + "learning_rate": 6.944887593456534e-06, + "loss": 0.29127726554870603, + "step": 216820 + }, + { + "epoch": 0.9308965079037977, + "grad_norm": 3.3478128910064697, + "learning_rate": 6.94057587333891e-06, + "loss": 0.3179521083831787, + "step": 216830 + }, + { + "epoch": 0.9309394399938178, + "grad_norm": 0.002998100593686104, + "learning_rate": 6.936264153221286e-06, + "loss": 0.0034219883382320404, + "step": 216840 + }, + { + "epoch": 0.9309823720838378, + "grad_norm": 2.541491746902466, + "learning_rate": 6.931952433103663e-06, + "loss": 0.23211936950683593, + "step": 216850 + }, + { + "epoch": 0.9310253041738578, + "grad_norm": 0.2962930500507355, + "learning_rate": 6.9276407129860385e-06, + "loss": 0.11564910411834717, + "step": 216860 + }, + { + "epoch": 0.9310682362638778, + "grad_norm": 2.9043209552764893, + "learning_rate": 6.923328992868415e-06, + "loss": 0.15632420778274536, + "step": 216870 + }, + { + "epoch": 0.9311111683538978, + "grad_norm": 3.168398380279541, + "learning_rate": 6.9190172727507915e-06, + "loss": 0.11619545221328735, + "step": 216880 + }, + { + "epoch": 0.9311541004439178, + "grad_norm": 0.0030346859712153673, + "learning_rate": 6.914705552633167e-06, + "loss": 0.1562952995300293, + "step": 216890 + }, + { + "epoch": 0.9311970325339378, + "grad_norm": 5.6514692306518555, + "learning_rate": 6.910393832515544e-06, + "loss": 0.2962817192077637, + "step": 216900 + }, + { + "epoch": 0.9312399646239579, + "grad_norm": 40.44093322753906, + "learning_rate": 6.90608211239792e-06, + "loss": 0.14009404182434082, + "step": 216910 + }, + { + "epoch": 0.9312828967139778, + "grad_norm": 0.12538249790668488, + "learning_rate": 6.901770392280296e-06, + "loss": 0.09896424412727356, + "step": 216920 + }, + { + "epoch": 0.9313258288039978, + "grad_norm": 4.35367488861084, + "learning_rate": 6.897458672162672e-06, + "loss": 0.2702718019485474, + "step": 216930 + }, + { + "epoch": 0.9313687608940179, + "grad_norm": 0.004443527199327946, + "learning_rate": 6.893146952045049e-06, + "loss": 0.1984582781791687, + "step": 216940 + }, + { + "epoch": 0.9314116929840378, + "grad_norm": 0.7379357814788818, + "learning_rate": 6.888835231927425e-06, + "loss": 0.1050255537033081, + "step": 216950 + }, + { + "epoch": 0.9314546250740579, + "grad_norm": 1.528487205505371, + "learning_rate": 6.884523511809801e-06, + "loss": 0.26220476627349854, + "step": 216960 + }, + { + "epoch": 0.9314975571640779, + "grad_norm": 0.0015041300794109702, + "learning_rate": 6.8802117916921774e-06, + "loss": 0.19335163831710817, + "step": 216970 + }, + { + "epoch": 0.9315404892540978, + "grad_norm": 3.583026170730591, + "learning_rate": 6.875900071574554e-06, + "loss": 0.4297220230102539, + "step": 216980 + }, + { + "epoch": 0.9315834213441179, + "grad_norm": 2.160015344619751, + "learning_rate": 6.871588351456931e-06, + "loss": 0.41837215423583984, + "step": 216990 + }, + { + "epoch": 0.9316263534341379, + "grad_norm": 0.005593061912804842, + "learning_rate": 6.867276631339308e-06, + "loss": 0.2308417320251465, + "step": 217000 + }, + { + "epoch": 0.9316263534341379, + "eval_loss": 0.3718287944793701, + "eval_runtime": 27.3656, + "eval_samples_per_second": 3.654, + "eval_steps_per_second": 3.654, + "step": 217000 + }, + { + "epoch": 0.9316692855241578, + "grad_norm": 0.00907901581376791, + "learning_rate": 6.862964911221683e-06, + "loss": 0.24105193614959716, + "step": 217010 + }, + { + "epoch": 0.9317122176141779, + "grad_norm": 0.15819871425628662, + "learning_rate": 6.85865319110406e-06, + "loss": 0.13366590738296508, + "step": 217020 + }, + { + "epoch": 0.9317551497041979, + "grad_norm": 1.8451436758041382, + "learning_rate": 6.854341470986436e-06, + "loss": 0.17509522438049316, + "step": 217030 + }, + { + "epoch": 0.9317980817942179, + "grad_norm": 1.5129737854003906, + "learning_rate": 6.850029750868812e-06, + "loss": 0.3355816125869751, + "step": 217040 + }, + { + "epoch": 0.9318410138842379, + "grad_norm": 3.076338052749634, + "learning_rate": 6.8457180307511885e-06, + "loss": 0.3039119005203247, + "step": 217050 + }, + { + "epoch": 0.9318839459742579, + "grad_norm": 0.12309716641902924, + "learning_rate": 6.841406310633565e-06, + "loss": 0.1814027786254883, + "step": 217060 + }, + { + "epoch": 0.9319268780642779, + "grad_norm": 0.8530434966087341, + "learning_rate": 6.8370945905159415e-06, + "loss": 0.2380002737045288, + "step": 217070 + }, + { + "epoch": 0.9319698101542979, + "grad_norm": 0.05917308107018471, + "learning_rate": 6.832782870398317e-06, + "loss": 0.06543290615081787, + "step": 217080 + }, + { + "epoch": 0.932012742244318, + "grad_norm": 0.012692649848759174, + "learning_rate": 6.828471150280694e-06, + "loss": 0.24887619018554688, + "step": 217090 + }, + { + "epoch": 0.9320556743343379, + "grad_norm": 0.019568748772144318, + "learning_rate": 6.82415943016307e-06, + "loss": 0.1854008674621582, + "step": 217100 + }, + { + "epoch": 0.9320986064243579, + "grad_norm": 0.0013383131008595228, + "learning_rate": 6.819847710045446e-06, + "loss": 0.32312760353088377, + "step": 217110 + }, + { + "epoch": 0.932141538514378, + "grad_norm": 0.027715496718883514, + "learning_rate": 6.815535989927822e-06, + "loss": 0.04742673635482788, + "step": 217120 + }, + { + "epoch": 0.9321844706043979, + "grad_norm": 0.6883767247200012, + "learning_rate": 6.811224269810199e-06, + "loss": 0.44977903366088867, + "step": 217130 + }, + { + "epoch": 0.932227402694418, + "grad_norm": 0.014982739463448524, + "learning_rate": 6.8069125496925745e-06, + "loss": 0.01991720199584961, + "step": 217140 + }, + { + "epoch": 0.932270334784438, + "grad_norm": 1.052545428276062, + "learning_rate": 6.802600829574951e-06, + "loss": 0.3021081447601318, + "step": 217150 + }, + { + "epoch": 0.932313266874458, + "grad_norm": 0.022616246715188026, + "learning_rate": 6.7982891094573275e-06, + "loss": 0.1141018033027649, + "step": 217160 + }, + { + "epoch": 0.932356198964478, + "grad_norm": 0.8831282258033752, + "learning_rate": 6.793977389339703e-06, + "loss": 0.28502347469329836, + "step": 217170 + }, + { + "epoch": 0.932399131054498, + "grad_norm": 0.5590686798095703, + "learning_rate": 6.78966566922208e-06, + "loss": 0.12975300550460817, + "step": 217180 + }, + { + "epoch": 0.932442063144518, + "grad_norm": 2.844775915145874, + "learning_rate": 6.785353949104456e-06, + "loss": 0.29583823680877686, + "step": 217190 + }, + { + "epoch": 0.932484995234538, + "grad_norm": 0.0781247541308403, + "learning_rate": 6.781042228986833e-06, + "loss": 0.1255470871925354, + "step": 217200 + }, + { + "epoch": 0.932527927324558, + "grad_norm": 3.2160632610321045, + "learning_rate": 6.776730508869208e-06, + "loss": 0.1684964656829834, + "step": 217210 + }, + { + "epoch": 0.9325708594145781, + "grad_norm": 3.4206385612487793, + "learning_rate": 6.772418788751585e-06, + "loss": 0.37236800193786623, + "step": 217220 + }, + { + "epoch": 0.932613791504598, + "grad_norm": 0.0038313406985253096, + "learning_rate": 6.768107068633961e-06, + "loss": 0.17248566150665284, + "step": 217230 + }, + { + "epoch": 0.932656723594618, + "grad_norm": 0.038741614669561386, + "learning_rate": 6.763795348516337e-06, + "loss": 0.12119134664535522, + "step": 217240 + }, + { + "epoch": 0.9326996556846381, + "grad_norm": 6.805099010467529, + "learning_rate": 6.759483628398713e-06, + "loss": 0.4294279098510742, + "step": 217250 + }, + { + "epoch": 0.932742587774658, + "grad_norm": 1.4378046989440918, + "learning_rate": 6.75517190828109e-06, + "loss": 0.3188670873641968, + "step": 217260 + }, + { + "epoch": 0.9327855198646781, + "grad_norm": 0.9875854849815369, + "learning_rate": 6.7508601881634655e-06, + "loss": 0.10516676902770997, + "step": 217270 + }, + { + "epoch": 0.9328284519546981, + "grad_norm": 0.013433823361992836, + "learning_rate": 6.746548468045842e-06, + "loss": 0.25795345306396483, + "step": 217280 + }, + { + "epoch": 0.932871384044718, + "grad_norm": 0.016448192298412323, + "learning_rate": 6.7422367479282185e-06, + "loss": 0.08310087323188782, + "step": 217290 + }, + { + "epoch": 0.9329143161347381, + "grad_norm": 0.01219375804066658, + "learning_rate": 6.737925027810594e-06, + "loss": 0.22616422176361084, + "step": 217300 + }, + { + "epoch": 0.9329572482247581, + "grad_norm": 3.024327278137207, + "learning_rate": 6.733613307692971e-06, + "loss": 0.13399930000305177, + "step": 217310 + }, + { + "epoch": 0.933000180314778, + "grad_norm": 2.9267966747283936, + "learning_rate": 6.729301587575347e-06, + "loss": 0.32572245597839355, + "step": 217320 + }, + { + "epoch": 0.9330431124047981, + "grad_norm": 0.055614180862903595, + "learning_rate": 6.724989867457724e-06, + "loss": 0.402554988861084, + "step": 217330 + }, + { + "epoch": 0.9330860444948181, + "grad_norm": 0.0038208523765206337, + "learning_rate": 6.720678147340099e-06, + "loss": 0.08428694009780884, + "step": 217340 + }, + { + "epoch": 0.9331289765848381, + "grad_norm": 0.004386112093925476, + "learning_rate": 6.716366427222476e-06, + "loss": 0.26659092903137205, + "step": 217350 + }, + { + "epoch": 0.9331719086748581, + "grad_norm": 0.0014016431523486972, + "learning_rate": 6.712054707104853e-06, + "loss": 0.006140134483575821, + "step": 217360 + }, + { + "epoch": 0.9332148407648782, + "grad_norm": 0.3096982538700104, + "learning_rate": 6.70774298698723e-06, + "loss": 0.11977262496948242, + "step": 217370 + }, + { + "epoch": 0.9332577728548981, + "grad_norm": 0.05670047178864479, + "learning_rate": 6.703431266869606e-06, + "loss": 0.22703907489776612, + "step": 217380 + }, + { + "epoch": 0.9333007049449181, + "grad_norm": 0.04537387937307358, + "learning_rate": 6.699119546751982e-06, + "loss": 0.07578898072242737, + "step": 217390 + }, + { + "epoch": 0.9333436370349382, + "grad_norm": 0.06389743089675903, + "learning_rate": 6.694807826634358e-06, + "loss": 0.010492034256458282, + "step": 217400 + }, + { + "epoch": 0.9333865691249581, + "grad_norm": 0.02293463796377182, + "learning_rate": 6.690496106516735e-06, + "loss": 0.438470458984375, + "step": 217410 + }, + { + "epoch": 0.9334295012149781, + "grad_norm": 0.003789474256336689, + "learning_rate": 6.6861843863991104e-06, + "loss": 0.23822336196899413, + "step": 217420 + }, + { + "epoch": 0.9334724333049982, + "grad_norm": 0.2004413604736328, + "learning_rate": 6.681872666281487e-06, + "loss": 0.12822024822235106, + "step": 217430 + }, + { + "epoch": 0.9335153653950181, + "grad_norm": 0.3013432025909424, + "learning_rate": 6.677560946163863e-06, + "loss": 0.16031705141067504, + "step": 217440 + }, + { + "epoch": 0.9335582974850382, + "grad_norm": 1.5371167659759521, + "learning_rate": 6.67324922604624e-06, + "loss": 0.17891354560852052, + "step": 217450 + }, + { + "epoch": 0.9336012295750582, + "grad_norm": 0.011646891944110394, + "learning_rate": 6.6689375059286156e-06, + "loss": 0.05217450261116028, + "step": 217460 + }, + { + "epoch": 0.9336441616650781, + "grad_norm": 0.01779901422560215, + "learning_rate": 6.664625785810992e-06, + "loss": 0.06180897355079651, + "step": 217470 + }, + { + "epoch": 0.9336870937550982, + "grad_norm": 0.04970623552799225, + "learning_rate": 6.6603140656933686e-06, + "loss": 0.16953123807907106, + "step": 217480 + }, + { + "epoch": 0.9337300258451182, + "grad_norm": 3.2656664848327637, + "learning_rate": 6.656002345575744e-06, + "loss": 0.13938552141189575, + "step": 217490 + }, + { + "epoch": 0.9337729579351381, + "grad_norm": 0.09889588505029678, + "learning_rate": 6.651690625458121e-06, + "loss": 0.20841660499572753, + "step": 217500 + }, + { + "epoch": 0.9338158900251582, + "grad_norm": 3.853638172149658, + "learning_rate": 6.647378905340497e-06, + "loss": 0.2557401180267334, + "step": 217510 + }, + { + "epoch": 0.9338588221151782, + "grad_norm": 0.19543249905109406, + "learning_rate": 6.643067185222873e-06, + "loss": 0.15189332962036134, + "step": 217520 + }, + { + "epoch": 0.9339017542051982, + "grad_norm": 0.22307856380939484, + "learning_rate": 6.638755465105249e-06, + "loss": 0.2102879285812378, + "step": 217530 + }, + { + "epoch": 0.9339446862952182, + "grad_norm": 0.026279602199792862, + "learning_rate": 6.634443744987626e-06, + "loss": 0.23108108043670655, + "step": 217540 + }, + { + "epoch": 0.9339876183852382, + "grad_norm": 0.03201638534665108, + "learning_rate": 6.6301320248700015e-06, + "loss": 0.0656770944595337, + "step": 217550 + }, + { + "epoch": 0.9340305504752582, + "grad_norm": 0.008588535711169243, + "learning_rate": 6.625820304752378e-06, + "loss": 0.1322205901145935, + "step": 217560 + }, + { + "epoch": 0.9340734825652782, + "grad_norm": 0.015069599263370037, + "learning_rate": 6.6215085846347545e-06, + "loss": 0.14335529804229735, + "step": 217570 + }, + { + "epoch": 0.9341164146552983, + "grad_norm": 0.47126927971839905, + "learning_rate": 6.617196864517131e-06, + "loss": 0.18532544374465942, + "step": 217580 + }, + { + "epoch": 0.9341593467453183, + "grad_norm": 2.1417133808135986, + "learning_rate": 6.612885144399507e-06, + "loss": 0.14605796337127686, + "step": 217590 + }, + { + "epoch": 0.9342022788353382, + "grad_norm": 2.728137254714966, + "learning_rate": 6.608573424281883e-06, + "loss": 0.21810925006866455, + "step": 217600 + }, + { + "epoch": 0.9342452109253583, + "grad_norm": 8.483184814453125, + "learning_rate": 6.60426170416426e-06, + "loss": 0.20435698032379152, + "step": 217610 + }, + { + "epoch": 0.9342881430153783, + "grad_norm": 3.666672468185425, + "learning_rate": 6.599949984046635e-06, + "loss": 0.027203971147537233, + "step": 217620 + }, + { + "epoch": 0.9343310751053983, + "grad_norm": 0.0007382782059721649, + "learning_rate": 6.595638263929012e-06, + "loss": 0.13797013759613036, + "step": 217630 + }, + { + "epoch": 0.9343740071954183, + "grad_norm": 0.0005268286331556737, + "learning_rate": 6.591326543811388e-06, + "loss": 0.16217823028564454, + "step": 217640 + }, + { + "epoch": 0.9344169392854383, + "grad_norm": 0.2898944914340973, + "learning_rate": 6.587014823693764e-06, + "loss": 0.050319230556488036, + "step": 217650 + }, + { + "epoch": 0.9344598713754583, + "grad_norm": 0.010052835568785667, + "learning_rate": 6.58270310357614e-06, + "loss": 0.12320134639739991, + "step": 217660 + }, + { + "epoch": 0.9345028034654783, + "grad_norm": 0.016634009778499603, + "learning_rate": 6.578391383458517e-06, + "loss": 0.12773616313934327, + "step": 217670 + }, + { + "epoch": 0.9345457355554984, + "grad_norm": 0.08195490390062332, + "learning_rate": 6.574079663340893e-06, + "loss": 0.29748404026031494, + "step": 217680 + }, + { + "epoch": 0.9345886676455183, + "grad_norm": 0.0030808262526988983, + "learning_rate": 6.569767943223269e-06, + "loss": 0.2741116762161255, + "step": 217690 + }, + { + "epoch": 0.9346315997355383, + "grad_norm": 0.03463468328118324, + "learning_rate": 6.5654562231056455e-06, + "loss": 0.2992996692657471, + "step": 217700 + }, + { + "epoch": 0.9346745318255584, + "grad_norm": 0.057465892285108566, + "learning_rate": 6.561144502988022e-06, + "loss": 0.10853420495986939, + "step": 217710 + }, + { + "epoch": 0.9347174639155783, + "grad_norm": 0.011129839345812798, + "learning_rate": 6.556832782870399e-06, + "loss": 0.13864043951034546, + "step": 217720 + }, + { + "epoch": 0.9347603960055983, + "grad_norm": 0.3891150653362274, + "learning_rate": 6.552521062752776e-06, + "loss": 0.20633718967437745, + "step": 217730 + }, + { + "epoch": 0.9348033280956184, + "grad_norm": 0.05311836674809456, + "learning_rate": 6.5482093426351515e-06, + "loss": 0.2821415185928345, + "step": 217740 + }, + { + "epoch": 0.9348462601856383, + "grad_norm": 1.0979710817337036, + "learning_rate": 6.543897622517528e-06, + "loss": 0.0567243218421936, + "step": 217750 + }, + { + "epoch": 0.9348891922756584, + "grad_norm": 5.553037166595459, + "learning_rate": 6.5395859023999045e-06, + "loss": 0.18742778301239013, + "step": 217760 + }, + { + "epoch": 0.9349321243656784, + "grad_norm": 0.0048981523141264915, + "learning_rate": 6.53527418228228e-06, + "loss": 0.06455135345458984, + "step": 217770 + }, + { + "epoch": 0.9349750564556983, + "grad_norm": 0.0630481168627739, + "learning_rate": 6.530962462164657e-06, + "loss": 0.14230971336364745, + "step": 217780 + }, + { + "epoch": 0.9350179885457184, + "grad_norm": 1.3087913990020752, + "learning_rate": 6.526650742047033e-06, + "loss": 0.24478435516357422, + "step": 217790 + }, + { + "epoch": 0.9350609206357384, + "grad_norm": 3.037684679031372, + "learning_rate": 6.522339021929409e-06, + "loss": 0.23640942573547363, + "step": 217800 + }, + { + "epoch": 0.9351038527257584, + "grad_norm": 0.005896865390241146, + "learning_rate": 6.518027301811785e-06, + "loss": 0.14557818174362183, + "step": 217810 + }, + { + "epoch": 0.9351467848157784, + "grad_norm": 0.0034972967114299536, + "learning_rate": 6.513715581694162e-06, + "loss": 0.27066123485565186, + "step": 217820 + }, + { + "epoch": 0.9351897169057984, + "grad_norm": 0.002648388734087348, + "learning_rate": 6.509403861576538e-06, + "loss": 0.280427360534668, + "step": 217830 + }, + { + "epoch": 0.9352326489958184, + "grad_norm": 2.0864877700805664, + "learning_rate": 6.505092141458914e-06, + "loss": 0.3587048530578613, + "step": 217840 + }, + { + "epoch": 0.9352755810858384, + "grad_norm": 0.04708682373166084, + "learning_rate": 6.5007804213412904e-06, + "loss": 0.07716538310050965, + "step": 217850 + }, + { + "epoch": 0.9353185131758585, + "grad_norm": 0.012469529174268246, + "learning_rate": 6.496468701223667e-06, + "loss": 0.17058430910110473, + "step": 217860 + }, + { + "epoch": 0.9353614452658784, + "grad_norm": 4.872241497039795, + "learning_rate": 6.492156981106043e-06, + "loss": 0.1434084177017212, + "step": 217870 + }, + { + "epoch": 0.9354043773558984, + "grad_norm": 0.02215704135596752, + "learning_rate": 6.487845260988419e-06, + "loss": 0.31536402702331545, + "step": 217880 + }, + { + "epoch": 0.9354473094459185, + "grad_norm": 0.04922785237431526, + "learning_rate": 6.4835335408707956e-06, + "loss": 0.29095261096954345, + "step": 217890 + }, + { + "epoch": 0.9354902415359384, + "grad_norm": 0.4923323094844818, + "learning_rate": 6.479221820753171e-06, + "loss": 0.08270606994628907, + "step": 217900 + }, + { + "epoch": 0.9355331736259584, + "grad_norm": 0.005486680194735527, + "learning_rate": 6.474910100635548e-06, + "loss": 0.14887460470199584, + "step": 217910 + }, + { + "epoch": 0.9355761057159785, + "grad_norm": 3.3651132583618164, + "learning_rate": 6.470598380517924e-06, + "loss": 0.24405162334442138, + "step": 217920 + }, + { + "epoch": 0.9356190378059984, + "grad_norm": 3.6582298278808594, + "learning_rate": 6.466286660400301e-06, + "loss": 0.18124510049819947, + "step": 217930 + }, + { + "epoch": 0.9356619698960185, + "grad_norm": 2.6301400661468506, + "learning_rate": 6.461974940282676e-06, + "loss": 0.24226765632629393, + "step": 217940 + }, + { + "epoch": 0.9357049019860385, + "grad_norm": 3.168323278427124, + "learning_rate": 6.457663220165053e-06, + "loss": 0.13127228021621704, + "step": 217950 + }, + { + "epoch": 0.9357478340760584, + "grad_norm": 0.9777452349662781, + "learning_rate": 6.453351500047429e-06, + "loss": 0.24676942825317383, + "step": 217960 + }, + { + "epoch": 0.9357907661660785, + "grad_norm": 0.0015889370115473866, + "learning_rate": 6.449039779929805e-06, + "loss": 0.16644636392593384, + "step": 217970 + }, + { + "epoch": 0.9358336982560985, + "grad_norm": 0.017067328095436096, + "learning_rate": 6.4447280598121815e-06, + "loss": 0.14611515998840333, + "step": 217980 + }, + { + "epoch": 0.9358766303461185, + "grad_norm": 0.0011988900369033217, + "learning_rate": 6.440416339694558e-06, + "loss": 0.022569629549980163, + "step": 217990 + }, + { + "epoch": 0.9359195624361385, + "grad_norm": 1.1535048484802246, + "learning_rate": 6.436104619576934e-06, + "loss": 0.13392226696014403, + "step": 218000 + }, + { + "epoch": 0.9359195624361385, + "eval_loss": 0.37059035897254944, + "eval_runtime": 27.4273, + "eval_samples_per_second": 3.646, + "eval_steps_per_second": 3.646, + "step": 218000 + }, + { + "epoch": 0.9359624945261585, + "grad_norm": 0.2708398699760437, + "learning_rate": 6.43179289945931e-06, + "loss": 0.11112178564071655, + "step": 218010 + }, + { + "epoch": 0.9360054266161786, + "grad_norm": 0.07791769504547119, + "learning_rate": 6.427481179341687e-06, + "loss": 0.01833338290452957, + "step": 218020 + }, + { + "epoch": 0.9360483587061985, + "grad_norm": 0.03304653614759445, + "learning_rate": 6.423169459224062e-06, + "loss": 0.20075433254241942, + "step": 218030 + }, + { + "epoch": 0.9360912907962186, + "grad_norm": 2.0760416984558105, + "learning_rate": 6.418857739106439e-06, + "loss": 0.3484358549118042, + "step": 218040 + }, + { + "epoch": 0.9361342228862386, + "grad_norm": 0.0009414487867616117, + "learning_rate": 6.414546018988815e-06, + "loss": 0.08126358389854431, + "step": 218050 + }, + { + "epoch": 0.9361771549762585, + "grad_norm": 0.0022511922288686037, + "learning_rate": 6.410234298871192e-06, + "loss": 0.21538751125335692, + "step": 218060 + }, + { + "epoch": 0.9362200870662786, + "grad_norm": 6.507908821105957, + "learning_rate": 6.405922578753567e-06, + "loss": 0.12897765636444092, + "step": 218070 + }, + { + "epoch": 0.9362630191562986, + "grad_norm": 0.005039474926888943, + "learning_rate": 6.401610858635944e-06, + "loss": 0.26041815280914304, + "step": 218080 + }, + { + "epoch": 0.9363059512463185, + "grad_norm": 0.013466509990394115, + "learning_rate": 6.397299138518321e-06, + "loss": 0.1639467239379883, + "step": 218090 + }, + { + "epoch": 0.9363488833363386, + "grad_norm": 0.13806986808776855, + "learning_rate": 6.392987418400698e-06, + "loss": 0.15577703714370728, + "step": 218100 + }, + { + "epoch": 0.9363918154263586, + "grad_norm": 0.0029524166602641344, + "learning_rate": 6.388675698283074e-06, + "loss": 0.2313316822052002, + "step": 218110 + }, + { + "epoch": 0.9364347475163786, + "grad_norm": 0.38946065306663513, + "learning_rate": 6.38436397816545e-06, + "loss": 0.048220235109329226, + "step": 218120 + }, + { + "epoch": 0.9364776796063986, + "grad_norm": 0.1648869812488556, + "learning_rate": 6.380052258047826e-06, + "loss": 0.14529719352722167, + "step": 218130 + }, + { + "epoch": 0.9365206116964186, + "grad_norm": 0.9979525208473206, + "learning_rate": 6.375740537930203e-06, + "loss": 0.3048782587051392, + "step": 218140 + }, + { + "epoch": 0.9365635437864386, + "grad_norm": 16.755258560180664, + "learning_rate": 6.3714288178125785e-06, + "loss": 0.2275531768798828, + "step": 218150 + }, + { + "epoch": 0.9366064758764586, + "grad_norm": 0.02060013823211193, + "learning_rate": 6.367117097694955e-06, + "loss": 0.19351630210876464, + "step": 218160 + }, + { + "epoch": 0.9366494079664787, + "grad_norm": 0.009929434396326542, + "learning_rate": 6.3628053775773315e-06, + "loss": 0.046320590376853946, + "step": 218170 + }, + { + "epoch": 0.9366923400564986, + "grad_norm": 1.319644570350647, + "learning_rate": 6.358493657459708e-06, + "loss": 0.15600812435150146, + "step": 218180 + }, + { + "epoch": 0.9367352721465186, + "grad_norm": 0.17497789859771729, + "learning_rate": 6.354181937342084e-06, + "loss": 0.21659915447235106, + "step": 218190 + }, + { + "epoch": 0.9367782042365387, + "grad_norm": 3.353944778442383, + "learning_rate": 6.34987021722446e-06, + "loss": 0.1580977439880371, + "step": 218200 + }, + { + "epoch": 0.9368211363265586, + "grad_norm": 1.8000859022140503, + "learning_rate": 6.345558497106837e-06, + "loss": 0.15619392395019532, + "step": 218210 + }, + { + "epoch": 0.9368640684165787, + "grad_norm": 0.007903819903731346, + "learning_rate": 6.341246776989212e-06, + "loss": 0.37516734600067136, + "step": 218220 + }, + { + "epoch": 0.9369070005065987, + "grad_norm": 2.224149227142334, + "learning_rate": 6.336935056871589e-06, + "loss": 0.07544257044792176, + "step": 218230 + }, + { + "epoch": 0.9369499325966186, + "grad_norm": 0.026319283992052078, + "learning_rate": 6.332623336753965e-06, + "loss": 0.0577617883682251, + "step": 218240 + }, + { + "epoch": 0.9369928646866387, + "grad_norm": 0.0009843171574175358, + "learning_rate": 6.328311616636341e-06, + "loss": 0.2975840330123901, + "step": 218250 + }, + { + "epoch": 0.9370357967766587, + "grad_norm": 0.189020037651062, + "learning_rate": 6.3239998965187174e-06, + "loss": 0.38624231815338134, + "step": 218260 + }, + { + "epoch": 0.9370787288666786, + "grad_norm": 0.8112937808036804, + "learning_rate": 6.319688176401094e-06, + "loss": 0.2941300392150879, + "step": 218270 + }, + { + "epoch": 0.9371216609566987, + "grad_norm": 1.3181637525558472, + "learning_rate": 6.31537645628347e-06, + "loss": 0.49183197021484376, + "step": 218280 + }, + { + "epoch": 0.9371645930467187, + "grad_norm": 1.5222159624099731, + "learning_rate": 6.311064736165846e-06, + "loss": 0.16584717035293578, + "step": 218290 + }, + { + "epoch": 0.9372075251367387, + "grad_norm": 1.103300929069519, + "learning_rate": 6.306753016048223e-06, + "loss": 0.1373578667640686, + "step": 218300 + }, + { + "epoch": 0.9372504572267587, + "grad_norm": 0.04471690580248833, + "learning_rate": 6.302441295930599e-06, + "loss": 0.3182393789291382, + "step": 218310 + }, + { + "epoch": 0.9372933893167787, + "grad_norm": 0.02034154161810875, + "learning_rate": 6.298129575812975e-06, + "loss": 0.09224039316177368, + "step": 218320 + }, + { + "epoch": 0.9373363214067987, + "grad_norm": 15.507043838500977, + "learning_rate": 6.293817855695351e-06, + "loss": 0.44158592224121096, + "step": 218330 + }, + { + "epoch": 0.9373792534968187, + "grad_norm": 0.013069471344351768, + "learning_rate": 6.289506135577728e-06, + "loss": 0.15449692010879518, + "step": 218340 + }, + { + "epoch": 0.9374221855868388, + "grad_norm": 0.2247493863105774, + "learning_rate": 6.285194415460103e-06, + "loss": 0.2751898288726807, + "step": 218350 + }, + { + "epoch": 0.9374651176768587, + "grad_norm": 0.8723449110984802, + "learning_rate": 6.28088269534248e-06, + "loss": 0.17945202589035034, + "step": 218360 + }, + { + "epoch": 0.9375080497668787, + "grad_norm": 0.0049847024492919445, + "learning_rate": 6.276570975224856e-06, + "loss": 0.1884321689605713, + "step": 218370 + }, + { + "epoch": 0.9375509818568988, + "grad_norm": 0.017331691458821297, + "learning_rate": 6.272259255107232e-06, + "loss": 0.029934722185134887, + "step": 218380 + }, + { + "epoch": 0.9375939139469187, + "grad_norm": 3.178102970123291, + "learning_rate": 6.2679475349896085e-06, + "loss": 0.1653683066368103, + "step": 218390 + }, + { + "epoch": 0.9376368460369388, + "grad_norm": 0.003745247842743993, + "learning_rate": 6.263635814871985e-06, + "loss": 0.14961315393447877, + "step": 218400 + }, + { + "epoch": 0.9376797781269588, + "grad_norm": 0.30254513025283813, + "learning_rate": 6.259324094754361e-06, + "loss": 0.11315090656280517, + "step": 218410 + }, + { + "epoch": 0.9377227102169787, + "grad_norm": 0.0023749074898660183, + "learning_rate": 6.255012374636737e-06, + "loss": 0.2886801242828369, + "step": 218420 + }, + { + "epoch": 0.9377656423069988, + "grad_norm": 0.0015023527666926384, + "learning_rate": 6.250700654519114e-06, + "loss": 0.17715065479278563, + "step": 218430 + }, + { + "epoch": 0.9378085743970188, + "grad_norm": 0.017936963587999344, + "learning_rate": 6.24638893440149e-06, + "loss": 0.32670762538909914, + "step": 218440 + }, + { + "epoch": 0.9378515064870389, + "grad_norm": 2.004991292953491, + "learning_rate": 6.242077214283867e-06, + "loss": 0.2062148094177246, + "step": 218450 + }, + { + "epoch": 0.9378944385770588, + "grad_norm": 0.18925458192825317, + "learning_rate": 6.237765494166243e-06, + "loss": 0.08423059582710266, + "step": 218460 + }, + { + "epoch": 0.9379373706670788, + "grad_norm": 6.76282262802124, + "learning_rate": 6.233453774048619e-06, + "loss": 0.1460658073425293, + "step": 218470 + }, + { + "epoch": 0.9379803027570989, + "grad_norm": 1.7375794649124146, + "learning_rate": 6.229142053930995e-06, + "loss": 0.2405299186706543, + "step": 218480 + }, + { + "epoch": 0.9380232348471188, + "grad_norm": 0.9361600279808044, + "learning_rate": 6.224830333813372e-06, + "loss": 0.31464076042175293, + "step": 218490 + }, + { + "epoch": 0.9380661669371388, + "grad_norm": 1.7604894638061523, + "learning_rate": 6.220518613695748e-06, + "loss": 0.16823394298553468, + "step": 218500 + }, + { + "epoch": 0.9381090990271589, + "grad_norm": 0.03205284848809242, + "learning_rate": 6.216206893578124e-06, + "loss": 0.17398911714553833, + "step": 218510 + }, + { + "epoch": 0.9381520311171788, + "grad_norm": 0.051615796983242035, + "learning_rate": 6.2118951734605e-06, + "loss": 0.1916268587112427, + "step": 218520 + }, + { + "epoch": 0.9381949632071989, + "grad_norm": 1.9579862356185913, + "learning_rate": 6.207583453342877e-06, + "loss": 0.16121180057525636, + "step": 218530 + }, + { + "epoch": 0.9382378952972189, + "grad_norm": 1.5483946800231934, + "learning_rate": 6.203271733225253e-06, + "loss": 0.06491850018501281, + "step": 218540 + }, + { + "epoch": 0.9382808273872388, + "grad_norm": 0.05101202428340912, + "learning_rate": 6.19896001310763e-06, + "loss": 0.08455324172973633, + "step": 218550 + }, + { + "epoch": 0.9383237594772589, + "grad_norm": 0.040585048496723175, + "learning_rate": 6.194648292990006e-06, + "loss": 0.1799705743789673, + "step": 218560 + }, + { + "epoch": 0.9383666915672789, + "grad_norm": 0.016378004103899002, + "learning_rate": 6.190336572872382e-06, + "loss": 0.134379780292511, + "step": 218570 + }, + { + "epoch": 0.9384096236572989, + "grad_norm": 3.475637674331665, + "learning_rate": 6.1860248527547585e-06, + "loss": 0.15294344425201417, + "step": 218580 + }, + { + "epoch": 0.9384525557473189, + "grad_norm": 0.05142517387866974, + "learning_rate": 6.181713132637135e-06, + "loss": 0.13303122520446778, + "step": 218590 + }, + { + "epoch": 0.9384954878373389, + "grad_norm": 1.9796829223632812, + "learning_rate": 6.177401412519511e-06, + "loss": 0.3248765468597412, + "step": 218600 + }, + { + "epoch": 0.9385384199273589, + "grad_norm": 0.013521007262170315, + "learning_rate": 6.173089692401887e-06, + "loss": 0.24125297069549562, + "step": 218610 + }, + { + "epoch": 0.9385813520173789, + "grad_norm": 0.02028888463973999, + "learning_rate": 6.168777972284264e-06, + "loss": 0.10118316411972046, + "step": 218620 + }, + { + "epoch": 0.938624284107399, + "grad_norm": 4.58914852142334, + "learning_rate": 6.164466252166639e-06, + "loss": 0.1950531005859375, + "step": 218630 + }, + { + "epoch": 0.9386672161974189, + "grad_norm": 1.3460999727249146, + "learning_rate": 6.160154532049016e-06, + "loss": 0.19600038528442382, + "step": 218640 + }, + { + "epoch": 0.9387101482874389, + "grad_norm": 0.016167912632226944, + "learning_rate": 6.155842811931392e-06, + "loss": 0.0983378529548645, + "step": 218650 + }, + { + "epoch": 0.938753080377459, + "grad_norm": 1.4347484111785889, + "learning_rate": 6.151531091813768e-06, + "loss": 0.3592548370361328, + "step": 218660 + }, + { + "epoch": 0.9387960124674789, + "grad_norm": 9.225719451904297, + "learning_rate": 6.1472193716961445e-06, + "loss": 0.2684290409088135, + "step": 218670 + }, + { + "epoch": 0.9388389445574989, + "grad_norm": 0.01551085989922285, + "learning_rate": 6.142907651578521e-06, + "loss": 0.17376840114593506, + "step": 218680 + }, + { + "epoch": 0.938881876647519, + "grad_norm": 0.1393050104379654, + "learning_rate": 6.1385959314608975e-06, + "loss": 0.3194669246673584, + "step": 218690 + }, + { + "epoch": 0.9389248087375389, + "grad_norm": 3.2130279541015625, + "learning_rate": 6.134284211343273e-06, + "loss": 0.3998244047164917, + "step": 218700 + }, + { + "epoch": 0.938967740827559, + "grad_norm": 14.124472618103027, + "learning_rate": 6.12997249122565e-06, + "loss": 0.11968344449996948, + "step": 218710 + }, + { + "epoch": 0.939010672917579, + "grad_norm": 0.0034347360488027334, + "learning_rate": 6.125660771108026e-06, + "loss": 0.1548219919204712, + "step": 218720 + }, + { + "epoch": 0.9390536050075989, + "grad_norm": 3.031308174133301, + "learning_rate": 6.121349050990403e-06, + "loss": 0.2987856388092041, + "step": 218730 + }, + { + "epoch": 0.939096537097619, + "grad_norm": 0.6183398365974426, + "learning_rate": 6.117037330872779e-06, + "loss": 0.12947933673858641, + "step": 218740 + }, + { + "epoch": 0.939139469187639, + "grad_norm": 1.35626220703125, + "learning_rate": 6.112725610755156e-06, + "loss": 0.2367931127548218, + "step": 218750 + }, + { + "epoch": 0.939182401277659, + "grad_norm": 0.009454138576984406, + "learning_rate": 6.108413890637531e-06, + "loss": 0.2076733112335205, + "step": 218760 + }, + { + "epoch": 0.939225333367679, + "grad_norm": 0.2160428762435913, + "learning_rate": 6.104102170519908e-06, + "loss": 0.010231452435255051, + "step": 218770 + }, + { + "epoch": 0.939268265457699, + "grad_norm": 2.0446693897247314, + "learning_rate": 6.099790450402284e-06, + "loss": 0.34381661415100095, + "step": 218780 + }, + { + "epoch": 0.939311197547719, + "grad_norm": 1.378746509552002, + "learning_rate": 6.09547873028466e-06, + "loss": 0.18798928260803222, + "step": 218790 + }, + { + "epoch": 0.939354129637739, + "grad_norm": 1.4301226139068604, + "learning_rate": 6.091167010167036e-06, + "loss": 0.0660437822341919, + "step": 218800 + }, + { + "epoch": 0.939397061727759, + "grad_norm": 0.016804933547973633, + "learning_rate": 6.086855290049413e-06, + "loss": 0.3777721643447876, + "step": 218810 + }, + { + "epoch": 0.939439993817779, + "grad_norm": 1.1796166896820068, + "learning_rate": 6.0825435699317885e-06, + "loss": 0.1267245888710022, + "step": 218820 + }, + { + "epoch": 0.939482925907799, + "grad_norm": 0.44042861461639404, + "learning_rate": 6.078231849814165e-06, + "loss": 0.1943804383277893, + "step": 218830 + }, + { + "epoch": 0.9395258579978191, + "grad_norm": 1.6625487804412842, + "learning_rate": 6.0739201296965415e-06, + "loss": 0.25074880123138427, + "step": 218840 + }, + { + "epoch": 0.939568790087839, + "grad_norm": 0.15306060016155243, + "learning_rate": 6.069608409578917e-06, + "loss": 0.41699953079223634, + "step": 218850 + }, + { + "epoch": 0.939611722177859, + "grad_norm": 1.8667495250701904, + "learning_rate": 6.065296689461294e-06, + "loss": 0.3992176532745361, + "step": 218860 + }, + { + "epoch": 0.9396546542678791, + "grad_norm": 0.0020866908598691225, + "learning_rate": 6.06098496934367e-06, + "loss": 0.14581599235534667, + "step": 218870 + }, + { + "epoch": 0.9396975863578991, + "grad_norm": 0.02238316461443901, + "learning_rate": 6.056673249226047e-06, + "loss": 0.33009798526763917, + "step": 218880 + }, + { + "epoch": 0.9397405184479191, + "grad_norm": 2.0602495670318604, + "learning_rate": 6.052361529108422e-06, + "loss": 0.12349958419799804, + "step": 218890 + }, + { + "epoch": 0.9397834505379391, + "grad_norm": 0.0704239159822464, + "learning_rate": 6.048049808990799e-06, + "loss": 0.2625364542007446, + "step": 218900 + }, + { + "epoch": 0.9398263826279591, + "grad_norm": 0.006151742767542601, + "learning_rate": 6.043738088873175e-06, + "loss": 0.28477323055267334, + "step": 218910 + }, + { + "epoch": 0.9398693147179791, + "grad_norm": 1.7099026441574097, + "learning_rate": 6.039426368755552e-06, + "loss": 0.17000820636749267, + "step": 218920 + }, + { + "epoch": 0.9399122468079991, + "grad_norm": 0.0685584619641304, + "learning_rate": 6.035114648637928e-06, + "loss": 0.2855337142944336, + "step": 218930 + }, + { + "epoch": 0.9399551788980192, + "grad_norm": 0.1191517561674118, + "learning_rate": 6.030802928520305e-06, + "loss": 0.13334267139434813, + "step": 218940 + }, + { + "epoch": 0.9399981109880391, + "grad_norm": 0.5394452214241028, + "learning_rate": 6.02649120840268e-06, + "loss": 0.3404886722564697, + "step": 218950 + }, + { + "epoch": 0.9400410430780591, + "grad_norm": 0.04119795933365822, + "learning_rate": 6.022179488285057e-06, + "loss": 0.044364386796951295, + "step": 218960 + }, + { + "epoch": 0.9400839751680792, + "grad_norm": 0.1636960208415985, + "learning_rate": 6.017867768167433e-06, + "loss": 0.1932442307472229, + "step": 218970 + }, + { + "epoch": 0.9401269072580991, + "grad_norm": 0.033029649406671524, + "learning_rate": 6.013556048049809e-06, + "loss": 0.3541710376739502, + "step": 218980 + }, + { + "epoch": 0.9401698393481192, + "grad_norm": 0.0074260905385017395, + "learning_rate": 6.0092443279321856e-06, + "loss": 0.07902343273162842, + "step": 218990 + }, + { + "epoch": 0.9402127714381392, + "grad_norm": 5.728902816772461, + "learning_rate": 6.004932607814562e-06, + "loss": 0.2292637825012207, + "step": 219000 + }, + { + "epoch": 0.9402127714381392, + "eval_loss": 0.3688412010669708, + "eval_runtime": 27.5737, + "eval_samples_per_second": 3.627, + "eval_steps_per_second": 3.627, + "step": 219000 + }, + { + "epoch": 0.9402557035281591, + "grad_norm": 1.7974296808242798, + "learning_rate": 6.000620887696938e-06, + "loss": 0.25763325691223143, + "step": 219010 + }, + { + "epoch": 0.9402986356181792, + "grad_norm": 0.029638810083270073, + "learning_rate": 5.996309167579314e-06, + "loss": 0.15889897346496581, + "step": 219020 + }, + { + "epoch": 0.9403415677081992, + "grad_norm": 0.028355387970805168, + "learning_rate": 5.991997447461691e-06, + "loss": 0.19087129831314087, + "step": 219030 + }, + { + "epoch": 0.9403844997982191, + "grad_norm": 0.08307473361492157, + "learning_rate": 5.987685727344066e-06, + "loss": 0.11460771560668945, + "step": 219040 + }, + { + "epoch": 0.9404274318882392, + "grad_norm": 0.028220640495419502, + "learning_rate": 5.983374007226443e-06, + "loss": 0.14903123378753663, + "step": 219050 + }, + { + "epoch": 0.9404703639782592, + "grad_norm": 0.7593523859977722, + "learning_rate": 5.979062287108819e-06, + "loss": 0.23282818794250487, + "step": 219060 + }, + { + "epoch": 0.9405132960682792, + "grad_norm": 0.014977425336837769, + "learning_rate": 5.974750566991196e-06, + "loss": 0.04835158586502075, + "step": 219070 + }, + { + "epoch": 0.9405562281582992, + "grad_norm": 0.0452410951256752, + "learning_rate": 5.9704388468735715e-06, + "loss": 0.15343317985534669, + "step": 219080 + }, + { + "epoch": 0.9405991602483192, + "grad_norm": 2.069753408432007, + "learning_rate": 5.966127126755949e-06, + "loss": 0.11665205955505371, + "step": 219090 + }, + { + "epoch": 0.9406420923383392, + "grad_norm": 0.006946403067559004, + "learning_rate": 5.9618154066383245e-06, + "loss": 0.1817198634147644, + "step": 219100 + }, + { + "epoch": 0.9406850244283592, + "grad_norm": 0.010719393379986286, + "learning_rate": 5.957503686520701e-06, + "loss": 0.22847046852111816, + "step": 219110 + }, + { + "epoch": 0.9407279565183793, + "grad_norm": 0.008265381678938866, + "learning_rate": 5.9531919664030775e-06, + "loss": 0.3102251052856445, + "step": 219120 + }, + { + "epoch": 0.9407708886083992, + "grad_norm": 1.2220699787139893, + "learning_rate": 5.948880246285454e-06, + "loss": 0.12826883792877197, + "step": 219130 + }, + { + "epoch": 0.9408138206984192, + "grad_norm": 7.854069232940674, + "learning_rate": 5.94456852616783e-06, + "loss": 0.25547878742218016, + "step": 219140 + }, + { + "epoch": 0.9408567527884393, + "grad_norm": 0.1460963934659958, + "learning_rate": 5.940256806050206e-06, + "loss": 0.10857096910476685, + "step": 219150 + }, + { + "epoch": 0.9408996848784592, + "grad_norm": 0.013315374962985516, + "learning_rate": 5.935945085932583e-06, + "loss": 0.23141663074493407, + "step": 219160 + }, + { + "epoch": 0.9409426169684793, + "grad_norm": 0.04174807667732239, + "learning_rate": 5.931633365814958e-06, + "loss": 0.15095480680465698, + "step": 219170 + }, + { + "epoch": 0.9409855490584993, + "grad_norm": 2.723499059677124, + "learning_rate": 5.927321645697335e-06, + "loss": 0.0929310142993927, + "step": 219180 + }, + { + "epoch": 0.9410284811485192, + "grad_norm": 0.0006184586673043668, + "learning_rate": 5.923009925579711e-06, + "loss": 0.15547598600387574, + "step": 219190 + }, + { + "epoch": 0.9410714132385393, + "grad_norm": 0.8751519322395325, + "learning_rate": 5.918698205462087e-06, + "loss": 0.4736207962036133, + "step": 219200 + }, + { + "epoch": 0.9411143453285593, + "grad_norm": 0.007214980665594339, + "learning_rate": 5.914386485344463e-06, + "loss": 0.09388966560363769, + "step": 219210 + }, + { + "epoch": 0.9411572774185792, + "grad_norm": 0.007233398500829935, + "learning_rate": 5.91007476522684e-06, + "loss": 0.1745733141899109, + "step": 219220 + }, + { + "epoch": 0.9412002095085993, + "grad_norm": 2.5610218048095703, + "learning_rate": 5.9057630451092155e-06, + "loss": 0.28267683982849123, + "step": 219230 + }, + { + "epoch": 0.9412431415986193, + "grad_norm": 0.031056322157382965, + "learning_rate": 5.901451324991592e-06, + "loss": 0.09716549515724182, + "step": 219240 + }, + { + "epoch": 0.9412860736886393, + "grad_norm": 0.07894471287727356, + "learning_rate": 5.8971396048739685e-06, + "loss": 0.02060786187648773, + "step": 219250 + }, + { + "epoch": 0.9413290057786593, + "grad_norm": 0.019434994086623192, + "learning_rate": 5.892827884756345e-06, + "loss": 0.10689753293991089, + "step": 219260 + }, + { + "epoch": 0.9413719378686793, + "grad_norm": 0.003029686165973544, + "learning_rate": 5.8885161646387215e-06, + "loss": 0.19583780765533448, + "step": 219270 + }, + { + "epoch": 0.9414148699586993, + "grad_norm": 0.08387192338705063, + "learning_rate": 5.884204444521098e-06, + "loss": 0.0790525197982788, + "step": 219280 + }, + { + "epoch": 0.9414578020487193, + "grad_norm": 1.89656662940979, + "learning_rate": 5.879892724403474e-06, + "loss": 0.19452766180038453, + "step": 219290 + }, + { + "epoch": 0.9415007341387394, + "grad_norm": 0.09110067039728165, + "learning_rate": 5.87558100428585e-06, + "loss": 0.19205337762832642, + "step": 219300 + }, + { + "epoch": 0.9415436662287594, + "grad_norm": 0.6194562911987305, + "learning_rate": 5.871269284168227e-06, + "loss": 0.1736610174179077, + "step": 219310 + }, + { + "epoch": 0.9415865983187793, + "grad_norm": 0.30323392152786255, + "learning_rate": 5.866957564050603e-06, + "loss": 0.2051142454147339, + "step": 219320 + }, + { + "epoch": 0.9416295304087994, + "grad_norm": 0.006133268587291241, + "learning_rate": 5.862645843932979e-06, + "loss": 0.24879255294799804, + "step": 219330 + }, + { + "epoch": 0.9416724624988194, + "grad_norm": 0.033949390053749084, + "learning_rate": 5.858334123815355e-06, + "loss": 0.17589467763900757, + "step": 219340 + }, + { + "epoch": 0.9417153945888393, + "grad_norm": 0.23580507934093475, + "learning_rate": 5.854022403697732e-06, + "loss": 0.1860203981399536, + "step": 219350 + }, + { + "epoch": 0.9417583266788594, + "grad_norm": 0.953391969203949, + "learning_rate": 5.8497106835801074e-06, + "loss": 0.31667141914367675, + "step": 219360 + }, + { + "epoch": 0.9418012587688794, + "grad_norm": 0.9369455575942993, + "learning_rate": 5.845398963462484e-06, + "loss": 0.08720980882644654, + "step": 219370 + }, + { + "epoch": 0.9418441908588994, + "grad_norm": 0.8098713755607605, + "learning_rate": 5.8410872433448604e-06, + "loss": 0.23913743495941162, + "step": 219380 + }, + { + "epoch": 0.9418871229489194, + "grad_norm": 1.5632972717285156, + "learning_rate": 5.836775523227236e-06, + "loss": 0.24591350555419922, + "step": 219390 + }, + { + "epoch": 0.9419300550389395, + "grad_norm": 0.00022845991770736873, + "learning_rate": 5.8324638031096126e-06, + "loss": 0.11978145837783813, + "step": 219400 + }, + { + "epoch": 0.9419729871289594, + "grad_norm": 0.6672672629356384, + "learning_rate": 5.828152082991989e-06, + "loss": 0.11547610759735108, + "step": 219410 + }, + { + "epoch": 0.9420159192189794, + "grad_norm": 0.001994553254917264, + "learning_rate": 5.823840362874365e-06, + "loss": 0.2555203914642334, + "step": 219420 + }, + { + "epoch": 0.9420588513089995, + "grad_norm": 0.18988437950611115, + "learning_rate": 5.819528642756741e-06, + "loss": 0.026738014817237855, + "step": 219430 + }, + { + "epoch": 0.9421017833990194, + "grad_norm": 1.5366977453231812, + "learning_rate": 5.815216922639118e-06, + "loss": 0.2509934425354004, + "step": 219440 + }, + { + "epoch": 0.9421447154890394, + "grad_norm": 0.04574510082602501, + "learning_rate": 5.810905202521494e-06, + "loss": 0.04108910858631134, + "step": 219450 + }, + { + "epoch": 0.9421876475790595, + "grad_norm": 0.04924973472952843, + "learning_rate": 5.806593482403871e-06, + "loss": 0.4201669216156006, + "step": 219460 + }, + { + "epoch": 0.9422305796690794, + "grad_norm": 0.005704942625015974, + "learning_rate": 5.802281762286247e-06, + "loss": 0.1917565107345581, + "step": 219470 + }, + { + "epoch": 0.9422735117590995, + "grad_norm": 0.005896027199923992, + "learning_rate": 5.797970042168623e-06, + "loss": 0.154584801197052, + "step": 219480 + }, + { + "epoch": 0.9423164438491195, + "grad_norm": 0.0035282974131405354, + "learning_rate": 5.793658322050999e-06, + "loss": 0.17236467599868774, + "step": 219490 + }, + { + "epoch": 0.9423593759391394, + "grad_norm": 2.3953702449798584, + "learning_rate": 5.789346601933376e-06, + "loss": 0.4158484935760498, + "step": 219500 + }, + { + "epoch": 0.9424023080291595, + "grad_norm": 0.0037427491042762995, + "learning_rate": 5.785034881815752e-06, + "loss": 0.14870126247406007, + "step": 219510 + }, + { + "epoch": 0.9424452401191795, + "grad_norm": 0.0012468050699681044, + "learning_rate": 5.780723161698128e-06, + "loss": 0.11983139514923095, + "step": 219520 + }, + { + "epoch": 0.9424881722091994, + "grad_norm": 0.008860177360475063, + "learning_rate": 5.7764114415805045e-06, + "loss": 0.2827944278717041, + "step": 219530 + }, + { + "epoch": 0.9425311042992195, + "grad_norm": 0.1442103087902069, + "learning_rate": 5.772099721462881e-06, + "loss": 0.3095766305923462, + "step": 219540 + }, + { + "epoch": 0.9425740363892395, + "grad_norm": 6.474790096282959, + "learning_rate": 5.767788001345257e-06, + "loss": 0.34821128845214844, + "step": 219550 + }, + { + "epoch": 0.9426169684792595, + "grad_norm": 0.05650155246257782, + "learning_rate": 5.763476281227633e-06, + "loss": 0.08767632246017457, + "step": 219560 + }, + { + "epoch": 0.9426599005692795, + "grad_norm": 0.001218812307342887, + "learning_rate": 5.75916456111001e-06, + "loss": 0.26210243701934816, + "step": 219570 + }, + { + "epoch": 0.9427028326592995, + "grad_norm": 1.0307174921035767, + "learning_rate": 5.754852840992385e-06, + "loss": 0.20877382755279542, + "step": 219580 + }, + { + "epoch": 0.9427457647493195, + "grad_norm": 1.477104663848877, + "learning_rate": 5.750541120874762e-06, + "loss": 0.1944859504699707, + "step": 219590 + }, + { + "epoch": 0.9427886968393395, + "grad_norm": 0.008503386750817299, + "learning_rate": 5.746229400757138e-06, + "loss": 0.1722312092781067, + "step": 219600 + }, + { + "epoch": 0.9428316289293596, + "grad_norm": 0.2559939920902252, + "learning_rate": 5.741917680639514e-06, + "loss": 0.2364104986190796, + "step": 219610 + }, + { + "epoch": 0.9428745610193795, + "grad_norm": 1.0935306549072266, + "learning_rate": 5.73760596052189e-06, + "loss": 0.18122740983963012, + "step": 219620 + }, + { + "epoch": 0.9429174931093995, + "grad_norm": 0.06634160131216049, + "learning_rate": 5.733294240404268e-06, + "loss": 0.03658463954925537, + "step": 219630 + }, + { + "epoch": 0.9429604251994196, + "grad_norm": 0.009046138264238834, + "learning_rate": 5.728982520286643e-06, + "loss": 0.09423674941062928, + "step": 219640 + }, + { + "epoch": 0.9430033572894395, + "grad_norm": 1.5938408374786377, + "learning_rate": 5.72467080016902e-06, + "loss": 0.16401240825653077, + "step": 219650 + }, + { + "epoch": 0.9430462893794596, + "grad_norm": 0.20236362516880035, + "learning_rate": 5.720359080051396e-06, + "loss": 0.2952629327774048, + "step": 219660 + }, + { + "epoch": 0.9430892214694796, + "grad_norm": 0.6822418570518494, + "learning_rate": 5.716047359933772e-06, + "loss": 0.19778774976730346, + "step": 219670 + }, + { + "epoch": 0.9431321535594995, + "grad_norm": 0.00200895918533206, + "learning_rate": 5.7117356398161485e-06, + "loss": 0.10837646722793579, + "step": 219680 + }, + { + "epoch": 0.9431750856495196, + "grad_norm": 0.0010093733435496688, + "learning_rate": 5.707423919698525e-06, + "loss": 0.16900849342346191, + "step": 219690 + }, + { + "epoch": 0.9432180177395396, + "grad_norm": 1.047550916671753, + "learning_rate": 5.7031121995809015e-06, + "loss": 0.16191253662109376, + "step": 219700 + }, + { + "epoch": 0.9432609498295595, + "grad_norm": 1.4691581726074219, + "learning_rate": 5.698800479463277e-06, + "loss": 0.27159914970397947, + "step": 219710 + }, + { + "epoch": 0.9433038819195796, + "grad_norm": 0.006137867458164692, + "learning_rate": 5.694488759345654e-06, + "loss": 0.12011933326721191, + "step": 219720 + }, + { + "epoch": 0.9433468140095996, + "grad_norm": 3.10224986076355, + "learning_rate": 5.69017703922803e-06, + "loss": 0.12906384468078613, + "step": 219730 + }, + { + "epoch": 0.9433897460996197, + "grad_norm": 1.1125973463058472, + "learning_rate": 5.685865319110406e-06, + "loss": 0.11585037708282471, + "step": 219740 + }, + { + "epoch": 0.9434326781896396, + "grad_norm": 1.7985775470733643, + "learning_rate": 5.681553598992782e-06, + "loss": 0.15065838098526002, + "step": 219750 + }, + { + "epoch": 0.9434756102796596, + "grad_norm": 1.7841936349868774, + "learning_rate": 5.677241878875159e-06, + "loss": 0.29679064750671386, + "step": 219760 + }, + { + "epoch": 0.9435185423696797, + "grad_norm": 0.0036705408710986376, + "learning_rate": 5.6729301587575344e-06, + "loss": 0.08027640581130982, + "step": 219770 + }, + { + "epoch": 0.9435614744596996, + "grad_norm": 0.0015720472438260913, + "learning_rate": 5.668618438639911e-06, + "loss": 0.13819996118545533, + "step": 219780 + }, + { + "epoch": 0.9436044065497197, + "grad_norm": 0.1628006100654602, + "learning_rate": 5.6643067185222874e-06, + "loss": 0.13903387784957885, + "step": 219790 + }, + { + "epoch": 0.9436473386397397, + "grad_norm": 0.23969070613384247, + "learning_rate": 5.659994998404664e-06, + "loss": 0.07040133476257324, + "step": 219800 + }, + { + "epoch": 0.9436902707297596, + "grad_norm": 5.787487030029297, + "learning_rate": 5.6556832782870404e-06, + "loss": 0.36126892566680907, + "step": 219810 + }, + { + "epoch": 0.9437332028197797, + "grad_norm": 0.8496485352516174, + "learning_rate": 5.651371558169417e-06, + "loss": 0.3064922332763672, + "step": 219820 + }, + { + "epoch": 0.9437761349097997, + "grad_norm": 2.6420037746429443, + "learning_rate": 5.6470598380517926e-06, + "loss": 0.23954577445983888, + "step": 219830 + }, + { + "epoch": 0.9438190669998197, + "grad_norm": 1.964925765991211, + "learning_rate": 5.642748117934169e-06, + "loss": 0.47455859184265137, + "step": 219840 + }, + { + "epoch": 0.9438619990898397, + "grad_norm": 0.029651742428541183, + "learning_rate": 5.6384363978165456e-06, + "loss": 0.178091561794281, + "step": 219850 + }, + { + "epoch": 0.9439049311798597, + "grad_norm": 0.015751611441373825, + "learning_rate": 5.634124677698921e-06, + "loss": 0.17995089292526245, + "step": 219860 + }, + { + "epoch": 0.9439478632698797, + "grad_norm": 0.011860656552016735, + "learning_rate": 5.629812957581298e-06, + "loss": 0.15687016248703003, + "step": 219870 + }, + { + "epoch": 0.9439907953598997, + "grad_norm": 0.003596897004172206, + "learning_rate": 5.625501237463674e-06, + "loss": 0.04495824873447418, + "step": 219880 + }, + { + "epoch": 0.9440337274499198, + "grad_norm": 0.0010537290945649147, + "learning_rate": 5.621189517346051e-06, + "loss": 0.1898065447807312, + "step": 219890 + }, + { + "epoch": 0.9440766595399397, + "grad_norm": 4.584033012390137, + "learning_rate": 5.616877797228426e-06, + "loss": 0.2350625991821289, + "step": 219900 + }, + { + "epoch": 0.9441195916299597, + "grad_norm": 0.002993101254105568, + "learning_rate": 5.612566077110803e-06, + "loss": 0.1021914005279541, + "step": 219910 + }, + { + "epoch": 0.9441625237199798, + "grad_norm": 0.31038209795951843, + "learning_rate": 5.608254356993179e-06, + "loss": 0.06648457646369935, + "step": 219920 + }, + { + "epoch": 0.9442054558099997, + "grad_norm": 0.01189296692609787, + "learning_rate": 5.603942636875555e-06, + "loss": 0.05628054738044739, + "step": 219930 + }, + { + "epoch": 0.9442483879000197, + "grad_norm": 0.018562033772468567, + "learning_rate": 5.5996309167579315e-06, + "loss": 0.37505056858062746, + "step": 219940 + }, + { + "epoch": 0.9442913199900398, + "grad_norm": 0.02130456268787384, + "learning_rate": 5.595319196640308e-06, + "loss": 0.24764158725738525, + "step": 219950 + }, + { + "epoch": 0.9443342520800597, + "grad_norm": 0.0013278445694595575, + "learning_rate": 5.591007476522684e-06, + "loss": 0.061684155464172365, + "step": 219960 + }, + { + "epoch": 0.9443771841700798, + "grad_norm": 0.012180428020656109, + "learning_rate": 5.58669575640506e-06, + "loss": 0.03911663293838501, + "step": 219970 + }, + { + "epoch": 0.9444201162600998, + "grad_norm": 0.10410662740468979, + "learning_rate": 5.582384036287437e-06, + "loss": 0.24889121055603028, + "step": 219980 + }, + { + "epoch": 0.9444630483501197, + "grad_norm": 2.865203380584717, + "learning_rate": 5.578072316169813e-06, + "loss": 0.5030947685241699, + "step": 219990 + }, + { + "epoch": 0.9445059804401398, + "grad_norm": 9.011594772338867, + "learning_rate": 5.57376059605219e-06, + "loss": 0.14479026794433594, + "step": 220000 + }, + { + "epoch": 0.9445059804401398, + "eval_loss": 0.36936265230178833, + "eval_runtime": 27.4733, + "eval_samples_per_second": 3.64, + "eval_steps_per_second": 3.64, + "step": 220000 + }, + { + "epoch": 0.9445489125301598, + "grad_norm": 0.02283742092549801, + "learning_rate": 5.569448875934566e-06, + "loss": 0.2979933977127075, + "step": 220010 + }, + { + "epoch": 0.9445918446201798, + "grad_norm": 0.8922110795974731, + "learning_rate": 5.565137155816942e-06, + "loss": 0.21819326877593995, + "step": 220020 + }, + { + "epoch": 0.9446347767101998, + "grad_norm": 0.0026191927026957273, + "learning_rate": 5.560825435699318e-06, + "loss": 0.12958024740219115, + "step": 220030 + }, + { + "epoch": 0.9446777088002198, + "grad_norm": 0.0004326049529481679, + "learning_rate": 5.556513715581695e-06, + "loss": 0.13836138248443602, + "step": 220040 + }, + { + "epoch": 0.9447206408902398, + "grad_norm": 1.2779291868209839, + "learning_rate": 5.55220199546407e-06, + "loss": 0.4020243167877197, + "step": 220050 + }, + { + "epoch": 0.9447635729802598, + "grad_norm": 0.005593777634203434, + "learning_rate": 5.547890275346447e-06, + "loss": 0.11673593521118164, + "step": 220060 + }, + { + "epoch": 0.9448065050702799, + "grad_norm": 0.13254858553409576, + "learning_rate": 5.543578555228823e-06, + "loss": 0.15964832305908203, + "step": 220070 + }, + { + "epoch": 0.9448494371602998, + "grad_norm": 4.318567752838135, + "learning_rate": 5.5392668351112e-06, + "loss": 0.2629995584487915, + "step": 220080 + }, + { + "epoch": 0.9448923692503198, + "grad_norm": 0.009927893988788128, + "learning_rate": 5.5349551149935755e-06, + "loss": 0.2225257158279419, + "step": 220090 + }, + { + "epoch": 0.9449353013403399, + "grad_norm": 0.00321480305865407, + "learning_rate": 5.530643394875952e-06, + "loss": 0.13898024559020997, + "step": 220100 + }, + { + "epoch": 0.9449782334303598, + "grad_norm": 5.857727527618408, + "learning_rate": 5.5263316747583285e-06, + "loss": 0.08403302431106567, + "step": 220110 + }, + { + "epoch": 0.9450211655203798, + "grad_norm": 1.4161524772644043, + "learning_rate": 5.522019954640704e-06, + "loss": 0.23849353790283204, + "step": 220120 + }, + { + "epoch": 0.9450640976103999, + "grad_norm": 0.0011095688678324223, + "learning_rate": 5.517708234523081e-06, + "loss": 0.18472713232040405, + "step": 220130 + }, + { + "epoch": 0.9451070297004198, + "grad_norm": 0.8806191682815552, + "learning_rate": 5.513396514405457e-06, + "loss": 0.18616429567337037, + "step": 220140 + }, + { + "epoch": 0.9451499617904399, + "grad_norm": 2.297152042388916, + "learning_rate": 5.509084794287833e-06, + "loss": 0.3651312828063965, + "step": 220150 + }, + { + "epoch": 0.9451928938804599, + "grad_norm": 0.0011765279341489077, + "learning_rate": 5.504773074170209e-06, + "loss": 0.31837148666381837, + "step": 220160 + }, + { + "epoch": 0.94523582597048, + "grad_norm": 0.9704437255859375, + "learning_rate": 5.500461354052586e-06, + "loss": 0.1830833673477173, + "step": 220170 + }, + { + "epoch": 0.9452787580604999, + "grad_norm": 1.3971502780914307, + "learning_rate": 5.496149633934962e-06, + "loss": 0.3164043664932251, + "step": 220180 + }, + { + "epoch": 0.9453216901505199, + "grad_norm": 6.114679336547852, + "learning_rate": 5.491837913817339e-06, + "loss": 0.0872846245765686, + "step": 220190 + }, + { + "epoch": 0.94536462224054, + "grad_norm": 0.43857166171073914, + "learning_rate": 5.487526193699715e-06, + "loss": 0.28688344955444334, + "step": 220200 + }, + { + "epoch": 0.9454075543305599, + "grad_norm": 0.9512658715248108, + "learning_rate": 5.483214473582091e-06, + "loss": 0.34722816944122314, + "step": 220210 + }, + { + "epoch": 0.9454504864205799, + "grad_norm": 0.20257291197776794, + "learning_rate": 5.4789027534644674e-06, + "loss": 0.22685813903808594, + "step": 220220 + }, + { + "epoch": 0.9454934185106, + "grad_norm": 0.0031715496443212032, + "learning_rate": 5.474591033346844e-06, + "loss": 0.026157155632972717, + "step": 220230 + }, + { + "epoch": 0.9455363506006199, + "grad_norm": 0.053399458527565, + "learning_rate": 5.4702793132292204e-06, + "loss": 0.18398308753967285, + "step": 220240 + }, + { + "epoch": 0.94557928269064, + "grad_norm": 0.3509232997894287, + "learning_rate": 5.465967593111596e-06, + "loss": 0.005688534304499626, + "step": 220250 + }, + { + "epoch": 0.94562221478066, + "grad_norm": 0.003000019583851099, + "learning_rate": 5.461655872993973e-06, + "loss": 0.2526733875274658, + "step": 220260 + }, + { + "epoch": 0.9456651468706799, + "grad_norm": 0.8390291929244995, + "learning_rate": 5.457344152876349e-06, + "loss": 0.293306303024292, + "step": 220270 + }, + { + "epoch": 0.9457080789607, + "grad_norm": 0.4303343892097473, + "learning_rate": 5.453032432758725e-06, + "loss": 0.2933788299560547, + "step": 220280 + }, + { + "epoch": 0.94575101105072, + "grad_norm": 2.3883321285247803, + "learning_rate": 5.448720712641101e-06, + "loss": 0.4658994197845459, + "step": 220290 + }, + { + "epoch": 0.94579394314074, + "grad_norm": 0.0033390114549547434, + "learning_rate": 5.444408992523478e-06, + "loss": 0.3251925468444824, + "step": 220300 + }, + { + "epoch": 0.94583687523076, + "grad_norm": 0.006041026208549738, + "learning_rate": 5.440097272405853e-06, + "loss": 0.05924082398414612, + "step": 220310 + }, + { + "epoch": 0.94587980732078, + "grad_norm": 1.786308765411377, + "learning_rate": 5.43578555228823e-06, + "loss": 0.15548089742660523, + "step": 220320 + }, + { + "epoch": 0.9459227394108, + "grad_norm": 29.166587829589844, + "learning_rate": 5.431473832170606e-06, + "loss": 0.2582669734954834, + "step": 220330 + }, + { + "epoch": 0.94596567150082, + "grad_norm": 0.40726613998413086, + "learning_rate": 5.427162112052982e-06, + "loss": 0.19830280542373657, + "step": 220340 + }, + { + "epoch": 0.94600860359084, + "grad_norm": 0.1460336297750473, + "learning_rate": 5.4228503919353585e-06, + "loss": 0.2719969987869263, + "step": 220350 + }, + { + "epoch": 0.94605153568086, + "grad_norm": 0.004638573620468378, + "learning_rate": 5.418538671817736e-06, + "loss": 0.287063455581665, + "step": 220360 + }, + { + "epoch": 0.94609446777088, + "grad_norm": 0.43055668473243713, + "learning_rate": 5.4142269517001115e-06, + "loss": 0.11195619106292724, + "step": 220370 + }, + { + "epoch": 0.9461373998609001, + "grad_norm": 0.0006677526980638504, + "learning_rate": 5.409915231582488e-06, + "loss": 0.2058488368988037, + "step": 220380 + }, + { + "epoch": 0.94618033195092, + "grad_norm": 0.010899543762207031, + "learning_rate": 5.4056035114648645e-06, + "loss": 0.1301390767097473, + "step": 220390 + }, + { + "epoch": 0.94622326404094, + "grad_norm": 0.5160672664642334, + "learning_rate": 5.40129179134724e-06, + "loss": 0.15139005184173585, + "step": 220400 + }, + { + "epoch": 0.9462661961309601, + "grad_norm": 0.4792172610759735, + "learning_rate": 5.396980071229617e-06, + "loss": 0.03056088089942932, + "step": 220410 + }, + { + "epoch": 0.94630912822098, + "grad_norm": 0.006284055765718222, + "learning_rate": 5.392668351111993e-06, + "loss": 0.0028323063626885412, + "step": 220420 + }, + { + "epoch": 0.946352060311, + "grad_norm": 0.10239158570766449, + "learning_rate": 5.38835663099437e-06, + "loss": 0.15772597789764403, + "step": 220430 + }, + { + "epoch": 0.9463949924010201, + "grad_norm": 0.047647874802351, + "learning_rate": 5.384044910876745e-06, + "loss": 0.19255368709564208, + "step": 220440 + }, + { + "epoch": 0.94643792449104, + "grad_norm": 0.02043193392455578, + "learning_rate": 5.379733190759122e-06, + "loss": 0.05766103863716125, + "step": 220450 + }, + { + "epoch": 0.9464808565810601, + "grad_norm": 0.2842939496040344, + "learning_rate": 5.375421470641498e-06, + "loss": 0.13044567108154298, + "step": 220460 + }, + { + "epoch": 0.9465237886710801, + "grad_norm": 0.002361274790018797, + "learning_rate": 5.371109750523874e-06, + "loss": 0.10269827842712402, + "step": 220470 + }, + { + "epoch": 0.9465667207611, + "grad_norm": 0.015903867781162262, + "learning_rate": 5.36679803040625e-06, + "loss": 0.45228824615478513, + "step": 220480 + }, + { + "epoch": 0.9466096528511201, + "grad_norm": 0.014387888833880424, + "learning_rate": 5.362486310288627e-06, + "loss": 0.09613164663314819, + "step": 220490 + }, + { + "epoch": 0.9466525849411401, + "grad_norm": 0.0561012402176857, + "learning_rate": 5.3581745901710026e-06, + "loss": 0.09313885569572448, + "step": 220500 + }, + { + "epoch": 0.9466955170311601, + "grad_norm": 0.1349417120218277, + "learning_rate": 5.353862870053379e-06, + "loss": 0.0629725456237793, + "step": 220510 + }, + { + "epoch": 0.9467384491211801, + "grad_norm": 1.3292343616485596, + "learning_rate": 5.3495511499357555e-06, + "loss": 0.13882611989974974, + "step": 220520 + }, + { + "epoch": 0.9467813812112001, + "grad_norm": 0.27240267395973206, + "learning_rate": 5.345239429818131e-06, + "loss": 0.3212329149246216, + "step": 220530 + }, + { + "epoch": 0.9468243133012201, + "grad_norm": 0.0015581254847347736, + "learning_rate": 5.3409277097005085e-06, + "loss": 0.08865725994110107, + "step": 220540 + }, + { + "epoch": 0.9468672453912401, + "grad_norm": 0.024519003927707672, + "learning_rate": 5.336615989582885e-06, + "loss": 0.010979881882667542, + "step": 220550 + }, + { + "epoch": 0.9469101774812602, + "grad_norm": 6.637258529663086, + "learning_rate": 5.332304269465261e-06, + "loss": 0.10928783416748047, + "step": 220560 + }, + { + "epoch": 0.9469531095712801, + "grad_norm": 0.15391530096530914, + "learning_rate": 5.327992549347637e-06, + "loss": 0.22134747505187988, + "step": 220570 + }, + { + "epoch": 0.9469960416613001, + "grad_norm": 3.40328049659729, + "learning_rate": 5.323680829230014e-06, + "loss": 0.32368927001953124, + "step": 220580 + }, + { + "epoch": 0.9470389737513202, + "grad_norm": 1.0748347043991089, + "learning_rate": 5.319369109112389e-06, + "loss": 0.30531432628631594, + "step": 220590 + }, + { + "epoch": 0.9470819058413402, + "grad_norm": 0.009988274425268173, + "learning_rate": 5.315057388994766e-06, + "loss": 0.07916906476020813, + "step": 220600 + }, + { + "epoch": 0.9471248379313602, + "grad_norm": 0.06923183798789978, + "learning_rate": 5.310745668877142e-06, + "loss": 0.2095344305038452, + "step": 220610 + }, + { + "epoch": 0.9471677700213802, + "grad_norm": 0.7235842347145081, + "learning_rate": 5.306433948759519e-06, + "loss": 0.2882643938064575, + "step": 220620 + }, + { + "epoch": 0.9472107021114002, + "grad_norm": 0.03255331888794899, + "learning_rate": 5.3021222286418945e-06, + "loss": 0.14885547161102294, + "step": 220630 + }, + { + "epoch": 0.9472536342014202, + "grad_norm": 0.00038300984306260943, + "learning_rate": 5.297810508524271e-06, + "loss": 0.11864241361618041, + "step": 220640 + }, + { + "epoch": 0.9472965662914402, + "grad_norm": 1.5155200958251953, + "learning_rate": 5.2934987884066475e-06, + "loss": 0.17059919834136963, + "step": 220650 + }, + { + "epoch": 0.9473394983814603, + "grad_norm": 0.0005635952693410218, + "learning_rate": 5.289187068289023e-06, + "loss": 0.19965181350708008, + "step": 220660 + }, + { + "epoch": 0.9473824304714802, + "grad_norm": 0.0028896143194288015, + "learning_rate": 5.2848753481714e-06, + "loss": 0.11965832710266114, + "step": 220670 + }, + { + "epoch": 0.9474253625615002, + "grad_norm": 0.004104061983525753, + "learning_rate": 5.280563628053776e-06, + "loss": 0.14908111095428467, + "step": 220680 + }, + { + "epoch": 0.9474682946515203, + "grad_norm": 1.0493465662002563, + "learning_rate": 5.276251907936152e-06, + "loss": 0.15110547542572023, + "step": 220690 + }, + { + "epoch": 0.9475112267415402, + "grad_norm": 3.4508376121520996, + "learning_rate": 5.271940187818528e-06, + "loss": 0.20434744358062745, + "step": 220700 + }, + { + "epoch": 0.9475541588315602, + "grad_norm": 0.0009196574683301151, + "learning_rate": 5.267628467700905e-06, + "loss": 0.26406259536743165, + "step": 220710 + }, + { + "epoch": 0.9475970909215803, + "grad_norm": 0.0016605377895757556, + "learning_rate": 5.263316747583281e-06, + "loss": 0.14054032564163207, + "step": 220720 + }, + { + "epoch": 0.9476400230116002, + "grad_norm": 0.4991012513637543, + "learning_rate": 5.259005027465658e-06, + "loss": 0.1640143871307373, + "step": 220730 + }, + { + "epoch": 0.9476829551016203, + "grad_norm": 0.18163564801216125, + "learning_rate": 5.254693307348034e-06, + "loss": 0.2269913911819458, + "step": 220740 + }, + { + "epoch": 0.9477258871916403, + "grad_norm": 0.13701975345611572, + "learning_rate": 5.25038158723041e-06, + "loss": 0.1771934747695923, + "step": 220750 + }, + { + "epoch": 0.9477688192816602, + "grad_norm": 0.009738288819789886, + "learning_rate": 5.246069867112786e-06, + "loss": 0.12999645471572877, + "step": 220760 + }, + { + "epoch": 0.9478117513716803, + "grad_norm": 0.004294713959097862, + "learning_rate": 5.241758146995163e-06, + "loss": 0.21597652435302733, + "step": 220770 + }, + { + "epoch": 0.9478546834617003, + "grad_norm": 0.0014341874048113823, + "learning_rate": 5.2374464268775385e-06, + "loss": 0.1990837574005127, + "step": 220780 + }, + { + "epoch": 0.9478976155517203, + "grad_norm": 0.0011496704537421465, + "learning_rate": 5.233134706759915e-06, + "loss": 0.2995701789855957, + "step": 220790 + }, + { + "epoch": 0.9479405476417403, + "grad_norm": 0.32946211099624634, + "learning_rate": 5.2288229866422915e-06, + "loss": 0.08457562923431397, + "step": 220800 + }, + { + "epoch": 0.9479834797317603, + "grad_norm": 2.4673047065734863, + "learning_rate": 5.224511266524668e-06, + "loss": 0.3051250457763672, + "step": 220810 + }, + { + "epoch": 0.9480264118217803, + "grad_norm": 1.6087555885314941, + "learning_rate": 5.220199546407044e-06, + "loss": 0.11010923385620117, + "step": 220820 + }, + { + "epoch": 0.9480693439118003, + "grad_norm": 2.3338229656219482, + "learning_rate": 5.21588782628942e-06, + "loss": 0.2138131618499756, + "step": 220830 + }, + { + "epoch": 0.9481122760018204, + "grad_norm": 5.679168224334717, + "learning_rate": 5.211576106171797e-06, + "loss": 0.17699214220046997, + "step": 220840 + }, + { + "epoch": 0.9481552080918403, + "grad_norm": 0.005776789505034685, + "learning_rate": 5.207264386054172e-06, + "loss": 0.10060790777206421, + "step": 220850 + }, + { + "epoch": 0.9481981401818603, + "grad_norm": 0.015451615676283836, + "learning_rate": 5.202952665936549e-06, + "loss": 0.37086925506591795, + "step": 220860 + }, + { + "epoch": 0.9482410722718804, + "grad_norm": 1.4263032674789429, + "learning_rate": 5.198640945818925e-06, + "loss": 0.2528158903121948, + "step": 220870 + }, + { + "epoch": 0.9482840043619003, + "grad_norm": 0.03226279467344284, + "learning_rate": 5.194329225701301e-06, + "loss": 0.08974118828773499, + "step": 220880 + }, + { + "epoch": 0.9483269364519203, + "grad_norm": 4.70692777633667, + "learning_rate": 5.1900175055836774e-06, + "loss": 0.2029585361480713, + "step": 220890 + }, + { + "epoch": 0.9483698685419404, + "grad_norm": 0.20702318847179413, + "learning_rate": 5.185705785466054e-06, + "loss": 0.2841609001159668, + "step": 220900 + }, + { + "epoch": 0.9484128006319603, + "grad_norm": 0.005325347185134888, + "learning_rate": 5.18139406534843e-06, + "loss": 0.17788323163986205, + "step": 220910 + }, + { + "epoch": 0.9484557327219804, + "grad_norm": 1.380802869796753, + "learning_rate": 5.177082345230807e-06, + "loss": 0.1807429552078247, + "step": 220920 + }, + { + "epoch": 0.9484986648120004, + "grad_norm": 0.017762403935194016, + "learning_rate": 5.172770625113183e-06, + "loss": 0.12475830316543579, + "step": 220930 + }, + { + "epoch": 0.9485415969020203, + "grad_norm": 1.45126211643219, + "learning_rate": 5.168458904995559e-06, + "loss": 0.06799777746200561, + "step": 220940 + }, + { + "epoch": 0.9485845289920404, + "grad_norm": 0.7015895843505859, + "learning_rate": 5.1641471848779356e-06, + "loss": 0.2524296760559082, + "step": 220950 + }, + { + "epoch": 0.9486274610820604, + "grad_norm": 0.03411026671528816, + "learning_rate": 5.159835464760312e-06, + "loss": 0.1578918218612671, + "step": 220960 + }, + { + "epoch": 0.9486703931720804, + "grad_norm": 0.09593575447797775, + "learning_rate": 5.155523744642688e-06, + "loss": 0.3851247072219849, + "step": 220970 + }, + { + "epoch": 0.9487133252621004, + "grad_norm": 0.019091414287686348, + "learning_rate": 5.151212024525064e-06, + "loss": 0.14782023429870605, + "step": 220980 + }, + { + "epoch": 0.9487562573521204, + "grad_norm": 0.10230414569377899, + "learning_rate": 5.146900304407441e-06, + "loss": 0.17959287166595458, + "step": 220990 + }, + { + "epoch": 0.9487991894421404, + "grad_norm": 1.1342577934265137, + "learning_rate": 5.142588584289817e-06, + "loss": 0.2069911003112793, + "step": 221000 + }, + { + "epoch": 0.9487991894421404, + "eval_loss": 0.37033191323280334, + "eval_runtime": 27.4293, + "eval_samples_per_second": 3.646, + "eval_steps_per_second": 3.646, + "step": 221000 + }, + { + "epoch": 0.9488421215321604, + "grad_norm": 0.01214311458170414, + "learning_rate": 5.138276864172193e-06, + "loss": 0.07155942916870117, + "step": 221010 + }, + { + "epoch": 0.9488850536221805, + "grad_norm": 0.0067489249631762505, + "learning_rate": 5.133965144054569e-06, + "loss": 0.14288434982299805, + "step": 221020 + }, + { + "epoch": 0.9489279857122005, + "grad_norm": 0.021981053054332733, + "learning_rate": 5.129653423936946e-06, + "loss": 0.08851742744445801, + "step": 221030 + }, + { + "epoch": 0.9489709178022204, + "grad_norm": 0.016317633911967278, + "learning_rate": 5.1253417038193215e-06, + "loss": 0.13080765008926393, + "step": 221040 + }, + { + "epoch": 0.9490138498922405, + "grad_norm": 7.504158973693848, + "learning_rate": 5.121029983701698e-06, + "loss": 0.29133789539337157, + "step": 221050 + }, + { + "epoch": 0.9490567819822605, + "grad_norm": 0.020063180476427078, + "learning_rate": 5.1167182635840745e-06, + "loss": 0.11202926635742187, + "step": 221060 + }, + { + "epoch": 0.9490997140722804, + "grad_norm": 0.18971426784992218, + "learning_rate": 5.11240654346645e-06, + "loss": 0.18990329504013062, + "step": 221070 + }, + { + "epoch": 0.9491426461623005, + "grad_norm": 0.0023197117261588573, + "learning_rate": 5.108094823348827e-06, + "loss": 0.24900193214416505, + "step": 221080 + }, + { + "epoch": 0.9491855782523205, + "grad_norm": 0.003275302704423666, + "learning_rate": 5.103783103231204e-06, + "loss": 0.1832704186439514, + "step": 221090 + }, + { + "epoch": 0.9492285103423405, + "grad_norm": 0.011212700977921486, + "learning_rate": 5.09947138311358e-06, + "loss": 0.12993799448013305, + "step": 221100 + }, + { + "epoch": 0.9492714424323605, + "grad_norm": 0.09696463495492935, + "learning_rate": 5.095159662995956e-06, + "loss": 0.1601085901260376, + "step": 221110 + }, + { + "epoch": 0.9493143745223805, + "grad_norm": 0.07188393920660019, + "learning_rate": 5.090847942878333e-06, + "loss": 0.192316997051239, + "step": 221120 + }, + { + "epoch": 0.9493573066124005, + "grad_norm": 0.055957090109586716, + "learning_rate": 5.086536222760708e-06, + "loss": 0.15709545612335205, + "step": 221130 + }, + { + "epoch": 0.9494002387024205, + "grad_norm": 0.9220591187477112, + "learning_rate": 5.082224502643085e-06, + "loss": 0.24518163204193116, + "step": 221140 + }, + { + "epoch": 0.9494431707924406, + "grad_norm": 1.6489856243133545, + "learning_rate": 5.077912782525461e-06, + "loss": 0.218973708152771, + "step": 221150 + }, + { + "epoch": 0.9494861028824605, + "grad_norm": 0.0015439556445926428, + "learning_rate": 5.073601062407837e-06, + "loss": 0.19692326784133912, + "step": 221160 + }, + { + "epoch": 0.9495290349724805, + "grad_norm": 0.1865650713443756, + "learning_rate": 5.069289342290213e-06, + "loss": 0.16926788091659545, + "step": 221170 + }, + { + "epoch": 0.9495719670625006, + "grad_norm": 1.5441687107086182, + "learning_rate": 5.06497762217259e-06, + "loss": 0.17856531143188475, + "step": 221180 + }, + { + "epoch": 0.9496148991525205, + "grad_norm": 0.028018776327371597, + "learning_rate": 5.060665902054966e-06, + "loss": 0.20565359592437743, + "step": 221190 + }, + { + "epoch": 0.9496578312425406, + "grad_norm": 2.1190805435180664, + "learning_rate": 5.056354181937342e-06, + "loss": 0.15518250465393066, + "step": 221200 + }, + { + "epoch": 0.9497007633325606, + "grad_norm": 1.3152310848236084, + "learning_rate": 5.0520424618197185e-06, + "loss": 0.25301418304443357, + "step": 221210 + }, + { + "epoch": 0.9497436954225805, + "grad_norm": 1.5250507593154907, + "learning_rate": 5.047730741702095e-06, + "loss": 0.20646183490753173, + "step": 221220 + }, + { + "epoch": 0.9497866275126006, + "grad_norm": 0.0007811338873580098, + "learning_rate": 5.043419021584471e-06, + "loss": 0.17377300262451173, + "step": 221230 + }, + { + "epoch": 0.9498295596026206, + "grad_norm": 1.1598446369171143, + "learning_rate": 5.039107301466847e-06, + "loss": 0.2322542667388916, + "step": 221240 + }, + { + "epoch": 0.9498724916926405, + "grad_norm": 0.009316137060523033, + "learning_rate": 5.034795581349224e-06, + "loss": 0.07732200622558594, + "step": 221250 + }, + { + "epoch": 0.9499154237826606, + "grad_norm": 0.407930850982666, + "learning_rate": 5.030483861231599e-06, + "loss": 0.24793801307678223, + "step": 221260 + }, + { + "epoch": 0.9499583558726806, + "grad_norm": 0.048544321209192276, + "learning_rate": 5.026172141113977e-06, + "loss": 0.07298610806465149, + "step": 221270 + }, + { + "epoch": 0.9500012879627006, + "grad_norm": 1.4429339170455933, + "learning_rate": 5.021860420996353e-06, + "loss": 0.23808639049530028, + "step": 221280 + }, + { + "epoch": 0.9500442200527206, + "grad_norm": 0.14647912979125977, + "learning_rate": 5.017548700878729e-06, + "loss": 0.09555991888046264, + "step": 221290 + }, + { + "epoch": 0.9500871521427406, + "grad_norm": 3.288055658340454, + "learning_rate": 5.013236980761105e-06, + "loss": 0.18354005813598634, + "step": 221300 + }, + { + "epoch": 0.9501300842327606, + "grad_norm": 0.00022563387756235898, + "learning_rate": 5.008925260643482e-06, + "loss": 0.196072518825531, + "step": 221310 + }, + { + "epoch": 0.9501730163227806, + "grad_norm": 1.5810396671295166, + "learning_rate": 5.0046135405258574e-06, + "loss": 0.21989688873291016, + "step": 221320 + }, + { + "epoch": 0.9502159484128007, + "grad_norm": 1.266875147819519, + "learning_rate": 5.000301820408234e-06, + "loss": 0.13136966228485109, + "step": 221330 + }, + { + "epoch": 0.9502588805028206, + "grad_norm": 0.001413840800523758, + "learning_rate": 4.99599010029061e-06, + "loss": 0.159348464012146, + "step": 221340 + }, + { + "epoch": 0.9503018125928406, + "grad_norm": 0.009680492803454399, + "learning_rate": 4.991678380172986e-06, + "loss": 0.18243454694747924, + "step": 221350 + }, + { + "epoch": 0.9503447446828607, + "grad_norm": 0.009590870700776577, + "learning_rate": 4.9873666600553626e-06, + "loss": 0.4021789073944092, + "step": 221360 + }, + { + "epoch": 0.9503876767728806, + "grad_norm": 0.002812737599015236, + "learning_rate": 4.983054939937739e-06, + "loss": 0.34322056770324705, + "step": 221370 + }, + { + "epoch": 0.9504306088629006, + "grad_norm": 0.021498220041394234, + "learning_rate": 4.9787432198201156e-06, + "loss": 0.1618666648864746, + "step": 221380 + }, + { + "epoch": 0.9504735409529207, + "grad_norm": 0.000619508558884263, + "learning_rate": 4.974431499702491e-06, + "loss": 0.4161521911621094, + "step": 221390 + }, + { + "epoch": 0.9505164730429406, + "grad_norm": 0.015014680102467537, + "learning_rate": 4.970119779584868e-06, + "loss": 0.1755547881126404, + "step": 221400 + }, + { + "epoch": 0.9505594051329607, + "grad_norm": 0.9173482060432434, + "learning_rate": 4.965808059467244e-06, + "loss": 0.4441547870635986, + "step": 221410 + }, + { + "epoch": 0.9506023372229807, + "grad_norm": 0.007916325703263283, + "learning_rate": 4.96149633934962e-06, + "loss": 0.05237484574317932, + "step": 221420 + }, + { + "epoch": 0.9506452693130006, + "grad_norm": 0.14865094423294067, + "learning_rate": 4.957184619231996e-06, + "loss": 0.20918598175048828, + "step": 221430 + }, + { + "epoch": 0.9506882014030207, + "grad_norm": 3.4020707607269287, + "learning_rate": 4.952872899114373e-06, + "loss": 0.18745356798171997, + "step": 221440 + }, + { + "epoch": 0.9507311334930407, + "grad_norm": 0.0011085561709478498, + "learning_rate": 4.948561178996749e-06, + "loss": 0.17667560577392577, + "step": 221450 + }, + { + "epoch": 0.9507740655830608, + "grad_norm": 8.518034934997559, + "learning_rate": 4.944249458879126e-06, + "loss": 0.09774234294891357, + "step": 221460 + }, + { + "epoch": 0.9508169976730807, + "grad_norm": 0.3491690754890442, + "learning_rate": 4.939937738761502e-06, + "loss": 0.2363292694091797, + "step": 221470 + }, + { + "epoch": 0.9508599297631007, + "grad_norm": 0.011719225905835629, + "learning_rate": 4.935626018643878e-06, + "loss": 0.18856680393218994, + "step": 221480 + }, + { + "epoch": 0.9509028618531208, + "grad_norm": 0.00425582705065608, + "learning_rate": 4.9313142985262545e-06, + "loss": 0.1619241714477539, + "step": 221490 + }, + { + "epoch": 0.9509457939431407, + "grad_norm": 0.0060163987800478935, + "learning_rate": 4.927002578408631e-06, + "loss": 0.16561635732650756, + "step": 221500 + }, + { + "epoch": 0.9509887260331608, + "grad_norm": 0.01624118909239769, + "learning_rate": 4.922690858291007e-06, + "loss": 0.04563118517398834, + "step": 221510 + }, + { + "epoch": 0.9510316581231808, + "grad_norm": 1.32929265499115, + "learning_rate": 4.918379138173383e-06, + "loss": 0.24808201789855958, + "step": 221520 + }, + { + "epoch": 0.9510745902132007, + "grad_norm": 0.03225236386060715, + "learning_rate": 4.91406741805576e-06, + "loss": 0.24988224506378173, + "step": 221530 + }, + { + "epoch": 0.9511175223032208, + "grad_norm": 2.835340738296509, + "learning_rate": 4.909755697938135e-06, + "loss": 0.15641406774520875, + "step": 221540 + }, + { + "epoch": 0.9511604543932408, + "grad_norm": 0.08538325875997543, + "learning_rate": 4.905443977820512e-06, + "loss": 0.10083311796188354, + "step": 221550 + }, + { + "epoch": 0.9512033864832607, + "grad_norm": 3.253385305404663, + "learning_rate": 4.901132257702888e-06, + "loss": 0.12257604598999024, + "step": 221560 + }, + { + "epoch": 0.9512463185732808, + "grad_norm": 0.0689903274178505, + "learning_rate": 4.896820537585265e-06, + "loss": 0.2482537269592285, + "step": 221570 + }, + { + "epoch": 0.9512892506633008, + "grad_norm": 0.9589329361915588, + "learning_rate": 4.89250881746764e-06, + "loss": 0.14661691188812256, + "step": 221580 + }, + { + "epoch": 0.9513321827533208, + "grad_norm": 0.8014940023422241, + "learning_rate": 4.888197097350017e-06, + "loss": 0.15701075792312622, + "step": 221590 + }, + { + "epoch": 0.9513751148433408, + "grad_norm": 0.0076012699864804745, + "learning_rate": 4.883885377232393e-06, + "loss": 0.20025289058685303, + "step": 221600 + }, + { + "epoch": 0.9514180469333609, + "grad_norm": 0.9554777145385742, + "learning_rate": 4.879573657114769e-06, + "loss": 0.21109647750854493, + "step": 221610 + }, + { + "epoch": 0.9514609790233808, + "grad_norm": 0.03870271146297455, + "learning_rate": 4.8752619369971455e-06, + "loss": 0.151367712020874, + "step": 221620 + }, + { + "epoch": 0.9515039111134008, + "grad_norm": 12.868548393249512, + "learning_rate": 4.870950216879523e-06, + "loss": 0.17155493497848512, + "step": 221630 + }, + { + "epoch": 0.9515468432034209, + "grad_norm": 1.6246249675750732, + "learning_rate": 4.8666384967618985e-06, + "loss": 0.3529273509979248, + "step": 221640 + }, + { + "epoch": 0.9515897752934408, + "grad_norm": 1.4000318050384521, + "learning_rate": 4.862326776644275e-06, + "loss": 0.4355010509490967, + "step": 221650 + }, + { + "epoch": 0.9516327073834608, + "grad_norm": 4.835864543914795, + "learning_rate": 4.8580150565266515e-06, + "loss": 0.22297344207763672, + "step": 221660 + }, + { + "epoch": 0.9516756394734809, + "grad_norm": 0.011678424663841724, + "learning_rate": 4.853703336409027e-06, + "loss": 0.10040615797042847, + "step": 221670 + }, + { + "epoch": 0.9517185715635008, + "grad_norm": 0.04759407415986061, + "learning_rate": 4.849391616291404e-06, + "loss": 0.20213911533355713, + "step": 221680 + }, + { + "epoch": 0.9517615036535209, + "grad_norm": 0.0031532905995845795, + "learning_rate": 4.84507989617378e-06, + "loss": 0.26140198707580564, + "step": 221690 + }, + { + "epoch": 0.9518044357435409, + "grad_norm": 0.004247985314577818, + "learning_rate": 4.840768176056156e-06, + "loss": 0.2709981441497803, + "step": 221700 + }, + { + "epoch": 0.9518473678335608, + "grad_norm": 0.08644208312034607, + "learning_rate": 4.836456455938532e-06, + "loss": 0.15383058786392212, + "step": 221710 + }, + { + "epoch": 0.9518902999235809, + "grad_norm": 0.18716543912887573, + "learning_rate": 4.832144735820909e-06, + "loss": 0.13800834417343139, + "step": 221720 + }, + { + "epoch": 0.9519332320136009, + "grad_norm": 0.020547056570649147, + "learning_rate": 4.8278330157032844e-06, + "loss": 0.22123284339904786, + "step": 221730 + }, + { + "epoch": 0.9519761641036208, + "grad_norm": 0.0005546602769754827, + "learning_rate": 4.823521295585661e-06, + "loss": 0.05822961926460266, + "step": 221740 + }, + { + "epoch": 0.9520190961936409, + "grad_norm": 0.007388788275420666, + "learning_rate": 4.8192095754680374e-06, + "loss": 0.07534821033477783, + "step": 221750 + }, + { + "epoch": 0.9520620282836609, + "grad_norm": 0.06283167749643326, + "learning_rate": 4.814897855350414e-06, + "loss": 0.05532159805297852, + "step": 221760 + }, + { + "epoch": 0.9521049603736809, + "grad_norm": 0.12734369933605194, + "learning_rate": 4.81058613523279e-06, + "loss": 0.06152777075767517, + "step": 221770 + }, + { + "epoch": 0.9521478924637009, + "grad_norm": 0.018678177148103714, + "learning_rate": 4.806274415115166e-06, + "loss": 0.1855409026145935, + "step": 221780 + }, + { + "epoch": 0.952190824553721, + "grad_norm": 0.007766157388687134, + "learning_rate": 4.8019626949975426e-06, + "loss": 0.13776013851165772, + "step": 221790 + }, + { + "epoch": 0.9522337566437409, + "grad_norm": 0.0021806948352605104, + "learning_rate": 4.797650974879918e-06, + "loss": 0.24061529636383056, + "step": 221800 + }, + { + "epoch": 0.9522766887337609, + "grad_norm": 0.0725255087018013, + "learning_rate": 4.793339254762295e-06, + "loss": 0.002632809802889824, + "step": 221810 + }, + { + "epoch": 0.952319620823781, + "grad_norm": 0.023199887946248055, + "learning_rate": 4.789027534644672e-06, + "loss": 0.05077831149101257, + "step": 221820 + }, + { + "epoch": 0.9523625529138009, + "grad_norm": 1.4895657300949097, + "learning_rate": 4.784715814527048e-06, + "loss": 0.10186721086502075, + "step": 221830 + }, + { + "epoch": 0.9524054850038209, + "grad_norm": 2.4794552326202393, + "learning_rate": 4.780404094409424e-06, + "loss": 0.31405203342437743, + "step": 221840 + }, + { + "epoch": 0.952448417093841, + "grad_norm": 0.003952878527343273, + "learning_rate": 4.776092374291801e-06, + "loss": 0.2233206510543823, + "step": 221850 + }, + { + "epoch": 0.9524913491838609, + "grad_norm": 0.006409044843167067, + "learning_rate": 4.771780654174176e-06, + "loss": 0.20069022178649903, + "step": 221860 + }, + { + "epoch": 0.952534281273881, + "grad_norm": 8.640118598937988, + "learning_rate": 4.767468934056553e-06, + "loss": 0.11834697723388672, + "step": 221870 + }, + { + "epoch": 0.952577213363901, + "grad_norm": 0.11279051005840302, + "learning_rate": 4.763157213938929e-06, + "loss": 0.1271551489830017, + "step": 221880 + }, + { + "epoch": 0.952620145453921, + "grad_norm": 1.8554179668426514, + "learning_rate": 4.758845493821305e-06, + "loss": 0.1396311640739441, + "step": 221890 + }, + { + "epoch": 0.952663077543941, + "grad_norm": 0.005013489164412022, + "learning_rate": 4.7545337737036815e-06, + "loss": 0.08952710628509522, + "step": 221900 + }, + { + "epoch": 0.952706009633961, + "grad_norm": 0.006020084954798222, + "learning_rate": 4.750222053586058e-06, + "loss": 0.20054419040679933, + "step": 221910 + }, + { + "epoch": 0.9527489417239811, + "grad_norm": 0.019753310829401016, + "learning_rate": 4.745910333468434e-06, + "loss": 0.23398704528808595, + "step": 221920 + }, + { + "epoch": 0.952791873814001, + "grad_norm": 0.00018748146248981357, + "learning_rate": 4.74159861335081e-06, + "loss": 0.15820237398147582, + "step": 221930 + }, + { + "epoch": 0.952834805904021, + "grad_norm": 0.02594497613608837, + "learning_rate": 4.737286893233187e-06, + "loss": 0.09363011121749878, + "step": 221940 + }, + { + "epoch": 0.9528777379940411, + "grad_norm": 0.0038654941599816084, + "learning_rate": 4.732975173115563e-06, + "loss": 0.21626200675964355, + "step": 221950 + }, + { + "epoch": 0.952920670084061, + "grad_norm": 0.00581236369907856, + "learning_rate": 4.728663452997939e-06, + "loss": 0.19654461145401, + "step": 221960 + }, + { + "epoch": 0.952963602174081, + "grad_norm": 6.517816543579102, + "learning_rate": 4.724351732880315e-06, + "loss": 0.4622176170349121, + "step": 221970 + }, + { + "epoch": 0.9530065342641011, + "grad_norm": 0.012695305980741978, + "learning_rate": 4.720040012762692e-06, + "loss": 0.21787841320037843, + "step": 221980 + }, + { + "epoch": 0.953049466354121, + "grad_norm": 0.3617287576198578, + "learning_rate": 4.715728292645067e-06, + "loss": 0.25056912899017336, + "step": 221990 + }, + { + "epoch": 0.9530923984441411, + "grad_norm": 0.04657085984945297, + "learning_rate": 4.711416572527445e-06, + "loss": 0.17787359952926635, + "step": 222000 + }, + { + "epoch": 0.9530923984441411, + "eval_loss": 0.3706425726413727, + "eval_runtime": 27.4304, + "eval_samples_per_second": 3.646, + "eval_steps_per_second": 3.646, + "step": 222000 + }, + { + "epoch": 0.9531353305341611, + "grad_norm": 0.03316396474838257, + "learning_rate": 4.707104852409821e-06, + "loss": 0.15991002321243286, + "step": 222010 + }, + { + "epoch": 0.953178262624181, + "grad_norm": 0.36852920055389404, + "learning_rate": 4.702793132292197e-06, + "loss": 0.1532452940940857, + "step": 222020 + }, + { + "epoch": 0.9532211947142011, + "grad_norm": 4.372796535491943, + "learning_rate": 4.698481412174573e-06, + "loss": 0.2997922897338867, + "step": 222030 + }, + { + "epoch": 0.9532641268042211, + "grad_norm": 2.077420711517334, + "learning_rate": 4.69416969205695e-06, + "loss": 0.3332381248474121, + "step": 222040 + }, + { + "epoch": 0.953307058894241, + "grad_norm": 0.002367915352806449, + "learning_rate": 4.6898579719393255e-06, + "loss": 0.16526131629943847, + "step": 222050 + }, + { + "epoch": 0.9533499909842611, + "grad_norm": 7.411321640014648, + "learning_rate": 4.685546251821702e-06, + "loss": 0.24013891220092773, + "step": 222060 + }, + { + "epoch": 0.9533929230742811, + "grad_norm": 0.02261611446738243, + "learning_rate": 4.6812345317040785e-06, + "loss": 0.09087812900543213, + "step": 222070 + }, + { + "epoch": 0.9534358551643011, + "grad_norm": 0.01428727526217699, + "learning_rate": 4.676922811586454e-06, + "loss": 0.21020045280456542, + "step": 222080 + }, + { + "epoch": 0.9534787872543211, + "grad_norm": 0.25873035192489624, + "learning_rate": 4.672611091468831e-06, + "loss": 0.07374967336654663, + "step": 222090 + }, + { + "epoch": 0.9535217193443412, + "grad_norm": 0.0773739367723465, + "learning_rate": 4.668299371351207e-06, + "loss": 0.3371073007583618, + "step": 222100 + }, + { + "epoch": 0.9535646514343611, + "grad_norm": 0.024157024919986725, + "learning_rate": 4.663987651233584e-06, + "loss": 0.1477481961250305, + "step": 222110 + }, + { + "epoch": 0.9536075835243811, + "grad_norm": 0.0007937061600387096, + "learning_rate": 4.659675931115959e-06, + "loss": 0.1839459180831909, + "step": 222120 + }, + { + "epoch": 0.9536505156144012, + "grad_norm": 0.010143321007490158, + "learning_rate": 4.655364210998336e-06, + "loss": 0.3554100275039673, + "step": 222130 + }, + { + "epoch": 0.9536934477044211, + "grad_norm": 0.01476583257317543, + "learning_rate": 4.651052490880712e-06, + "loss": 0.16000083684921265, + "step": 222140 + }, + { + "epoch": 0.9537363797944411, + "grad_norm": 0.0036839002277702093, + "learning_rate": 4.646740770763088e-06, + "loss": 0.12175587415695191, + "step": 222150 + }, + { + "epoch": 0.9537793118844612, + "grad_norm": 0.023834414780139923, + "learning_rate": 4.6424290506454645e-06, + "loss": 0.19544967412948608, + "step": 222160 + }, + { + "epoch": 0.9538222439744811, + "grad_norm": 0.0008877997170202434, + "learning_rate": 4.638117330527841e-06, + "loss": 0.21831424236297609, + "step": 222170 + }, + { + "epoch": 0.9538651760645012, + "grad_norm": 0.518056333065033, + "learning_rate": 4.6338056104102174e-06, + "loss": 0.277052640914917, + "step": 222180 + }, + { + "epoch": 0.9539081081545212, + "grad_norm": 2.052628517150879, + "learning_rate": 4.629493890292594e-06, + "loss": 0.18043670654296876, + "step": 222190 + }, + { + "epoch": 0.9539510402445411, + "grad_norm": 2.4332070350646973, + "learning_rate": 4.6251821701749704e-06, + "loss": 0.37254397869110106, + "step": 222200 + }, + { + "epoch": 0.9539939723345612, + "grad_norm": 2.9976701736450195, + "learning_rate": 4.620870450057346e-06, + "loss": 0.2258861780166626, + "step": 222210 + }, + { + "epoch": 0.9540369044245812, + "grad_norm": 4.1985344886779785, + "learning_rate": 4.616558729939723e-06, + "loss": 0.3831270694732666, + "step": 222220 + }, + { + "epoch": 0.9540798365146012, + "grad_norm": 1.095831274986267, + "learning_rate": 4.612247009822099e-06, + "loss": 0.2485506296157837, + "step": 222230 + }, + { + "epoch": 0.9541227686046212, + "grad_norm": 0.0008200284210033715, + "learning_rate": 4.607935289704475e-06, + "loss": 0.12095870971679687, + "step": 222240 + }, + { + "epoch": 0.9541657006946412, + "grad_norm": 0.001585979014635086, + "learning_rate": 4.603623569586851e-06, + "loss": 0.11065644025802612, + "step": 222250 + }, + { + "epoch": 0.9542086327846612, + "grad_norm": 1.3344343900680542, + "learning_rate": 4.599311849469228e-06, + "loss": 0.2467266798019409, + "step": 222260 + }, + { + "epoch": 0.9542515648746812, + "grad_norm": 0.8917443156242371, + "learning_rate": 4.595000129351603e-06, + "loss": 0.3195840358734131, + "step": 222270 + }, + { + "epoch": 0.9542944969647013, + "grad_norm": 0.0020116260275244713, + "learning_rate": 4.59068840923398e-06, + "loss": 0.24985339641571044, + "step": 222280 + }, + { + "epoch": 0.9543374290547212, + "grad_norm": 1.8657126426696777, + "learning_rate": 4.586376689116356e-06, + "loss": 0.19289579391479492, + "step": 222290 + }, + { + "epoch": 0.9543803611447412, + "grad_norm": 0.014383583329617977, + "learning_rate": 4.582064968998733e-06, + "loss": 0.17697054147720337, + "step": 222300 + }, + { + "epoch": 0.9544232932347613, + "grad_norm": 0.055165525525808334, + "learning_rate": 4.5777532488811085e-06, + "loss": 0.2178732395172119, + "step": 222310 + }, + { + "epoch": 0.9544662253247813, + "grad_norm": 0.20588642358779907, + "learning_rate": 4.573441528763485e-06, + "loss": 0.34146618843078613, + "step": 222320 + }, + { + "epoch": 0.9545091574148012, + "grad_norm": 1.7712666988372803, + "learning_rate": 4.5691298086458615e-06, + "loss": 0.22836158275604249, + "step": 222330 + }, + { + "epoch": 0.9545520895048213, + "grad_norm": 0.004596407525241375, + "learning_rate": 4.564818088528237e-06, + "loss": 0.016224928200244904, + "step": 222340 + }, + { + "epoch": 0.9545950215948413, + "grad_norm": 0.006730203051120043, + "learning_rate": 4.560506368410614e-06, + "loss": 0.07292388081550598, + "step": 222350 + }, + { + "epoch": 0.9546379536848613, + "grad_norm": 0.009615895338356495, + "learning_rate": 4.55619464829299e-06, + "loss": 0.09958805441856385, + "step": 222360 + }, + { + "epoch": 0.9546808857748813, + "grad_norm": 0.8345091342926025, + "learning_rate": 4.551882928175367e-06, + "loss": 0.3748096704483032, + "step": 222370 + }, + { + "epoch": 0.9547238178649013, + "grad_norm": 9.58636474609375, + "learning_rate": 4.547571208057743e-06, + "loss": 0.40106916427612305, + "step": 222380 + }, + { + "epoch": 0.9547667499549213, + "grad_norm": 0.22981485724449158, + "learning_rate": 4.54325948794012e-06, + "loss": 0.0750343382358551, + "step": 222390 + }, + { + "epoch": 0.9548096820449413, + "grad_norm": 1.1611047983169556, + "learning_rate": 4.538947767822495e-06, + "loss": 0.15348081588745116, + "step": 222400 + }, + { + "epoch": 0.9548526141349614, + "grad_norm": 0.0849420502781868, + "learning_rate": 4.534636047704872e-06, + "loss": 0.2569085359573364, + "step": 222410 + }, + { + "epoch": 0.9548955462249813, + "grad_norm": 0.020023846998810768, + "learning_rate": 4.530324327587248e-06, + "loss": 0.2325721263885498, + "step": 222420 + }, + { + "epoch": 0.9549384783150013, + "grad_norm": 1.6331207752227783, + "learning_rate": 4.526012607469624e-06, + "loss": 0.1936333417892456, + "step": 222430 + }, + { + "epoch": 0.9549814104050214, + "grad_norm": 0.06379146873950958, + "learning_rate": 4.521700887352e-06, + "loss": 0.12804726362228394, + "step": 222440 + }, + { + "epoch": 0.9550243424950413, + "grad_norm": 0.0015627113170921803, + "learning_rate": 4.517389167234377e-06, + "loss": 0.10898298025131226, + "step": 222450 + }, + { + "epoch": 0.9550672745850614, + "grad_norm": 5.185602188110352, + "learning_rate": 4.5130774471167526e-06, + "loss": 0.24877891540527344, + "step": 222460 + }, + { + "epoch": 0.9551102066750814, + "grad_norm": 0.001561668235808611, + "learning_rate": 4.508765726999129e-06, + "loss": 0.3175127744674683, + "step": 222470 + }, + { + "epoch": 0.9551531387651013, + "grad_norm": 0.04387206211686134, + "learning_rate": 4.5044540068815055e-06, + "loss": 0.14225680828094484, + "step": 222480 + }, + { + "epoch": 0.9551960708551214, + "grad_norm": 2.1273081302642822, + "learning_rate": 4.500142286763882e-06, + "loss": 0.20350861549377441, + "step": 222490 + }, + { + "epoch": 0.9552390029451414, + "grad_norm": 3.505652904510498, + "learning_rate": 4.495830566646258e-06, + "loss": 0.10865819454193115, + "step": 222500 + }, + { + "epoch": 0.9552819350351613, + "grad_norm": 0.023028582334518433, + "learning_rate": 4.491518846528634e-06, + "loss": 0.2138669490814209, + "step": 222510 + }, + { + "epoch": 0.9553248671251814, + "grad_norm": 0.03329871594905853, + "learning_rate": 4.487207126411011e-06, + "loss": 0.02992744743824005, + "step": 222520 + }, + { + "epoch": 0.9553677992152014, + "grad_norm": 0.124062679708004, + "learning_rate": 4.482895406293386e-06, + "loss": 0.2770715236663818, + "step": 222530 + }, + { + "epoch": 0.9554107313052214, + "grad_norm": 33.47114181518555, + "learning_rate": 4.478583686175764e-06, + "loss": 0.0699992299079895, + "step": 222540 + }, + { + "epoch": 0.9554536633952414, + "grad_norm": 0.008653360418975353, + "learning_rate": 4.47427196605814e-06, + "loss": 0.11284812688827514, + "step": 222550 + }, + { + "epoch": 0.9554965954852614, + "grad_norm": 2.8744757175445557, + "learning_rate": 4.469960245940516e-06, + "loss": 0.12507638931274415, + "step": 222560 + }, + { + "epoch": 0.9555395275752814, + "grad_norm": 5.595198154449463, + "learning_rate": 4.465648525822892e-06, + "loss": 0.347485089302063, + "step": 222570 + }, + { + "epoch": 0.9555824596653014, + "grad_norm": 0.33337855339050293, + "learning_rate": 4.461336805705269e-06, + "loss": 0.18478089570999146, + "step": 222580 + }, + { + "epoch": 0.9556253917553215, + "grad_norm": 0.020518135279417038, + "learning_rate": 4.4570250855876445e-06, + "loss": 0.3763843536376953, + "step": 222590 + }, + { + "epoch": 0.9556683238453414, + "grad_norm": 0.012903768569231033, + "learning_rate": 4.452713365470021e-06, + "loss": 0.140981125831604, + "step": 222600 + }, + { + "epoch": 0.9557112559353614, + "grad_norm": 1.3483617305755615, + "learning_rate": 4.4484016453523975e-06, + "loss": 0.16890804767608641, + "step": 222610 + }, + { + "epoch": 0.9557541880253815, + "grad_norm": 0.01349811814725399, + "learning_rate": 4.444089925234773e-06, + "loss": 0.18624815940856934, + "step": 222620 + }, + { + "epoch": 0.9557971201154014, + "grad_norm": 0.0021429001353681087, + "learning_rate": 4.43977820511715e-06, + "loss": 0.017654465138912202, + "step": 222630 + }, + { + "epoch": 0.9558400522054215, + "grad_norm": 68.33997344970703, + "learning_rate": 4.435466484999526e-06, + "loss": 0.11674298048019409, + "step": 222640 + }, + { + "epoch": 0.9558829842954415, + "grad_norm": 65.13331604003906, + "learning_rate": 4.431154764881902e-06, + "loss": 0.32192862033843994, + "step": 222650 + }, + { + "epoch": 0.9559259163854614, + "grad_norm": 0.043820153921842575, + "learning_rate": 4.426843044764278e-06, + "loss": 0.4090623378753662, + "step": 222660 + }, + { + "epoch": 0.9559688484754815, + "grad_norm": 0.0010055521270260215, + "learning_rate": 4.422531324646655e-06, + "loss": 0.28583674430847167, + "step": 222670 + }, + { + "epoch": 0.9560117805655015, + "grad_norm": 1.6560944318771362, + "learning_rate": 4.418219604529031e-06, + "loss": 0.21666302680969238, + "step": 222680 + }, + { + "epoch": 0.9560547126555214, + "grad_norm": 0.0015214415034279227, + "learning_rate": 4.413907884411407e-06, + "loss": 0.12489688396453857, + "step": 222690 + }, + { + "epoch": 0.9560976447455415, + "grad_norm": 0.20916807651519775, + "learning_rate": 4.409596164293783e-06, + "loss": 0.23923103809356688, + "step": 222700 + }, + { + "epoch": 0.9561405768355615, + "grad_norm": 0.029555104672908783, + "learning_rate": 4.40528444417616e-06, + "loss": 0.1707392930984497, + "step": 222710 + }, + { + "epoch": 0.9561835089255815, + "grad_norm": 0.004484428558498621, + "learning_rate": 4.4009727240585355e-06, + "loss": 0.06875411272048951, + "step": 222720 + }, + { + "epoch": 0.9562264410156015, + "grad_norm": 1.1642791032791138, + "learning_rate": 4.396661003940913e-06, + "loss": 0.33511579036712646, + "step": 222730 + }, + { + "epoch": 0.9562693731056215, + "grad_norm": 0.017194673418998718, + "learning_rate": 4.392349283823289e-06, + "loss": 0.13210066556930541, + "step": 222740 + }, + { + "epoch": 0.9563123051956416, + "grad_norm": 0.002511198166757822, + "learning_rate": 4.388037563705665e-06, + "loss": 0.24065072536468507, + "step": 222750 + }, + { + "epoch": 0.9563552372856615, + "grad_norm": 0.0038884340319782495, + "learning_rate": 4.3837258435880415e-06, + "loss": 0.3353621006011963, + "step": 222760 + }, + { + "epoch": 0.9563981693756816, + "grad_norm": 0.8704876899719238, + "learning_rate": 4.379414123470418e-06, + "loss": 0.20678813457489015, + "step": 222770 + }, + { + "epoch": 0.9564411014657016, + "grad_norm": 0.02445058897137642, + "learning_rate": 4.375102403352794e-06, + "loss": 0.15792833566665648, + "step": 222780 + }, + { + "epoch": 0.9564840335557215, + "grad_norm": 0.11866077035665512, + "learning_rate": 4.37079068323517e-06, + "loss": 0.21752674579620362, + "step": 222790 + }, + { + "epoch": 0.9565269656457416, + "grad_norm": 0.3736298084259033, + "learning_rate": 4.366478963117547e-06, + "loss": 0.1535860538482666, + "step": 222800 + }, + { + "epoch": 0.9565698977357616, + "grad_norm": 2.179549217224121, + "learning_rate": 4.362167242999922e-06, + "loss": 0.2848331928253174, + "step": 222810 + }, + { + "epoch": 0.9566128298257816, + "grad_norm": 1.7065210342407227, + "learning_rate": 4.357855522882299e-06, + "loss": 0.11012871265411377, + "step": 222820 + }, + { + "epoch": 0.9566557619158016, + "grad_norm": 4.045711517333984, + "learning_rate": 4.353543802764675e-06, + "loss": 0.2910693407058716, + "step": 222830 + }, + { + "epoch": 0.9566986940058216, + "grad_norm": 0.0030369660817086697, + "learning_rate": 4.349232082647051e-06, + "loss": 0.07584235668182374, + "step": 222840 + }, + { + "epoch": 0.9567416260958416, + "grad_norm": 0.20301000773906708, + "learning_rate": 4.344920362529427e-06, + "loss": 0.15026159286499025, + "step": 222850 + }, + { + "epoch": 0.9567845581858616, + "grad_norm": 2.4597878456115723, + "learning_rate": 4.340608642411804e-06, + "loss": 0.16425200700759887, + "step": 222860 + }, + { + "epoch": 0.9568274902758817, + "grad_norm": 0.026504697278141975, + "learning_rate": 4.33629692229418e-06, + "loss": 0.37502100467681887, + "step": 222870 + }, + { + "epoch": 0.9568704223659016, + "grad_norm": 0.7608723044395447, + "learning_rate": 4.331985202176556e-06, + "loss": 0.17374789714813232, + "step": 222880 + }, + { + "epoch": 0.9569133544559216, + "grad_norm": 0.26648297905921936, + "learning_rate": 4.3276734820589326e-06, + "loss": 0.19921503067016602, + "step": 222890 + }, + { + "epoch": 0.9569562865459417, + "grad_norm": 1.9915199279785156, + "learning_rate": 4.323361761941309e-06, + "loss": 0.25008134841918944, + "step": 222900 + }, + { + "epoch": 0.9569992186359616, + "grad_norm": 0.9826081395149231, + "learning_rate": 4.3190500418236856e-06, + "loss": 0.3138739824295044, + "step": 222910 + }, + { + "epoch": 0.9570421507259816, + "grad_norm": 0.041718751192092896, + "learning_rate": 4.314738321706062e-06, + "loss": 0.0015770439058542252, + "step": 222920 + }, + { + "epoch": 0.9570850828160017, + "grad_norm": 1.591940999031067, + "learning_rate": 4.3104266015884385e-06, + "loss": 0.1963658571243286, + "step": 222930 + }, + { + "epoch": 0.9571280149060216, + "grad_norm": 2.0966057777404785, + "learning_rate": 4.306114881470814e-06, + "loss": 0.42662744522094725, + "step": 222940 + }, + { + "epoch": 0.9571709469960417, + "grad_norm": 0.013677514158189297, + "learning_rate": 4.301803161353191e-06, + "loss": 0.40020246505737306, + "step": 222950 + }, + { + "epoch": 0.9572138790860617, + "grad_norm": 2.0066123008728027, + "learning_rate": 4.297491441235567e-06, + "loss": 0.15436534881591796, + "step": 222960 + }, + { + "epoch": 0.9572568111760816, + "grad_norm": 21.6331729888916, + "learning_rate": 4.293179721117943e-06, + "loss": 0.24480834007263183, + "step": 222970 + }, + { + "epoch": 0.9572997432661017, + "grad_norm": 4.920427322387695, + "learning_rate": 4.288868001000319e-06, + "loss": 0.27158007621765134, + "step": 222980 + }, + { + "epoch": 0.9573426753561217, + "grad_norm": 48.925804138183594, + "learning_rate": 4.284556280882696e-06, + "loss": 0.2037571907043457, + "step": 222990 + }, + { + "epoch": 0.9573856074461417, + "grad_norm": 0.005673081614077091, + "learning_rate": 4.2802445607650715e-06, + "loss": 0.19406213760375976, + "step": 223000 + }, + { + "epoch": 0.9573856074461417, + "eval_loss": 0.36744725704193115, + "eval_runtime": 27.5922, + "eval_samples_per_second": 3.624, + "eval_steps_per_second": 3.624, + "step": 223000 + }, + { + "epoch": 0.9574285395361617, + "grad_norm": 2.244419574737549, + "learning_rate": 4.275932840647448e-06, + "loss": 0.15834920406341552, + "step": 223010 + }, + { + "epoch": 0.9574714716261817, + "grad_norm": 0.05991288647055626, + "learning_rate": 4.2716211205298245e-06, + "loss": 0.3904329061508179, + "step": 223020 + }, + { + "epoch": 0.9575144037162017, + "grad_norm": 0.5041655898094177, + "learning_rate": 4.2673094004122e-06, + "loss": 0.15431556701660157, + "step": 223030 + }, + { + "epoch": 0.9575573358062217, + "grad_norm": 0.8797579407691956, + "learning_rate": 4.262997680294577e-06, + "loss": 0.28423237800598145, + "step": 223040 + }, + { + "epoch": 0.9576002678962418, + "grad_norm": 0.947993814945221, + "learning_rate": 4.258685960176953e-06, + "loss": 0.14467850923538209, + "step": 223050 + }, + { + "epoch": 0.9576431999862617, + "grad_norm": 1.3202130794525146, + "learning_rate": 4.25437424005933e-06, + "loss": 0.2532930612564087, + "step": 223060 + }, + { + "epoch": 0.9576861320762817, + "grad_norm": 0.4150962233543396, + "learning_rate": 4.250062519941705e-06, + "loss": 0.10975308418273926, + "step": 223070 + }, + { + "epoch": 0.9577290641663018, + "grad_norm": 0.031912241131067276, + "learning_rate": 4.245750799824082e-06, + "loss": 0.08250230550765991, + "step": 223080 + }, + { + "epoch": 0.9577719962563217, + "grad_norm": 0.004118985962122679, + "learning_rate": 4.241439079706458e-06, + "loss": 0.13309333324432374, + "step": 223090 + }, + { + "epoch": 0.9578149283463417, + "grad_norm": 0.004930357448756695, + "learning_rate": 4.237127359588835e-06, + "loss": 0.11556535959243774, + "step": 223100 + }, + { + "epoch": 0.9578578604363618, + "grad_norm": 16.858257293701172, + "learning_rate": 4.232815639471211e-06, + "loss": 0.26987152099609374, + "step": 223110 + }, + { + "epoch": 0.9579007925263817, + "grad_norm": 0.012583344243466854, + "learning_rate": 4.228503919353588e-06, + "loss": 0.2191478729248047, + "step": 223120 + }, + { + "epoch": 0.9579437246164018, + "grad_norm": 1.26869535446167, + "learning_rate": 4.224192199235963e-06, + "loss": 0.24034445285797118, + "step": 223130 + }, + { + "epoch": 0.9579866567064218, + "grad_norm": 1.5905871391296387, + "learning_rate": 4.21988047911834e-06, + "loss": 0.24000306129455568, + "step": 223140 + }, + { + "epoch": 0.9580295887964417, + "grad_norm": 1.7036107778549194, + "learning_rate": 4.215568759000716e-06, + "loss": 0.10172114372253419, + "step": 223150 + }, + { + "epoch": 0.9580725208864618, + "grad_norm": 0.03540444001555443, + "learning_rate": 4.211257038883092e-06, + "loss": 0.09440221190452576, + "step": 223160 + }, + { + "epoch": 0.9581154529764818, + "grad_norm": 1.9591245651245117, + "learning_rate": 4.2069453187654685e-06, + "loss": 0.21786725521087646, + "step": 223170 + }, + { + "epoch": 0.9581583850665019, + "grad_norm": 1.5034193992614746, + "learning_rate": 4.202633598647845e-06, + "loss": 0.25783910751342776, + "step": 223180 + }, + { + "epoch": 0.9582013171565218, + "grad_norm": 0.06697037070989609, + "learning_rate": 4.198321878530221e-06, + "loss": 0.1745791792869568, + "step": 223190 + }, + { + "epoch": 0.9582442492465418, + "grad_norm": 0.010242069140076637, + "learning_rate": 4.194010158412597e-06, + "loss": 0.38800320625305174, + "step": 223200 + }, + { + "epoch": 0.9582871813365619, + "grad_norm": 1.9304941892623901, + "learning_rate": 4.189698438294974e-06, + "loss": 0.5144011974334717, + "step": 223210 + }, + { + "epoch": 0.9583301134265818, + "grad_norm": 0.24945738911628723, + "learning_rate": 4.185386718177349e-06, + "loss": 0.1446653962135315, + "step": 223220 + }, + { + "epoch": 0.9583730455166019, + "grad_norm": 0.09023217856884003, + "learning_rate": 4.181074998059726e-06, + "loss": 0.10228971242904664, + "step": 223230 + }, + { + "epoch": 0.9584159776066219, + "grad_norm": 0.42283934354782104, + "learning_rate": 4.176763277942102e-06, + "loss": 0.19438369274139405, + "step": 223240 + }, + { + "epoch": 0.9584589096966418, + "grad_norm": 0.002236647065728903, + "learning_rate": 4.172451557824479e-06, + "loss": 0.2513619899749756, + "step": 223250 + }, + { + "epoch": 0.9585018417866619, + "grad_norm": 3.2790701389312744, + "learning_rate": 4.1681398377068544e-06, + "loss": 0.5641608238220215, + "step": 223260 + }, + { + "epoch": 0.9585447738766819, + "grad_norm": 1.5386213064193726, + "learning_rate": 4.163828117589232e-06, + "loss": 0.23426687717437744, + "step": 223270 + }, + { + "epoch": 0.9585877059667018, + "grad_norm": 0.5800334811210632, + "learning_rate": 4.1595163974716074e-06, + "loss": 0.12683658599853515, + "step": 223280 + }, + { + "epoch": 0.9586306380567219, + "grad_norm": 0.014845290221273899, + "learning_rate": 4.155204677353984e-06, + "loss": 0.07357310652732849, + "step": 223290 + }, + { + "epoch": 0.9586735701467419, + "grad_norm": 0.0030349476728588343, + "learning_rate": 4.15089295723636e-06, + "loss": 0.14453049898147582, + "step": 223300 + }, + { + "epoch": 0.9587165022367619, + "grad_norm": 0.0032269014045596123, + "learning_rate": 4.146581237118737e-06, + "loss": 0.120048987865448, + "step": 223310 + }, + { + "epoch": 0.9587594343267819, + "grad_norm": 0.016529643908143044, + "learning_rate": 4.1422695170011126e-06, + "loss": 0.11575425863265991, + "step": 223320 + }, + { + "epoch": 0.9588023664168019, + "grad_norm": 2.5239858627319336, + "learning_rate": 4.137957796883489e-06, + "loss": 0.2145371913909912, + "step": 223330 + }, + { + "epoch": 0.9588452985068219, + "grad_norm": 1.4063358306884766, + "learning_rate": 4.1336460767658656e-06, + "loss": 0.25419816970825193, + "step": 223340 + }, + { + "epoch": 0.9588882305968419, + "grad_norm": 1.2896349430084229, + "learning_rate": 4.129334356648241e-06, + "loss": 0.1854541301727295, + "step": 223350 + }, + { + "epoch": 0.958931162686862, + "grad_norm": 45.9904670715332, + "learning_rate": 4.125022636530618e-06, + "loss": 0.1698264718055725, + "step": 223360 + }, + { + "epoch": 0.9589740947768819, + "grad_norm": 0.11553294956684113, + "learning_rate": 4.120710916412994e-06, + "loss": 0.307094144821167, + "step": 223370 + }, + { + "epoch": 0.9590170268669019, + "grad_norm": 0.1716066598892212, + "learning_rate": 4.11639919629537e-06, + "loss": 0.09169681668281555, + "step": 223380 + }, + { + "epoch": 0.959059958956922, + "grad_norm": 0.0001855223235907033, + "learning_rate": 4.112087476177746e-06, + "loss": 0.07396456003189086, + "step": 223390 + }, + { + "epoch": 0.9591028910469419, + "grad_norm": 0.0005895741633139551, + "learning_rate": 4.107775756060123e-06, + "loss": 0.20217819213867189, + "step": 223400 + }, + { + "epoch": 0.959145823136962, + "grad_norm": 0.0007842977647669613, + "learning_rate": 4.1034640359424985e-06, + "loss": 0.3810096502304077, + "step": 223410 + }, + { + "epoch": 0.959188755226982, + "grad_norm": 0.07560084015130997, + "learning_rate": 4.099152315824875e-06, + "loss": 0.15365735292434693, + "step": 223420 + }, + { + "epoch": 0.9592316873170019, + "grad_norm": 2.457524061203003, + "learning_rate": 4.0948405957072515e-06, + "loss": 0.2020556926727295, + "step": 223430 + }, + { + "epoch": 0.959274619407022, + "grad_norm": 0.02854176051914692, + "learning_rate": 4.090528875589628e-06, + "loss": 0.2204937219619751, + "step": 223440 + }, + { + "epoch": 0.959317551497042, + "grad_norm": 1.960445523262024, + "learning_rate": 4.0862171554720045e-06, + "loss": 0.28523755073547363, + "step": 223450 + }, + { + "epoch": 0.9593604835870619, + "grad_norm": 3.8729004859924316, + "learning_rate": 4.081905435354381e-06, + "loss": 0.3639643669128418, + "step": 223460 + }, + { + "epoch": 0.959403415677082, + "grad_norm": 2.3094053268432617, + "learning_rate": 4.077593715236757e-06, + "loss": 0.18904602527618408, + "step": 223470 + }, + { + "epoch": 0.959446347767102, + "grad_norm": 0.011753720231354237, + "learning_rate": 4.073281995119133e-06, + "loss": 0.21364269256591797, + "step": 223480 + }, + { + "epoch": 0.959489279857122, + "grad_norm": 0.005512732081115246, + "learning_rate": 4.06897027500151e-06, + "loss": 0.09811052083969116, + "step": 223490 + }, + { + "epoch": 0.959532211947142, + "grad_norm": 2.5590529441833496, + "learning_rate": 4.064658554883886e-06, + "loss": 0.12399983406066895, + "step": 223500 + }, + { + "epoch": 0.959575144037162, + "grad_norm": 0.13864728808403015, + "learning_rate": 4.060346834766262e-06, + "loss": 0.24441328048706054, + "step": 223510 + }, + { + "epoch": 0.959618076127182, + "grad_norm": 0.00043273199116811156, + "learning_rate": 4.056035114648638e-06, + "loss": 0.21513741016387938, + "step": 223520 + }, + { + "epoch": 0.959661008217202, + "grad_norm": 4.524774074554443, + "learning_rate": 4.051723394531015e-06, + "loss": 0.2797492504119873, + "step": 223530 + }, + { + "epoch": 0.9597039403072221, + "grad_norm": 1.4905755519866943, + "learning_rate": 4.04741167441339e-06, + "loss": 0.3571767330169678, + "step": 223540 + }, + { + "epoch": 0.959746872397242, + "grad_norm": 0.12812760472297668, + "learning_rate": 4.043099954295767e-06, + "loss": 0.04512379467487335, + "step": 223550 + }, + { + "epoch": 0.959789804487262, + "grad_norm": 1.246614694595337, + "learning_rate": 4.038788234178143e-06, + "loss": 0.08907458782196045, + "step": 223560 + }, + { + "epoch": 0.9598327365772821, + "grad_norm": 0.001846889266744256, + "learning_rate": 4.034476514060519e-06, + "loss": 0.17755554914474486, + "step": 223570 + }, + { + "epoch": 0.959875668667302, + "grad_norm": 0.11436885595321655, + "learning_rate": 4.0301647939428955e-06, + "loss": 0.12528493404388427, + "step": 223580 + }, + { + "epoch": 0.959918600757322, + "grad_norm": 0.002565787872299552, + "learning_rate": 4.025853073825272e-06, + "loss": 0.06130242347717285, + "step": 223590 + }, + { + "epoch": 0.9599615328473421, + "grad_norm": 1.4368469715118408, + "learning_rate": 4.021541353707648e-06, + "loss": 0.2077068567276001, + "step": 223600 + }, + { + "epoch": 0.9600044649373621, + "grad_norm": 0.19996072351932526, + "learning_rate": 4.017229633590024e-06, + "loss": 0.20007028579711914, + "step": 223610 + }, + { + "epoch": 0.9600473970273821, + "grad_norm": 11.39544677734375, + "learning_rate": 4.012917913472401e-06, + "loss": 0.1851373314857483, + "step": 223620 + }, + { + "epoch": 0.9600903291174021, + "grad_norm": 0.0127298878505826, + "learning_rate": 4.008606193354777e-06, + "loss": 0.17328726053237914, + "step": 223630 + }, + { + "epoch": 0.9601332612074222, + "grad_norm": 0.060895953327417374, + "learning_rate": 4.004294473237154e-06, + "loss": 0.3138414859771729, + "step": 223640 + }, + { + "epoch": 0.9601761932974421, + "grad_norm": 0.009014183655381203, + "learning_rate": 3.99998275311953e-06, + "loss": 0.19367181062698363, + "step": 223650 + }, + { + "epoch": 0.9602191253874621, + "grad_norm": 2.9843554496765137, + "learning_rate": 3.995671033001906e-06, + "loss": 0.3071397304534912, + "step": 223660 + }, + { + "epoch": 0.9602620574774822, + "grad_norm": 0.016983680427074432, + "learning_rate": 3.991359312884282e-06, + "loss": 0.210011625289917, + "step": 223670 + }, + { + "epoch": 0.9603049895675021, + "grad_norm": 0.7923820614814758, + "learning_rate": 3.987047592766659e-06, + "loss": 0.3432178020477295, + "step": 223680 + }, + { + "epoch": 0.9603479216575221, + "grad_norm": 1.8482073545455933, + "learning_rate": 3.982735872649035e-06, + "loss": 0.4096959114074707, + "step": 223690 + }, + { + "epoch": 0.9603908537475422, + "grad_norm": 0.057525552809238434, + "learning_rate": 3.978424152531411e-06, + "loss": 0.10442249774932862, + "step": 223700 + }, + { + "epoch": 0.9604337858375621, + "grad_norm": 1.0683971643447876, + "learning_rate": 3.9741124324137874e-06, + "loss": 0.5931124687194824, + "step": 223710 + }, + { + "epoch": 0.9604767179275822, + "grad_norm": 0.08928456157445908, + "learning_rate": 3.969800712296164e-06, + "loss": 0.2801962375640869, + "step": 223720 + }, + { + "epoch": 0.9605196500176022, + "grad_norm": 0.08087646961212158, + "learning_rate": 3.96548899217854e-06, + "loss": 0.20012621879577636, + "step": 223730 + }, + { + "epoch": 0.9605625821076221, + "grad_norm": 0.01258047018200159, + "learning_rate": 3.961177272060916e-06, + "loss": 0.15536543130874633, + "step": 223740 + }, + { + "epoch": 0.9606055141976422, + "grad_norm": 1.7692468166351318, + "learning_rate": 3.9568655519432926e-06, + "loss": 0.2260221004486084, + "step": 223750 + }, + { + "epoch": 0.9606484462876622, + "grad_norm": 0.01273643970489502, + "learning_rate": 3.952553831825668e-06, + "loss": 0.2594797134399414, + "step": 223760 + }, + { + "epoch": 0.9606913783776821, + "grad_norm": 1.927130937576294, + "learning_rate": 3.948242111708045e-06, + "loss": 0.18560283184051513, + "step": 223770 + }, + { + "epoch": 0.9607343104677022, + "grad_norm": 1.519124150276184, + "learning_rate": 3.943930391590421e-06, + "loss": 0.11475672721862792, + "step": 223780 + }, + { + "epoch": 0.9607772425577222, + "grad_norm": 0.023710211738944054, + "learning_rate": 3.939618671472797e-06, + "loss": 0.2021331548690796, + "step": 223790 + }, + { + "epoch": 0.9608201746477422, + "grad_norm": 0.036301396787166595, + "learning_rate": 3.935306951355173e-06, + "loss": 0.1153161883354187, + "step": 223800 + }, + { + "epoch": 0.9608631067377622, + "grad_norm": 0.016259074211120605, + "learning_rate": 3.93099523123755e-06, + "loss": 0.1772770404815674, + "step": 223810 + }, + { + "epoch": 0.9609060388277822, + "grad_norm": 1.032292366027832, + "learning_rate": 3.926683511119926e-06, + "loss": 0.06174069046974182, + "step": 223820 + }, + { + "epoch": 0.9609489709178022, + "grad_norm": 0.0033859829418361187, + "learning_rate": 3.922371791002303e-06, + "loss": 0.014263886213302612, + "step": 223830 + }, + { + "epoch": 0.9609919030078222, + "grad_norm": 8.165559768676758, + "learning_rate": 3.918060070884679e-06, + "loss": 0.40694475173950195, + "step": 223840 + }, + { + "epoch": 0.9610348350978423, + "grad_norm": 0.003001274075359106, + "learning_rate": 3.913748350767055e-06, + "loss": 0.09890034198760986, + "step": 223850 + }, + { + "epoch": 0.9610777671878622, + "grad_norm": 5.088690280914307, + "learning_rate": 3.9094366306494315e-06, + "loss": 0.09827024936676025, + "step": 223860 + }, + { + "epoch": 0.9611206992778822, + "grad_norm": 3.8880326747894287, + "learning_rate": 3.905124910531808e-06, + "loss": 0.40003390312194825, + "step": 223870 + }, + { + "epoch": 0.9611636313679023, + "grad_norm": 0.08274823427200317, + "learning_rate": 3.9008131904141845e-06, + "loss": 0.1569799542427063, + "step": 223880 + }, + { + "epoch": 0.9612065634579222, + "grad_norm": 0.0208677276968956, + "learning_rate": 3.89650147029656e-06, + "loss": 0.07912614941596985, + "step": 223890 + }, + { + "epoch": 0.9612494955479423, + "grad_norm": 0.006856684572994709, + "learning_rate": 3.892189750178937e-06, + "loss": 0.1457221269607544, + "step": 223900 + }, + { + "epoch": 0.9612924276379623, + "grad_norm": 0.0018087215721607208, + "learning_rate": 3.887878030061313e-06, + "loss": 0.0596615195274353, + "step": 223910 + }, + { + "epoch": 0.9613353597279822, + "grad_norm": 2.4244134426116943, + "learning_rate": 3.883566309943689e-06, + "loss": 0.07604253888130189, + "step": 223920 + }, + { + "epoch": 0.9613782918180023, + "grad_norm": 1.4929369688034058, + "learning_rate": 3.879254589826065e-06, + "loss": 0.24013571739196776, + "step": 223930 + }, + { + "epoch": 0.9614212239080223, + "grad_norm": 0.011781311593949795, + "learning_rate": 3.874942869708442e-06, + "loss": 0.20817484855651855, + "step": 223940 + }, + { + "epoch": 0.9614641559980422, + "grad_norm": 0.06727916747331619, + "learning_rate": 3.870631149590817e-06, + "loss": 0.003799154236912727, + "step": 223950 + }, + { + "epoch": 0.9615070880880623, + "grad_norm": 3.397916316986084, + "learning_rate": 3.866319429473194e-06, + "loss": 0.049684774875640866, + "step": 223960 + }, + { + "epoch": 0.9615500201780823, + "grad_norm": 0.789635181427002, + "learning_rate": 3.86200770935557e-06, + "loss": 0.1622435212135315, + "step": 223970 + }, + { + "epoch": 0.9615929522681023, + "grad_norm": 0.0496024563908577, + "learning_rate": 3.857695989237947e-06, + "loss": 0.28729078769683836, + "step": 223980 + }, + { + "epoch": 0.9616358843581223, + "grad_norm": 0.3095422089099884, + "learning_rate": 3.8533842691203225e-06, + "loss": 0.21515951156616211, + "step": 223990 + }, + { + "epoch": 0.9616788164481423, + "grad_norm": 0.009204866364598274, + "learning_rate": 3.8490725490027e-06, + "loss": 0.12539956569671631, + "step": 224000 + }, + { + "epoch": 0.9616788164481423, + "eval_loss": 0.3682700991630554, + "eval_runtime": 27.4644, + "eval_samples_per_second": 3.641, + "eval_steps_per_second": 3.641, + "step": 224000 + }, + { + "epoch": 0.9617217485381623, + "grad_norm": 2.353421449661255, + "learning_rate": 3.8447608288850755e-06, + "loss": 0.21565442085266112, + "step": 224010 + }, + { + "epoch": 0.9617646806281823, + "grad_norm": 0.01283611822873354, + "learning_rate": 3.840449108767452e-06, + "loss": 0.202630352973938, + "step": 224020 + }, + { + "epoch": 0.9618076127182024, + "grad_norm": 0.0016208564629778266, + "learning_rate": 3.8361373886498285e-06, + "loss": 0.16334728002548218, + "step": 224030 + }, + { + "epoch": 0.9618505448082224, + "grad_norm": 0.1348349153995514, + "learning_rate": 3.831825668532204e-06, + "loss": 0.13696320056915284, + "step": 224040 + }, + { + "epoch": 0.9618934768982423, + "grad_norm": 0.009350267238914967, + "learning_rate": 3.827513948414581e-06, + "loss": 0.20097155570983888, + "step": 224050 + }, + { + "epoch": 0.9619364089882624, + "grad_norm": 0.005436836276203394, + "learning_rate": 3.823202228296957e-06, + "loss": 0.016508430242538452, + "step": 224060 + }, + { + "epoch": 0.9619793410782824, + "grad_norm": 5.0872883796691895, + "learning_rate": 3.818890508179334e-06, + "loss": 0.09875186681747436, + "step": 224070 + }, + { + "epoch": 0.9620222731683024, + "grad_norm": 0.0039020462427288294, + "learning_rate": 3.8145787880617097e-06, + "loss": 0.12000479698181152, + "step": 224080 + }, + { + "epoch": 0.9620652052583224, + "grad_norm": 6.722840785980225, + "learning_rate": 3.810267067944086e-06, + "loss": 0.4321133613586426, + "step": 224090 + }, + { + "epoch": 0.9621081373483424, + "grad_norm": 0.001120436587370932, + "learning_rate": 3.805955347826462e-06, + "loss": 0.21240513324737548, + "step": 224100 + }, + { + "epoch": 0.9621510694383624, + "grad_norm": 0.03145579993724823, + "learning_rate": 3.8016436277088384e-06, + "loss": 0.19584509134292602, + "step": 224110 + }, + { + "epoch": 0.9621940015283824, + "grad_norm": 2.7041938304901123, + "learning_rate": 3.7973319075912145e-06, + "loss": 0.2914386510848999, + "step": 224120 + }, + { + "epoch": 0.9622369336184025, + "grad_norm": 0.040991175919771194, + "learning_rate": 3.7930201874735905e-06, + "loss": 0.1548812985420227, + "step": 224130 + }, + { + "epoch": 0.9622798657084224, + "grad_norm": 0.7310913801193237, + "learning_rate": 3.788708467355967e-06, + "loss": 0.0296543151140213, + "step": 224140 + }, + { + "epoch": 0.9623227977984424, + "grad_norm": 2.342799425125122, + "learning_rate": 3.784396747238343e-06, + "loss": 0.14656716585159302, + "step": 224150 + }, + { + "epoch": 0.9623657298884625, + "grad_norm": 2.133543014526367, + "learning_rate": 3.7800850271207196e-06, + "loss": 0.1531446695327759, + "step": 224160 + }, + { + "epoch": 0.9624086619784824, + "grad_norm": 3.5911266803741455, + "learning_rate": 3.7757733070030957e-06, + "loss": 0.3761185884475708, + "step": 224170 + }, + { + "epoch": 0.9624515940685024, + "grad_norm": 1.0610127449035645, + "learning_rate": 3.7714615868854726e-06, + "loss": 0.16036534309387207, + "step": 224180 + }, + { + "epoch": 0.9624945261585225, + "grad_norm": 0.003502331208437681, + "learning_rate": 3.7671498667678487e-06, + "loss": 0.13411206007003784, + "step": 224190 + }, + { + "epoch": 0.9625374582485424, + "grad_norm": 0.00993330217897892, + "learning_rate": 3.762838146650225e-06, + "loss": 0.37001564502716067, + "step": 224200 + }, + { + "epoch": 0.9625803903385625, + "grad_norm": 1.013261079788208, + "learning_rate": 3.7585264265326012e-06, + "loss": 0.3331045627593994, + "step": 224210 + }, + { + "epoch": 0.9626233224285825, + "grad_norm": 0.04176459461450577, + "learning_rate": 3.7542147064149777e-06, + "loss": 0.04846885502338409, + "step": 224220 + }, + { + "epoch": 0.9626662545186024, + "grad_norm": 0.0031787080224603415, + "learning_rate": 3.749902986297354e-06, + "loss": 0.18654595613479613, + "step": 224230 + }, + { + "epoch": 0.9627091866086225, + "grad_norm": 0.7569457292556763, + "learning_rate": 3.74559126617973e-06, + "loss": 0.18365461826324464, + "step": 224240 + }, + { + "epoch": 0.9627521186986425, + "grad_norm": 0.019828135147690773, + "learning_rate": 3.7412795460621064e-06, + "loss": 0.06259853839874267, + "step": 224250 + }, + { + "epoch": 0.9627950507886625, + "grad_norm": 0.0054312837310135365, + "learning_rate": 3.7369678259444824e-06, + "loss": 0.17033387422561647, + "step": 224260 + }, + { + "epoch": 0.9628379828786825, + "grad_norm": 2.7154340744018555, + "learning_rate": 3.732656105826859e-06, + "loss": 0.042243891954422, + "step": 224270 + }, + { + "epoch": 0.9628809149687025, + "grad_norm": 0.9128412008285522, + "learning_rate": 3.728344385709235e-06, + "loss": 0.05694934129714966, + "step": 224280 + }, + { + "epoch": 0.9629238470587225, + "grad_norm": 1.8611356019973755, + "learning_rate": 3.724032665591611e-06, + "loss": 0.32024905681610105, + "step": 224290 + }, + { + "epoch": 0.9629667791487425, + "grad_norm": 0.0045630838721990585, + "learning_rate": 3.7197209454739876e-06, + "loss": 0.14713085889816285, + "step": 224300 + }, + { + "epoch": 0.9630097112387626, + "grad_norm": 0.05896364524960518, + "learning_rate": 3.7154092253563636e-06, + "loss": 0.11309947967529296, + "step": 224310 + }, + { + "epoch": 0.9630526433287825, + "grad_norm": 3.036715507507324, + "learning_rate": 3.7110975052387397e-06, + "loss": 0.13869774341583252, + "step": 224320 + }, + { + "epoch": 0.9630955754188025, + "grad_norm": 0.044732533395290375, + "learning_rate": 3.706785785121116e-06, + "loss": 0.1689812183380127, + "step": 224330 + }, + { + "epoch": 0.9631385075088226, + "grad_norm": 0.0014035659842193127, + "learning_rate": 3.7024740650034923e-06, + "loss": 0.0963472604751587, + "step": 224340 + }, + { + "epoch": 0.9631814395988425, + "grad_norm": 1.8297466039657593, + "learning_rate": 3.6981623448858688e-06, + "loss": 0.31110439300537107, + "step": 224350 + }, + { + "epoch": 0.9632243716888625, + "grad_norm": 0.05191795900464058, + "learning_rate": 3.6938506247682457e-06, + "loss": 0.10177983045578003, + "step": 224360 + }, + { + "epoch": 0.9632673037788826, + "grad_norm": 1.1834299564361572, + "learning_rate": 3.6895389046506218e-06, + "loss": 0.1933537244796753, + "step": 224370 + }, + { + "epoch": 0.9633102358689025, + "grad_norm": 0.16819176077842712, + "learning_rate": 3.685227184532998e-06, + "loss": 0.1773150682449341, + "step": 224380 + }, + { + "epoch": 0.9633531679589226, + "grad_norm": 0.02834729105234146, + "learning_rate": 3.6809154644153743e-06, + "loss": 0.13921488523483277, + "step": 224390 + }, + { + "epoch": 0.9633961000489426, + "grad_norm": 0.4217594861984253, + "learning_rate": 3.6766037442977504e-06, + "loss": 0.24627528190612794, + "step": 224400 + }, + { + "epoch": 0.9634390321389625, + "grad_norm": 0.05298386886715889, + "learning_rate": 3.672292024180127e-06, + "loss": 0.12269526720046997, + "step": 224410 + }, + { + "epoch": 0.9634819642289826, + "grad_norm": 2.3080687522888184, + "learning_rate": 3.667980304062503e-06, + "loss": 0.16422743797302247, + "step": 224420 + }, + { + "epoch": 0.9635248963190026, + "grad_norm": 4.178096771240234, + "learning_rate": 3.663668583944879e-06, + "loss": 0.17461690902709961, + "step": 224430 + }, + { + "epoch": 0.9635678284090226, + "grad_norm": 2.450517177581787, + "learning_rate": 3.6593568638272555e-06, + "loss": 0.18653972148895265, + "step": 224440 + }, + { + "epoch": 0.9636107604990426, + "grad_norm": 0.036486510187387466, + "learning_rate": 3.6550451437096316e-06, + "loss": 0.19325416088104247, + "step": 224450 + }, + { + "epoch": 0.9636536925890626, + "grad_norm": 0.03563200309872627, + "learning_rate": 3.650733423592008e-06, + "loss": 0.1459072709083557, + "step": 224460 + }, + { + "epoch": 0.9636966246790827, + "grad_norm": 0.25156867504119873, + "learning_rate": 3.646421703474384e-06, + "loss": 0.1999955177307129, + "step": 224470 + }, + { + "epoch": 0.9637395567691026, + "grad_norm": 8.842079162597656, + "learning_rate": 3.6421099833567603e-06, + "loss": 0.19552001953125, + "step": 224480 + }, + { + "epoch": 0.9637824888591227, + "grad_norm": 0.0028808624483644962, + "learning_rate": 3.6377982632391368e-06, + "loss": 0.04963212013244629, + "step": 224490 + }, + { + "epoch": 0.9638254209491427, + "grad_norm": 0.001806130982004106, + "learning_rate": 3.633486543121513e-06, + "loss": 0.15328739881515502, + "step": 224500 + }, + { + "epoch": 0.9638683530391626, + "grad_norm": 0.007961916737258434, + "learning_rate": 3.6291748230038893e-06, + "loss": 0.24264960289001464, + "step": 224510 + }, + { + "epoch": 0.9639112851291827, + "grad_norm": 1.3244423866271973, + "learning_rate": 3.6248631028862654e-06, + "loss": 0.28087844848632815, + "step": 224520 + }, + { + "epoch": 0.9639542172192027, + "grad_norm": 0.01275936421006918, + "learning_rate": 3.6205513827686415e-06, + "loss": 0.3498584032058716, + "step": 224530 + }, + { + "epoch": 0.9639971493092226, + "grad_norm": 0.0005850349552929401, + "learning_rate": 3.6162396626510184e-06, + "loss": 0.2803433656692505, + "step": 224540 + }, + { + "epoch": 0.9640400813992427, + "grad_norm": 0.00403195945546031, + "learning_rate": 3.611927942533395e-06, + "loss": 0.1901412010192871, + "step": 224550 + }, + { + "epoch": 0.9640830134892627, + "grad_norm": 0.9117140769958496, + "learning_rate": 3.607616222415771e-06, + "loss": 0.09261349439620972, + "step": 224560 + }, + { + "epoch": 0.9641259455792827, + "grad_norm": 7.377902030944824, + "learning_rate": 3.603304502298147e-06, + "loss": 0.39287283420562746, + "step": 224570 + }, + { + "epoch": 0.9641688776693027, + "grad_norm": 1.6168286800384521, + "learning_rate": 3.5989927821805235e-06, + "loss": 0.25367326736450196, + "step": 224580 + }, + { + "epoch": 0.9642118097593227, + "grad_norm": 3.83086895942688, + "learning_rate": 3.5946810620628996e-06, + "loss": 0.20805747509002687, + "step": 224590 + }, + { + "epoch": 0.9642547418493427, + "grad_norm": 0.015164944343268871, + "learning_rate": 3.590369341945276e-06, + "loss": 0.12288215160369872, + "step": 224600 + }, + { + "epoch": 0.9642976739393627, + "grad_norm": 0.22293995320796967, + "learning_rate": 3.586057621827652e-06, + "loss": 0.04461737871170044, + "step": 224610 + }, + { + "epoch": 0.9643406060293828, + "grad_norm": 3.296658992767334, + "learning_rate": 3.5817459017100282e-06, + "loss": 0.2322096824645996, + "step": 224620 + }, + { + "epoch": 0.9643835381194027, + "grad_norm": 5.496777057647705, + "learning_rate": 3.5774341815924047e-06, + "loss": 0.2097012996673584, + "step": 224630 + }, + { + "epoch": 0.9644264702094227, + "grad_norm": 3.9928760528564453, + "learning_rate": 3.573122461474781e-06, + "loss": 0.4075942516326904, + "step": 224640 + }, + { + "epoch": 0.9644694022994428, + "grad_norm": 0.23641102015972137, + "learning_rate": 3.5688107413571573e-06, + "loss": 0.16521319150924682, + "step": 224650 + }, + { + "epoch": 0.9645123343894627, + "grad_norm": 2.510756731033325, + "learning_rate": 3.5644990212395334e-06, + "loss": 0.35691146850585936, + "step": 224660 + }, + { + "epoch": 0.9645552664794828, + "grad_norm": 3.972184896469116, + "learning_rate": 3.5601873011219094e-06, + "loss": 0.17900394201278685, + "step": 224670 + }, + { + "epoch": 0.9645981985695028, + "grad_norm": 0.0042314473539590836, + "learning_rate": 3.555875581004286e-06, + "loss": 0.11929336786270142, + "step": 224680 + }, + { + "epoch": 0.9646411306595227, + "grad_norm": 6.589748859405518, + "learning_rate": 3.551563860886662e-06, + "loss": 0.2866029739379883, + "step": 224690 + }, + { + "epoch": 0.9646840627495428, + "grad_norm": 0.02086971141397953, + "learning_rate": 3.5472521407690385e-06, + "loss": 0.17334396839141847, + "step": 224700 + }, + { + "epoch": 0.9647269948395628, + "grad_norm": 2.366854190826416, + "learning_rate": 3.5429404206514146e-06, + "loss": 0.33515105247497556, + "step": 224710 + }, + { + "epoch": 0.9647699269295827, + "grad_norm": 0.06535419821739197, + "learning_rate": 3.5386287005337907e-06, + "loss": 0.1478777050971985, + "step": 224720 + }, + { + "epoch": 0.9648128590196028, + "grad_norm": 0.06608917564153671, + "learning_rate": 3.5343169804161676e-06, + "loss": 0.2516626358032227, + "step": 224730 + }, + { + "epoch": 0.9648557911096228, + "grad_norm": 0.3839847147464752, + "learning_rate": 3.530005260298544e-06, + "loss": 0.07610129714012145, + "step": 224740 + }, + { + "epoch": 0.9648987231996428, + "grad_norm": 0.0032831490971148014, + "learning_rate": 3.52569354018092e-06, + "loss": 0.2178466796875, + "step": 224750 + }, + { + "epoch": 0.9649416552896628, + "grad_norm": 1.180791974067688, + "learning_rate": 3.5213818200632962e-06, + "loss": 0.18217480182647705, + "step": 224760 + }, + { + "epoch": 0.9649845873796828, + "grad_norm": 0.0028181334491819143, + "learning_rate": 3.5170700999456727e-06, + "loss": 0.052039462327957156, + "step": 224770 + }, + { + "epoch": 0.9650275194697028, + "grad_norm": 0.01064180489629507, + "learning_rate": 3.5127583798280488e-06, + "loss": 0.0493350625038147, + "step": 224780 + }, + { + "epoch": 0.9650704515597228, + "grad_norm": 5.400768756866455, + "learning_rate": 3.5084466597104253e-06, + "loss": 0.3771092176437378, + "step": 224790 + }, + { + "epoch": 0.9651133836497429, + "grad_norm": 1.3995170593261719, + "learning_rate": 3.5041349395928013e-06, + "loss": 0.5951415538787842, + "step": 224800 + }, + { + "epoch": 0.9651563157397628, + "grad_norm": 0.006610509008169174, + "learning_rate": 3.4998232194751774e-06, + "loss": 0.1903270959854126, + "step": 224810 + }, + { + "epoch": 0.9651992478297828, + "grad_norm": 0.007922952994704247, + "learning_rate": 3.495511499357554e-06, + "loss": 0.17838799953460693, + "step": 224820 + }, + { + "epoch": 0.9652421799198029, + "grad_norm": 2.0023837089538574, + "learning_rate": 3.49119977923993e-06, + "loss": 0.14361563920974732, + "step": 224830 + }, + { + "epoch": 0.9652851120098228, + "grad_norm": 0.020767943933606148, + "learning_rate": 3.4868880591223065e-06, + "loss": 0.23522424697875977, + "step": 224840 + }, + { + "epoch": 0.9653280440998429, + "grad_norm": 0.014713380485773087, + "learning_rate": 3.4825763390046826e-06, + "loss": 0.19255516529083253, + "step": 224850 + }, + { + "epoch": 0.9653709761898629, + "grad_norm": 0.006427861750125885, + "learning_rate": 3.4782646188870586e-06, + "loss": 0.12834020853042602, + "step": 224860 + }, + { + "epoch": 0.9654139082798828, + "grad_norm": 0.9519116282463074, + "learning_rate": 3.473952898769435e-06, + "loss": 0.3461976289749146, + "step": 224870 + }, + { + "epoch": 0.9654568403699029, + "grad_norm": 0.017268039286136627, + "learning_rate": 3.469641178651811e-06, + "loss": 0.21045241355895997, + "step": 224880 + }, + { + "epoch": 0.9654997724599229, + "grad_norm": 0.013922857120633125, + "learning_rate": 3.4653294585341877e-06, + "loss": 0.0968497633934021, + "step": 224890 + }, + { + "epoch": 0.965542704549943, + "grad_norm": 0.28565648198127747, + "learning_rate": 3.4610177384165638e-06, + "loss": 0.06030838489532471, + "step": 224900 + }, + { + "epoch": 0.9655856366399629, + "grad_norm": 0.003438707906752825, + "learning_rate": 3.4567060182989407e-06, + "loss": 0.06618756055831909, + "step": 224910 + }, + { + "epoch": 0.9656285687299829, + "grad_norm": 0.03495679795742035, + "learning_rate": 3.4523942981813168e-06, + "loss": 0.2902189254760742, + "step": 224920 + }, + { + "epoch": 0.965671500820003, + "grad_norm": 1.1262989044189453, + "learning_rate": 3.4480825780636933e-06, + "loss": 0.17496834993362426, + "step": 224930 + }, + { + "epoch": 0.9657144329100229, + "grad_norm": 0.047534435987472534, + "learning_rate": 3.4437708579460693e-06, + "loss": 0.09073890447616577, + "step": 224940 + }, + { + "epoch": 0.9657573650000429, + "grad_norm": 1.0417522192001343, + "learning_rate": 3.439459137828446e-06, + "loss": 0.16527436971664428, + "step": 224950 + }, + { + "epoch": 0.965800297090063, + "grad_norm": 0.9508054256439209, + "learning_rate": 3.435147417710822e-06, + "loss": 0.22022783756256104, + "step": 224960 + }, + { + "epoch": 0.9658432291800829, + "grad_norm": 0.0005445060087367892, + "learning_rate": 3.430835697593198e-06, + "loss": 0.2978523731231689, + "step": 224970 + }, + { + "epoch": 0.965886161270103, + "grad_norm": 17.093509674072266, + "learning_rate": 3.4265239774755745e-06, + "loss": 0.05229206085205078, + "step": 224980 + }, + { + "epoch": 0.965929093360123, + "grad_norm": 0.005673188250511885, + "learning_rate": 3.4222122573579505e-06, + "loss": 0.1114910364151001, + "step": 224990 + }, + { + "epoch": 0.9659720254501429, + "grad_norm": 1.3230369091033936, + "learning_rate": 3.4179005372403266e-06, + "loss": 0.18673681020736693, + "step": 225000 + }, + { + "epoch": 0.9659720254501429, + "eval_loss": 0.3696173131465912, + "eval_runtime": 27.4602, + "eval_samples_per_second": 3.642, + "eval_steps_per_second": 3.642, + "step": 225000 + }, + { + "epoch": 0.966014957540163, + "grad_norm": 2.1282687187194824, + "learning_rate": 3.413588817122703e-06, + "loss": 0.20100622177124022, + "step": 225010 + }, + { + "epoch": 0.966057889630183, + "grad_norm": 3.6257035732269287, + "learning_rate": 3.409277097005079e-06, + "loss": 0.299635124206543, + "step": 225020 + }, + { + "epoch": 0.966100821720203, + "grad_norm": 2.9898667335510254, + "learning_rate": 3.4049653768874557e-06, + "loss": 0.14922019243240356, + "step": 225030 + }, + { + "epoch": 0.966143753810223, + "grad_norm": 0.0014072444755584002, + "learning_rate": 3.4006536567698317e-06, + "loss": 0.14536370038986207, + "step": 225040 + }, + { + "epoch": 0.966186685900243, + "grad_norm": 2.570472478866577, + "learning_rate": 3.396341936652208e-06, + "loss": 0.3303256034851074, + "step": 225050 + }, + { + "epoch": 0.966229617990263, + "grad_norm": 0.4072743356227875, + "learning_rate": 3.3920302165345843e-06, + "loss": 0.228645920753479, + "step": 225060 + }, + { + "epoch": 0.966272550080283, + "grad_norm": 0.035002633929252625, + "learning_rate": 3.3877184964169604e-06, + "loss": 0.25193569660186765, + "step": 225070 + }, + { + "epoch": 0.966315482170303, + "grad_norm": 2.990675687789917, + "learning_rate": 3.383406776299337e-06, + "loss": 0.23985731601715088, + "step": 225080 + }, + { + "epoch": 0.966358414260323, + "grad_norm": 0.006291983183473349, + "learning_rate": 3.379095056181714e-06, + "loss": 0.28874037265777586, + "step": 225090 + }, + { + "epoch": 0.966401346350343, + "grad_norm": 4.065964698791504, + "learning_rate": 3.37478333606409e-06, + "loss": 0.26277947425842285, + "step": 225100 + }, + { + "epoch": 0.9664442784403631, + "grad_norm": 0.004781167954206467, + "learning_rate": 3.370471615946466e-06, + "loss": 0.33129491806030276, + "step": 225110 + }, + { + "epoch": 0.966487210530383, + "grad_norm": 0.10862305015325546, + "learning_rate": 3.3661598958288424e-06, + "loss": 0.32240264415740966, + "step": 225120 + }, + { + "epoch": 0.966530142620403, + "grad_norm": 0.06308849155902863, + "learning_rate": 3.3618481757112185e-06, + "loss": 0.17908284664154053, + "step": 225130 + }, + { + "epoch": 0.9665730747104231, + "grad_norm": 1.10364830493927, + "learning_rate": 3.357536455593595e-06, + "loss": 0.3277503252029419, + "step": 225140 + }, + { + "epoch": 0.966616006800443, + "grad_norm": 6.852668762207031, + "learning_rate": 3.353224735475971e-06, + "loss": 0.23288826942443847, + "step": 225150 + }, + { + "epoch": 0.9666589388904631, + "grad_norm": 4.911060333251953, + "learning_rate": 3.348913015358347e-06, + "loss": 0.12526233196258546, + "step": 225160 + }, + { + "epoch": 0.9667018709804831, + "grad_norm": 0.010838578455150127, + "learning_rate": 3.3446012952407237e-06, + "loss": 0.17414969205856323, + "step": 225170 + }, + { + "epoch": 0.966744803070503, + "grad_norm": 0.016853701323270798, + "learning_rate": 3.3402895751230997e-06, + "loss": 0.3909905195236206, + "step": 225180 + }, + { + "epoch": 0.9667877351605231, + "grad_norm": 7.306344509124756, + "learning_rate": 3.335977855005476e-06, + "loss": 0.2299267292022705, + "step": 225190 + }, + { + "epoch": 0.9668306672505431, + "grad_norm": 0.004599553067237139, + "learning_rate": 3.3316661348878523e-06, + "loss": 0.21219370365142823, + "step": 225200 + }, + { + "epoch": 0.966873599340563, + "grad_norm": 1.972286343574524, + "learning_rate": 3.3273544147702284e-06, + "loss": 0.18740177154541016, + "step": 225210 + }, + { + "epoch": 0.9669165314305831, + "grad_norm": 2.4764668941497803, + "learning_rate": 3.323042694652605e-06, + "loss": 0.21578762531280518, + "step": 225220 + }, + { + "epoch": 0.9669594635206031, + "grad_norm": 0.6758376359939575, + "learning_rate": 3.318730974534981e-06, + "loss": 0.2908621788024902, + "step": 225230 + }, + { + "epoch": 0.9670023956106231, + "grad_norm": 0.005352508742362261, + "learning_rate": 3.314419254417357e-06, + "loss": 0.23948318958282472, + "step": 225240 + }, + { + "epoch": 0.9670453277006431, + "grad_norm": 5.595332622528076, + "learning_rate": 3.3101075342997335e-06, + "loss": 0.15925862789154052, + "step": 225250 + }, + { + "epoch": 0.9670882597906632, + "grad_norm": 0.034605976194143295, + "learning_rate": 3.3057958141821096e-06, + "loss": 0.02419239580631256, + "step": 225260 + }, + { + "epoch": 0.9671311918806831, + "grad_norm": 4.634537696838379, + "learning_rate": 3.3014840940644865e-06, + "loss": 0.15210497379302979, + "step": 225270 + }, + { + "epoch": 0.9671741239707031, + "grad_norm": 0.0017427064012736082, + "learning_rate": 3.297172373946863e-06, + "loss": 0.09682173728942871, + "step": 225280 + }, + { + "epoch": 0.9672170560607232, + "grad_norm": 0.02096271514892578, + "learning_rate": 3.292860653829239e-06, + "loss": 0.18717020750045776, + "step": 225290 + }, + { + "epoch": 0.9672599881507431, + "grad_norm": 0.21846190094947815, + "learning_rate": 3.288548933711615e-06, + "loss": 0.21091217994689943, + "step": 225300 + }, + { + "epoch": 0.9673029202407631, + "grad_norm": 0.03409862890839577, + "learning_rate": 3.2842372135939916e-06, + "loss": 0.2435748815536499, + "step": 225310 + }, + { + "epoch": 0.9673458523307832, + "grad_norm": 1.015627384185791, + "learning_rate": 3.2799254934763677e-06, + "loss": 0.2684969186782837, + "step": 225320 + }, + { + "epoch": 0.9673887844208032, + "grad_norm": 0.0285097174346447, + "learning_rate": 3.275613773358744e-06, + "loss": 0.06942356228828431, + "step": 225330 + }, + { + "epoch": 0.9674317165108232, + "grad_norm": 0.07428458333015442, + "learning_rate": 3.2713020532411203e-06, + "loss": 0.17607952356338502, + "step": 225340 + }, + { + "epoch": 0.9674746486008432, + "grad_norm": 0.018062138929963112, + "learning_rate": 3.2669903331234963e-06, + "loss": 0.19157010316848755, + "step": 225350 + }, + { + "epoch": 0.9675175806908632, + "grad_norm": 0.4948386251926422, + "learning_rate": 3.262678613005873e-06, + "loss": 0.15009280443191528, + "step": 225360 + }, + { + "epoch": 0.9675605127808832, + "grad_norm": 0.09620340168476105, + "learning_rate": 3.258366892888249e-06, + "loss": 0.2818382978439331, + "step": 225370 + }, + { + "epoch": 0.9676034448709032, + "grad_norm": 0.001367167104035616, + "learning_rate": 3.254055172770625e-06, + "loss": 0.2431161642074585, + "step": 225380 + }, + { + "epoch": 0.9676463769609233, + "grad_norm": 0.32261449098587036, + "learning_rate": 3.2497434526530015e-06, + "loss": 0.2842667818069458, + "step": 225390 + }, + { + "epoch": 0.9676893090509432, + "grad_norm": 1.915399432182312, + "learning_rate": 3.2454317325353776e-06, + "loss": 0.17604867219924927, + "step": 225400 + }, + { + "epoch": 0.9677322411409632, + "grad_norm": 0.08497922867536545, + "learning_rate": 3.241120012417754e-06, + "loss": 0.11107239723205567, + "step": 225410 + }, + { + "epoch": 0.9677751732309833, + "grad_norm": 2.0568411350250244, + "learning_rate": 3.23680829230013e-06, + "loss": 0.27031285762786866, + "step": 225420 + }, + { + "epoch": 0.9678181053210032, + "grad_norm": 0.01337936706840992, + "learning_rate": 3.232496572182506e-06, + "loss": 0.12987562417984008, + "step": 225430 + }, + { + "epoch": 0.9678610374110233, + "grad_norm": 3.4393444061279297, + "learning_rate": 3.2281848520648827e-06, + "loss": 0.13027408123016357, + "step": 225440 + }, + { + "epoch": 0.9679039695010433, + "grad_norm": 2.069347858428955, + "learning_rate": 3.2238731319472596e-06, + "loss": 0.15661873817443847, + "step": 225450 + }, + { + "epoch": 0.9679469015910632, + "grad_norm": 0.00041850711568258703, + "learning_rate": 3.2195614118296357e-06, + "loss": 0.15425716638565062, + "step": 225460 + }, + { + "epoch": 0.9679898336810833, + "grad_norm": 3.110551357269287, + "learning_rate": 3.215249691712012e-06, + "loss": 0.03208427131175995, + "step": 225470 + }, + { + "epoch": 0.9680327657711033, + "grad_norm": 0.019477995112538338, + "learning_rate": 3.2109379715943882e-06, + "loss": 0.15079782009124756, + "step": 225480 + }, + { + "epoch": 0.9680756978611232, + "grad_norm": 0.09103574603796005, + "learning_rate": 3.2066262514767643e-06, + "loss": 0.1436993360519409, + "step": 225490 + }, + { + "epoch": 0.9681186299511433, + "grad_norm": 1.1488316059112549, + "learning_rate": 3.202314531359141e-06, + "loss": 0.4554173946380615, + "step": 225500 + }, + { + "epoch": 0.9681615620411633, + "grad_norm": 8.552606582641602, + "learning_rate": 3.198002811241517e-06, + "loss": 0.25870747566223146, + "step": 225510 + }, + { + "epoch": 0.9682044941311833, + "grad_norm": 1.9646648168563843, + "learning_rate": 3.1936910911238934e-06, + "loss": 0.3105947732925415, + "step": 225520 + }, + { + "epoch": 0.9682474262212033, + "grad_norm": 0.2984846830368042, + "learning_rate": 3.1893793710062695e-06, + "loss": 0.12478234767913818, + "step": 225530 + }, + { + "epoch": 0.9682903583112233, + "grad_norm": 0.0017521742265671492, + "learning_rate": 3.1850676508886455e-06, + "loss": 0.1251183867454529, + "step": 225540 + }, + { + "epoch": 0.9683332904012433, + "grad_norm": 0.008583495393395424, + "learning_rate": 3.180755930771022e-06, + "loss": 0.08604643344879151, + "step": 225550 + }, + { + "epoch": 0.9683762224912633, + "grad_norm": 0.0018312825122848153, + "learning_rate": 3.176444210653398e-06, + "loss": 0.10744818449020385, + "step": 225560 + }, + { + "epoch": 0.9684191545812834, + "grad_norm": 0.009577479213476181, + "learning_rate": 3.172132490535774e-06, + "loss": 0.12377842664718627, + "step": 225570 + }, + { + "epoch": 0.9684620866713033, + "grad_norm": 0.0017507995944470167, + "learning_rate": 3.1678207704181507e-06, + "loss": 0.3090746641159058, + "step": 225580 + }, + { + "epoch": 0.9685050187613233, + "grad_norm": 0.9828161597251892, + "learning_rate": 3.1635090503005267e-06, + "loss": 0.2902642250061035, + "step": 225590 + }, + { + "epoch": 0.9685479508513434, + "grad_norm": 0.14380550384521484, + "learning_rate": 3.1591973301829032e-06, + "loss": 0.11983634233474731, + "step": 225600 + }, + { + "epoch": 0.9685908829413633, + "grad_norm": 1.3398834466934204, + "learning_rate": 3.1548856100652793e-06, + "loss": 0.23884208202362062, + "step": 225610 + }, + { + "epoch": 0.9686338150313833, + "grad_norm": 0.012832066975533962, + "learning_rate": 3.1505738899476554e-06, + "loss": 0.19006496667861938, + "step": 225620 + }, + { + "epoch": 0.9686767471214034, + "grad_norm": 0.0011093963403254747, + "learning_rate": 3.146262169830032e-06, + "loss": 0.20199911594390868, + "step": 225630 + }, + { + "epoch": 0.9687196792114233, + "grad_norm": 1.6886579990386963, + "learning_rate": 3.141950449712409e-06, + "loss": 0.19745049476623536, + "step": 225640 + }, + { + "epoch": 0.9687626113014434, + "grad_norm": 2.032439708709717, + "learning_rate": 3.137638729594785e-06, + "loss": 0.24782955646514893, + "step": 225650 + }, + { + "epoch": 0.9688055433914634, + "grad_norm": 0.0008506453596055508, + "learning_rate": 3.1333270094771614e-06, + "loss": 0.5238415718078613, + "step": 225660 + }, + { + "epoch": 0.9688484754814833, + "grad_norm": 0.043741144239902496, + "learning_rate": 3.1290152893595374e-06, + "loss": 0.3239001274108887, + "step": 225670 + }, + { + "epoch": 0.9688914075715034, + "grad_norm": 0.0012500104494392872, + "learning_rate": 3.1247035692419135e-06, + "loss": 0.12101260423660279, + "step": 225680 + }, + { + "epoch": 0.9689343396615234, + "grad_norm": 0.009424582123756409, + "learning_rate": 3.12039184912429e-06, + "loss": 0.17357531785964966, + "step": 225690 + }, + { + "epoch": 0.9689772717515434, + "grad_norm": 0.06987378001213074, + "learning_rate": 3.116080129006666e-06, + "loss": 0.5514712810516358, + "step": 225700 + }, + { + "epoch": 0.9690202038415634, + "grad_norm": 0.8753553032875061, + "learning_rate": 3.1117684088890426e-06, + "loss": 0.1353633165359497, + "step": 225710 + }, + { + "epoch": 0.9690631359315834, + "grad_norm": 3.089796304702759, + "learning_rate": 3.1074566887714186e-06, + "loss": 0.14911720752716065, + "step": 225720 + }, + { + "epoch": 0.9691060680216034, + "grad_norm": 2.0691325664520264, + "learning_rate": 3.1031449686537947e-06, + "loss": 0.17857154607772827, + "step": 225730 + }, + { + "epoch": 0.9691490001116234, + "grad_norm": 0.24299030005931854, + "learning_rate": 3.0988332485361712e-06, + "loss": 0.1547597885131836, + "step": 225740 + }, + { + "epoch": 0.9691919322016435, + "grad_norm": 0.003923803102225065, + "learning_rate": 3.0945215284185473e-06, + "loss": 0.07332241535186768, + "step": 225750 + }, + { + "epoch": 0.9692348642916635, + "grad_norm": 0.0067015704698860645, + "learning_rate": 3.0902098083009238e-06, + "loss": 0.17883477210998536, + "step": 225760 + }, + { + "epoch": 0.9692777963816834, + "grad_norm": 0.0028638611547648907, + "learning_rate": 3.0858980881833003e-06, + "loss": 0.003787020221352577, + "step": 225770 + }, + { + "epoch": 0.9693207284717035, + "grad_norm": 1.8772034645080566, + "learning_rate": 3.0815863680656763e-06, + "loss": 0.15486459732055663, + "step": 225780 + }, + { + "epoch": 0.9693636605617235, + "grad_norm": 0.00863329041749239, + "learning_rate": 3.077274647948053e-06, + "loss": 0.0408756822347641, + "step": 225790 + }, + { + "epoch": 0.9694065926517434, + "grad_norm": 3.4329917430877686, + "learning_rate": 3.072962927830429e-06, + "loss": 0.3295733451843262, + "step": 225800 + }, + { + "epoch": 0.9694495247417635, + "grad_norm": 2.6693780422210693, + "learning_rate": 3.068651207712805e-06, + "loss": 0.273480224609375, + "step": 225810 + }, + { + "epoch": 0.9694924568317835, + "grad_norm": 0.059428147971630096, + "learning_rate": 3.0643394875951815e-06, + "loss": 0.11988484859466553, + "step": 225820 + }, + { + "epoch": 0.9695353889218035, + "grad_norm": 0.3197462260723114, + "learning_rate": 3.0600277674775576e-06, + "loss": 0.008120565116405487, + "step": 225830 + }, + { + "epoch": 0.9695783210118235, + "grad_norm": 0.00911969318985939, + "learning_rate": 3.0557160473599336e-06, + "loss": 0.04398435056209564, + "step": 225840 + }, + { + "epoch": 0.9696212531018435, + "grad_norm": 0.040632810443639755, + "learning_rate": 3.05140432724231e-06, + "loss": 0.04892999529838562, + "step": 225850 + }, + { + "epoch": 0.9696641851918635, + "grad_norm": 1.2151085138320923, + "learning_rate": 3.0470926071246866e-06, + "loss": 0.2384195327758789, + "step": 225860 + }, + { + "epoch": 0.9697071172818835, + "grad_norm": 4.104531288146973, + "learning_rate": 3.0427808870070627e-06, + "loss": 0.11756811141967774, + "step": 225870 + }, + { + "epoch": 0.9697500493719036, + "grad_norm": 0.016218269243836403, + "learning_rate": 3.038469166889439e-06, + "loss": 0.1457047700881958, + "step": 225880 + }, + { + "epoch": 0.9697929814619235, + "grad_norm": 0.11366493999958038, + "learning_rate": 3.0341574467718153e-06, + "loss": 0.09916544556617737, + "step": 225890 + }, + { + "epoch": 0.9698359135519435, + "grad_norm": 0.08160996437072754, + "learning_rate": 3.0298457266541918e-06, + "loss": 0.15691580772399902, + "step": 225900 + }, + { + "epoch": 0.9698788456419636, + "grad_norm": 0.02445426769554615, + "learning_rate": 3.025534006536568e-06, + "loss": 0.19571220874786377, + "step": 225910 + }, + { + "epoch": 0.9699217777319835, + "grad_norm": 0.006836262531578541, + "learning_rate": 3.021222286418944e-06, + "loss": 0.17478055953979493, + "step": 225920 + }, + { + "epoch": 0.9699647098220036, + "grad_norm": 0.005737704690545797, + "learning_rate": 3.0169105663013204e-06, + "loss": 0.18237990140914917, + "step": 225930 + }, + { + "epoch": 0.9700076419120236, + "grad_norm": 0.008860369212925434, + "learning_rate": 3.0125988461836965e-06, + "loss": 0.2603444576263428, + "step": 225940 + }, + { + "epoch": 0.9700505740020435, + "grad_norm": 0.04425879567861557, + "learning_rate": 3.008287126066073e-06, + "loss": 0.20395264625549317, + "step": 225950 + }, + { + "epoch": 0.9700935060920636, + "grad_norm": 0.02337292581796646, + "learning_rate": 3.0039754059484495e-06, + "loss": 0.021476563811302186, + "step": 225960 + }, + { + "epoch": 0.9701364381820836, + "grad_norm": 0.08876360952854156, + "learning_rate": 2.9996636858308255e-06, + "loss": 0.2368551015853882, + "step": 225970 + }, + { + "epoch": 0.9701793702721035, + "grad_norm": 0.5836232304573059, + "learning_rate": 2.995351965713202e-06, + "loss": 0.10550501346588134, + "step": 225980 + }, + { + "epoch": 0.9702223023621236, + "grad_norm": 0.042574867606163025, + "learning_rate": 2.991040245595578e-06, + "loss": 0.07009645700454711, + "step": 225990 + }, + { + "epoch": 0.9702652344521436, + "grad_norm": 0.003975628409534693, + "learning_rate": 2.986728525477954e-06, + "loss": 0.13062582015991211, + "step": 226000 + }, + { + "epoch": 0.9702652344521436, + "eval_loss": 0.3667832612991333, + "eval_runtime": 27.4415, + "eval_samples_per_second": 3.644, + "eval_steps_per_second": 3.644, + "step": 226000 + }, + { + "epoch": 0.9703081665421636, + "grad_norm": 0.0030697945039719343, + "learning_rate": 2.9824168053603307e-06, + "loss": 0.3693444013595581, + "step": 226010 + }, + { + "epoch": 0.9703510986321836, + "grad_norm": 0.6716877818107605, + "learning_rate": 2.9781050852427067e-06, + "loss": 0.25938024520874026, + "step": 226020 + }, + { + "epoch": 0.9703940307222036, + "grad_norm": 1.5804414749145508, + "learning_rate": 2.973793365125083e-06, + "loss": 0.2481531620025635, + "step": 226030 + }, + { + "epoch": 0.9704369628122236, + "grad_norm": 1.3616093397140503, + "learning_rate": 2.9694816450074593e-06, + "loss": 0.18937134742736816, + "step": 226040 + }, + { + "epoch": 0.9704798949022436, + "grad_norm": 1.3570085763931274, + "learning_rate": 2.965169924889836e-06, + "loss": 0.12422878742218017, + "step": 226050 + }, + { + "epoch": 0.9705228269922637, + "grad_norm": 0.004477211739867926, + "learning_rate": 2.960858204772212e-06, + "loss": 0.014526186883449555, + "step": 226060 + }, + { + "epoch": 0.9705657590822836, + "grad_norm": 0.3260514438152313, + "learning_rate": 2.9565464846545884e-06, + "loss": 0.05073114037513733, + "step": 226070 + }, + { + "epoch": 0.9706086911723036, + "grad_norm": 0.0009509851224720478, + "learning_rate": 2.9522347645369644e-06, + "loss": 0.16182929277420044, + "step": 226080 + }, + { + "epoch": 0.9706516232623237, + "grad_norm": 2.467970371246338, + "learning_rate": 2.947923044419341e-06, + "loss": 0.3950214862823486, + "step": 226090 + }, + { + "epoch": 0.9706945553523436, + "grad_norm": 0.006497784983366728, + "learning_rate": 2.943611324301717e-06, + "loss": 0.15148746967315674, + "step": 226100 + }, + { + "epoch": 0.9707374874423637, + "grad_norm": 0.019508181139826775, + "learning_rate": 2.939299604184093e-06, + "loss": 0.02425907403230667, + "step": 226110 + }, + { + "epoch": 0.9707804195323837, + "grad_norm": 0.0019436703296378255, + "learning_rate": 2.9349878840664696e-06, + "loss": 0.20029797554016113, + "step": 226120 + }, + { + "epoch": 0.9708233516224036, + "grad_norm": 4.597479343414307, + "learning_rate": 2.9306761639488457e-06, + "loss": 0.3057328939437866, + "step": 226130 + }, + { + "epoch": 0.9708662837124237, + "grad_norm": 0.0020158360712230206, + "learning_rate": 2.926364443831222e-06, + "loss": 0.1163069725036621, + "step": 226140 + }, + { + "epoch": 0.9709092158024437, + "grad_norm": 0.00654413690790534, + "learning_rate": 2.9220527237135987e-06, + "loss": 0.2541393995285034, + "step": 226150 + }, + { + "epoch": 0.9709521478924636, + "grad_norm": 0.08558319509029388, + "learning_rate": 2.9177410035959747e-06, + "loss": 0.19713094234466552, + "step": 226160 + }, + { + "epoch": 0.9709950799824837, + "grad_norm": 0.7128730416297913, + "learning_rate": 2.9134292834783512e-06, + "loss": 0.4300590991973877, + "step": 226170 + }, + { + "epoch": 0.9710380120725037, + "grad_norm": 0.0020223883911967278, + "learning_rate": 2.9091175633607273e-06, + "loss": 0.01936231702566147, + "step": 226180 + }, + { + "epoch": 0.9710809441625238, + "grad_norm": 0.005232381168752909, + "learning_rate": 2.9048058432431034e-06, + "loss": 0.1344476342201233, + "step": 226190 + }, + { + "epoch": 0.9711238762525437, + "grad_norm": 0.4355989694595337, + "learning_rate": 2.90049412312548e-06, + "loss": 0.25427677631378176, + "step": 226200 + }, + { + "epoch": 0.9711668083425637, + "grad_norm": 0.0015556018333882093, + "learning_rate": 2.896182403007856e-06, + "loss": 0.26048619747161866, + "step": 226210 + }, + { + "epoch": 0.9712097404325838, + "grad_norm": 0.005317199043929577, + "learning_rate": 2.891870682890232e-06, + "loss": 0.30444252490997314, + "step": 226220 + }, + { + "epoch": 0.9712526725226037, + "grad_norm": 0.013098231516778469, + "learning_rate": 2.887558962772609e-06, + "loss": 0.1752004861831665, + "step": 226230 + }, + { + "epoch": 0.9712956046126238, + "grad_norm": 0.014951915480196476, + "learning_rate": 2.883247242654985e-06, + "loss": 0.18930820226669312, + "step": 226240 + }, + { + "epoch": 0.9713385367026438, + "grad_norm": 0.002639220794662833, + "learning_rate": 2.878935522537361e-06, + "loss": 0.17423256635665893, + "step": 226250 + }, + { + "epoch": 0.9713814687926637, + "grad_norm": 0.0012302837567403913, + "learning_rate": 2.8746238024197376e-06, + "loss": 0.23043808937072754, + "step": 226260 + }, + { + "epoch": 0.9714244008826838, + "grad_norm": 3.3449714183807373, + "learning_rate": 2.8703120823021136e-06, + "loss": 0.15526468753814698, + "step": 226270 + }, + { + "epoch": 0.9714673329727038, + "grad_norm": 1.695763349533081, + "learning_rate": 2.86600036218449e-06, + "loss": 0.29962406158447263, + "step": 226280 + }, + { + "epoch": 0.9715102650627238, + "grad_norm": 0.01537360530346632, + "learning_rate": 2.861688642066866e-06, + "loss": 0.03783855736255646, + "step": 226290 + }, + { + "epoch": 0.9715531971527438, + "grad_norm": 0.000410413253121078, + "learning_rate": 2.8573769219492423e-06, + "loss": 0.3130494117736816, + "step": 226300 + }, + { + "epoch": 0.9715961292427638, + "grad_norm": 2.2895946502685547, + "learning_rate": 2.8530652018316188e-06, + "loss": 0.3380448579788208, + "step": 226310 + }, + { + "epoch": 0.9716390613327838, + "grad_norm": 2.4340476989746094, + "learning_rate": 2.8487534817139953e-06, + "loss": 0.35978193283081056, + "step": 226320 + }, + { + "epoch": 0.9716819934228038, + "grad_norm": 0.40139251947402954, + "learning_rate": 2.8444417615963713e-06, + "loss": 0.18451598882675171, + "step": 226330 + }, + { + "epoch": 0.9717249255128239, + "grad_norm": 3.936011552810669, + "learning_rate": 2.840130041478748e-06, + "loss": 0.40130367279052737, + "step": 226340 + }, + { + "epoch": 0.9717678576028438, + "grad_norm": 0.03930728882551193, + "learning_rate": 2.835818321361124e-06, + "loss": 0.14985283613204955, + "step": 226350 + }, + { + "epoch": 0.9718107896928638, + "grad_norm": 0.5176249146461487, + "learning_rate": 2.8315066012435004e-06, + "loss": 0.19097598791122436, + "step": 226360 + }, + { + "epoch": 0.9718537217828839, + "grad_norm": 0.0014861667295917869, + "learning_rate": 2.8271948811258765e-06, + "loss": 0.24988765716552735, + "step": 226370 + }, + { + "epoch": 0.9718966538729038, + "grad_norm": 1.4453141689300537, + "learning_rate": 2.8228831610082526e-06, + "loss": 0.28491060733795165, + "step": 226380 + }, + { + "epoch": 0.9719395859629238, + "grad_norm": 0.6912074685096741, + "learning_rate": 2.818571440890629e-06, + "loss": 0.022906261682510375, + "step": 226390 + }, + { + "epoch": 0.9719825180529439, + "grad_norm": 2.4349849224090576, + "learning_rate": 2.814259720773005e-06, + "loss": 0.24235982894897462, + "step": 226400 + }, + { + "epoch": 0.9720254501429638, + "grad_norm": 4.375247001647949, + "learning_rate": 2.8099480006553816e-06, + "loss": 0.43722686767578123, + "step": 226410 + }, + { + "epoch": 0.9720683822329839, + "grad_norm": 0.750277042388916, + "learning_rate": 2.805636280537758e-06, + "loss": 0.11602920293807983, + "step": 226420 + }, + { + "epoch": 0.9721113143230039, + "grad_norm": 0.5917320251464844, + "learning_rate": 2.801324560420134e-06, + "loss": 0.2671214580535889, + "step": 226430 + }, + { + "epoch": 0.9721542464130238, + "grad_norm": 1.1064698696136475, + "learning_rate": 2.7970128403025103e-06, + "loss": 0.16829732656478882, + "step": 226440 + }, + { + "epoch": 0.9721971785030439, + "grad_norm": 3.341284990310669, + "learning_rate": 2.7927011201848868e-06, + "loss": 0.13691198825836182, + "step": 226450 + }, + { + "epoch": 0.9722401105930639, + "grad_norm": 1.4227256774902344, + "learning_rate": 2.788389400067263e-06, + "loss": 0.20174951553344728, + "step": 226460 + }, + { + "epoch": 0.9722830426830839, + "grad_norm": 0.05399510636925697, + "learning_rate": 2.7840776799496393e-06, + "loss": 0.25884032249450684, + "step": 226470 + }, + { + "epoch": 0.9723259747731039, + "grad_norm": 1.700989842414856, + "learning_rate": 2.7797659598320154e-06, + "loss": 0.15701980590820314, + "step": 226480 + }, + { + "epoch": 0.9723689068631239, + "grad_norm": 0.06168566271662712, + "learning_rate": 2.7754542397143915e-06, + "loss": 0.3472371816635132, + "step": 226490 + }, + { + "epoch": 0.9724118389531439, + "grad_norm": 0.029171306639909744, + "learning_rate": 2.7711425195967684e-06, + "loss": 0.2525274991989136, + "step": 226500 + }, + { + "epoch": 0.9724547710431639, + "grad_norm": 0.006166656501591206, + "learning_rate": 2.7668307994791445e-06, + "loss": 0.41352314949035646, + "step": 226510 + }, + { + "epoch": 0.972497703133184, + "grad_norm": 0.18927429616451263, + "learning_rate": 2.7625190793615205e-06, + "loss": 0.17671269178390503, + "step": 226520 + }, + { + "epoch": 0.9725406352232039, + "grad_norm": 1.8159795999526978, + "learning_rate": 2.758207359243897e-06, + "loss": 0.15529967546463014, + "step": 226530 + }, + { + "epoch": 0.9725835673132239, + "grad_norm": 0.02586592175066471, + "learning_rate": 2.753895639126273e-06, + "loss": 0.15995392799377442, + "step": 226540 + }, + { + "epoch": 0.972626499403244, + "grad_norm": 0.007591897621750832, + "learning_rate": 2.7495839190086496e-06, + "loss": 0.04184904992580414, + "step": 226550 + }, + { + "epoch": 0.9726694314932639, + "grad_norm": 0.012123005464673042, + "learning_rate": 2.7452721988910257e-06, + "loss": 0.21659538745880128, + "step": 226560 + }, + { + "epoch": 0.972712363583284, + "grad_norm": 1.6355416774749756, + "learning_rate": 2.7409604787734017e-06, + "loss": 0.35333163738250734, + "step": 226570 + }, + { + "epoch": 0.972755295673304, + "grad_norm": 0.010789229534566402, + "learning_rate": 2.7366487586557782e-06, + "loss": 0.12032248973846435, + "step": 226580 + }, + { + "epoch": 0.9727982277633239, + "grad_norm": 1.9976438283920288, + "learning_rate": 2.7323370385381547e-06, + "loss": 0.05439336895942688, + "step": 226590 + }, + { + "epoch": 0.972841159853344, + "grad_norm": 0.013741575181484222, + "learning_rate": 2.728025318420531e-06, + "loss": 0.15803490877151488, + "step": 226600 + }, + { + "epoch": 0.972884091943364, + "grad_norm": 0.2421882599592209, + "learning_rate": 2.7237135983029073e-06, + "loss": 0.2856144905090332, + "step": 226610 + }, + { + "epoch": 0.972927024033384, + "grad_norm": 1.9955962896347046, + "learning_rate": 2.7194018781852834e-06, + "loss": 0.16014361381530762, + "step": 226620 + }, + { + "epoch": 0.972969956123404, + "grad_norm": 0.28946375846862793, + "learning_rate": 2.7150901580676594e-06, + "loss": 0.2100440740585327, + "step": 226630 + }, + { + "epoch": 0.973012888213424, + "grad_norm": 1.82895827293396, + "learning_rate": 2.710778437950036e-06, + "loss": 0.2709784746170044, + "step": 226640 + }, + { + "epoch": 0.9730558203034441, + "grad_norm": 0.0039507439360022545, + "learning_rate": 2.706466717832412e-06, + "loss": 0.15208420753479004, + "step": 226650 + }, + { + "epoch": 0.973098752393464, + "grad_norm": 8.616528511047363, + "learning_rate": 2.7021549977147885e-06, + "loss": 0.31345291137695314, + "step": 226660 + }, + { + "epoch": 0.973141684483484, + "grad_norm": 0.6658341884613037, + "learning_rate": 2.6978432775971646e-06, + "loss": 0.21907491683959962, + "step": 226670 + }, + { + "epoch": 0.9731846165735041, + "grad_norm": 0.02232818678021431, + "learning_rate": 2.693531557479541e-06, + "loss": 0.201385498046875, + "step": 226680 + }, + { + "epoch": 0.973227548663524, + "grad_norm": 1.584527850151062, + "learning_rate": 2.6892198373619176e-06, + "loss": 0.4038440704345703, + "step": 226690 + }, + { + "epoch": 0.973270480753544, + "grad_norm": 0.014107631519436836, + "learning_rate": 2.6849081172442936e-06, + "loss": 0.2761273145675659, + "step": 226700 + }, + { + "epoch": 0.9733134128435641, + "grad_norm": 0.002957344288006425, + "learning_rate": 2.6805963971266697e-06, + "loss": 0.12850207090377808, + "step": 226710 + }, + { + "epoch": 0.973356344933584, + "grad_norm": 0.0005787658737972379, + "learning_rate": 2.676284677009046e-06, + "loss": 0.14453613758087158, + "step": 226720 + }, + { + "epoch": 0.9733992770236041, + "grad_norm": 4.067680835723877, + "learning_rate": 2.6719729568914223e-06, + "loss": 0.20334532260894775, + "step": 226730 + }, + { + "epoch": 0.9734422091136241, + "grad_norm": 7.407779216766357, + "learning_rate": 2.6676612367737988e-06, + "loss": 0.35728869438171384, + "step": 226740 + }, + { + "epoch": 0.973485141203644, + "grad_norm": 3.8809151649475098, + "learning_rate": 2.663349516656175e-06, + "loss": 0.23047690391540526, + "step": 226750 + }, + { + "epoch": 0.9735280732936641, + "grad_norm": 0.03422745689749718, + "learning_rate": 2.659037796538551e-06, + "loss": 0.2741354703903198, + "step": 226760 + }, + { + "epoch": 0.9735710053836841, + "grad_norm": 4.280041217803955, + "learning_rate": 2.654726076420928e-06, + "loss": 0.04992449879646301, + "step": 226770 + }, + { + "epoch": 0.9736139374737041, + "grad_norm": 0.03290945664048195, + "learning_rate": 2.650414356303304e-06, + "loss": 0.07728307247161866, + "step": 226780 + }, + { + "epoch": 0.9736568695637241, + "grad_norm": 0.003999393433332443, + "learning_rate": 2.64610263618568e-06, + "loss": 0.1892857313156128, + "step": 226790 + }, + { + "epoch": 0.9736998016537441, + "grad_norm": 1.8522354364395142, + "learning_rate": 2.6417909160680565e-06, + "loss": 0.37737085819244387, + "step": 226800 + }, + { + "epoch": 0.9737427337437641, + "grad_norm": 0.1001518964767456, + "learning_rate": 2.6374791959504326e-06, + "loss": 0.12852399349212645, + "step": 226810 + }, + { + "epoch": 0.9737856658337841, + "grad_norm": 0.04786868020892143, + "learning_rate": 2.633167475832809e-06, + "loss": 0.09738991856575012, + "step": 226820 + }, + { + "epoch": 0.9738285979238042, + "grad_norm": 0.3759821951389313, + "learning_rate": 2.628855755715185e-06, + "loss": 0.18861787319183348, + "step": 226830 + }, + { + "epoch": 0.9738715300138241, + "grad_norm": 0.0015561191830784082, + "learning_rate": 2.624544035597561e-06, + "loss": 0.1046902894973755, + "step": 226840 + }, + { + "epoch": 0.9739144621038441, + "grad_norm": 0.010539585724473, + "learning_rate": 2.6202323154799377e-06, + "loss": 0.14529746770858765, + "step": 226850 + }, + { + "epoch": 0.9739573941938642, + "grad_norm": 0.002079649828374386, + "learning_rate": 2.615920595362314e-06, + "loss": 0.1505010724067688, + "step": 226860 + }, + { + "epoch": 0.9740003262838841, + "grad_norm": 0.0004063397354912013, + "learning_rate": 2.6116088752446903e-06, + "loss": 0.07104050517082214, + "step": 226870 + }, + { + "epoch": 0.9740432583739042, + "grad_norm": 0.649005651473999, + "learning_rate": 2.6072971551270668e-06, + "loss": 0.12537972927093505, + "step": 226880 + }, + { + "epoch": 0.9740861904639242, + "grad_norm": 2.9869272708892822, + "learning_rate": 2.602985435009443e-06, + "loss": 0.4363424301147461, + "step": 226890 + }, + { + "epoch": 0.9741291225539441, + "grad_norm": 0.004428323358297348, + "learning_rate": 2.598673714891819e-06, + "loss": 0.268056321144104, + "step": 226900 + }, + { + "epoch": 0.9741720546439642, + "grad_norm": 0.025846082717180252, + "learning_rate": 2.5943619947741954e-06, + "loss": 0.17284902334213256, + "step": 226910 + }, + { + "epoch": 0.9742149867339842, + "grad_norm": 0.07464879006147385, + "learning_rate": 2.5900502746565715e-06, + "loss": 0.35149922370910647, + "step": 226920 + }, + { + "epoch": 0.9742579188240041, + "grad_norm": 0.003758594859391451, + "learning_rate": 2.585738554538948e-06, + "loss": 0.08602445125579834, + "step": 226930 + }, + { + "epoch": 0.9743008509140242, + "grad_norm": 0.0022107944823801517, + "learning_rate": 2.581426834421324e-06, + "loss": 0.3697013854980469, + "step": 226940 + }, + { + "epoch": 0.9743437830040442, + "grad_norm": 1.706981897354126, + "learning_rate": 2.5771151143037e-06, + "loss": 0.309721565246582, + "step": 226950 + }, + { + "epoch": 0.9743867150940642, + "grad_norm": 1.509405255317688, + "learning_rate": 2.572803394186077e-06, + "loss": 0.12059307098388672, + "step": 226960 + }, + { + "epoch": 0.9744296471840842, + "grad_norm": 0.014540722593665123, + "learning_rate": 2.568491674068453e-06, + "loss": 0.13089817762374878, + "step": 226970 + }, + { + "epoch": 0.9744725792741042, + "grad_norm": 5.593906402587891, + "learning_rate": 2.564179953950829e-06, + "loss": 0.12888550758361816, + "step": 226980 + }, + { + "epoch": 0.9745155113641242, + "grad_norm": 0.0023993600625544786, + "learning_rate": 2.5598682338332057e-06, + "loss": 0.14289275407791138, + "step": 226990 + }, + { + "epoch": 0.9745584434541442, + "grad_norm": 0.0118110878393054, + "learning_rate": 2.5555565137155817e-06, + "loss": 0.43169412612915037, + "step": 227000 + }, + { + "epoch": 0.9745584434541442, + "eval_loss": 0.36603018641471863, + "eval_runtime": 27.3928, + "eval_samples_per_second": 3.651, + "eval_steps_per_second": 3.651, + "step": 227000 + }, + { + "epoch": 0.9746013755441643, + "grad_norm": 0.03272734209895134, + "learning_rate": 2.5512447935979582e-06, + "loss": 0.18798859119415284, + "step": 227010 + }, + { + "epoch": 0.9746443076341842, + "grad_norm": 0.07965968549251556, + "learning_rate": 2.5469330734803343e-06, + "loss": 0.1925390124320984, + "step": 227020 + }, + { + "epoch": 0.9746872397242042, + "grad_norm": 6.077420234680176, + "learning_rate": 2.5426213533627104e-06, + "loss": 0.19045372009277345, + "step": 227030 + }, + { + "epoch": 0.9747301718142243, + "grad_norm": 0.2418179214000702, + "learning_rate": 2.538309633245087e-06, + "loss": 0.1367825984954834, + "step": 227040 + }, + { + "epoch": 0.9747731039042443, + "grad_norm": 0.0008199986768886447, + "learning_rate": 2.5339979131274634e-06, + "loss": 0.15820411443710328, + "step": 227050 + }, + { + "epoch": 0.9748160359942643, + "grad_norm": 0.24902009963989258, + "learning_rate": 2.5296861930098394e-06, + "loss": 0.08510852456092835, + "step": 227060 + }, + { + "epoch": 0.9748589680842843, + "grad_norm": 0.8076011538505554, + "learning_rate": 2.525374472892216e-06, + "loss": 0.16344135999679565, + "step": 227070 + }, + { + "epoch": 0.9749019001743043, + "grad_norm": 0.015640171244740486, + "learning_rate": 2.521062752774592e-06, + "loss": 0.07457131147384644, + "step": 227080 + }, + { + "epoch": 0.9749448322643243, + "grad_norm": 0.001444989233277738, + "learning_rate": 2.516751032656968e-06, + "loss": 0.1386529803276062, + "step": 227090 + }, + { + "epoch": 0.9749877643543443, + "grad_norm": 0.039012517780065536, + "learning_rate": 2.5124393125393446e-06, + "loss": 0.002665388956665993, + "step": 227100 + }, + { + "epoch": 0.9750306964443644, + "grad_norm": 0.15500108897686005, + "learning_rate": 2.5081275924217207e-06, + "loss": 0.33515994548797606, + "step": 227110 + }, + { + "epoch": 0.9750736285343843, + "grad_norm": 0.0024809353053569794, + "learning_rate": 2.503815872304097e-06, + "loss": 0.06954295039176941, + "step": 227120 + }, + { + "epoch": 0.9751165606244043, + "grad_norm": 0.0008949413313530385, + "learning_rate": 2.4995041521864732e-06, + "loss": 0.20227794647216796, + "step": 227130 + }, + { + "epoch": 0.9751594927144244, + "grad_norm": 0.0061431932263076305, + "learning_rate": 2.4951924320688497e-06, + "loss": 0.07348456382751464, + "step": 227140 + }, + { + "epoch": 0.9752024248044443, + "grad_norm": 0.039200812578201294, + "learning_rate": 2.4908807119512262e-06, + "loss": 0.1787712812423706, + "step": 227150 + }, + { + "epoch": 0.9752453568944643, + "grad_norm": 0.2038622498512268, + "learning_rate": 2.4865689918336023e-06, + "loss": 0.3955040454864502, + "step": 227160 + }, + { + "epoch": 0.9752882889844844, + "grad_norm": 0.0030347283463925123, + "learning_rate": 2.4822572717159784e-06, + "loss": 0.3216248989105225, + "step": 227170 + }, + { + "epoch": 0.9753312210745043, + "grad_norm": 0.23773819208145142, + "learning_rate": 2.477945551598355e-06, + "loss": 0.0572589635848999, + "step": 227180 + }, + { + "epoch": 0.9753741531645244, + "grad_norm": 4.1003546714782715, + "learning_rate": 2.473633831480731e-06, + "loss": 0.3647440433502197, + "step": 227190 + }, + { + "epoch": 0.9754170852545444, + "grad_norm": 0.010763336904346943, + "learning_rate": 2.4693221113631074e-06, + "loss": 0.07364763617515564, + "step": 227200 + }, + { + "epoch": 0.9754600173445643, + "grad_norm": 0.05231078341603279, + "learning_rate": 2.4650103912454835e-06, + "loss": 0.21562786102294923, + "step": 227210 + }, + { + "epoch": 0.9755029494345844, + "grad_norm": 0.0015636775642633438, + "learning_rate": 2.4606986711278596e-06, + "loss": 0.1701973557472229, + "step": 227220 + }, + { + "epoch": 0.9755458815246044, + "grad_norm": 0.0006244481191970408, + "learning_rate": 2.4563869510102365e-06, + "loss": 0.22180135250091554, + "step": 227230 + }, + { + "epoch": 0.9755888136146244, + "grad_norm": 1.9431523084640503, + "learning_rate": 2.4520752308926126e-06, + "loss": 0.3102797269821167, + "step": 227240 + }, + { + "epoch": 0.9756317457046444, + "grad_norm": 4.471474647521973, + "learning_rate": 2.4477635107749886e-06, + "loss": 0.47100114822387695, + "step": 227250 + }, + { + "epoch": 0.9756746777946644, + "grad_norm": 0.007208989933133125, + "learning_rate": 2.443451790657365e-06, + "loss": 0.06431352496147155, + "step": 227260 + }, + { + "epoch": 0.9757176098846844, + "grad_norm": 2.231614589691162, + "learning_rate": 2.439140070539741e-06, + "loss": 0.3845525741577148, + "step": 227270 + }, + { + "epoch": 0.9757605419747044, + "grad_norm": 0.024662956595420837, + "learning_rate": 2.4348283504221173e-06, + "loss": 0.1768028736114502, + "step": 227280 + }, + { + "epoch": 0.9758034740647245, + "grad_norm": 0.19310876727104187, + "learning_rate": 2.4305166303044938e-06, + "loss": 0.12158677577972413, + "step": 227290 + }, + { + "epoch": 0.9758464061547444, + "grad_norm": 1.1528043746948242, + "learning_rate": 2.42620491018687e-06, + "loss": 0.1612181305885315, + "step": 227300 + }, + { + "epoch": 0.9758893382447644, + "grad_norm": 4.803441047668457, + "learning_rate": 2.4218931900692463e-06, + "loss": 0.2455395221710205, + "step": 227310 + }, + { + "epoch": 0.9759322703347845, + "grad_norm": 0.8248661160469055, + "learning_rate": 2.417581469951623e-06, + "loss": 0.1723708391189575, + "step": 227320 + }, + { + "epoch": 0.9759752024248044, + "grad_norm": 1.202093243598938, + "learning_rate": 2.413269749833999e-06, + "loss": 0.15180646181106566, + "step": 227330 + }, + { + "epoch": 0.9760181345148244, + "grad_norm": 3.625997304916382, + "learning_rate": 2.4089580297163754e-06, + "loss": 0.24789741039276122, + "step": 227340 + }, + { + "epoch": 0.9760610666048445, + "grad_norm": 0.038008879870176315, + "learning_rate": 2.4046463095987515e-06, + "loss": 0.01338741034269333, + "step": 227350 + }, + { + "epoch": 0.9761039986948644, + "grad_norm": 0.2671995759010315, + "learning_rate": 2.4003345894811275e-06, + "loss": 0.18389822244644166, + "step": 227360 + }, + { + "epoch": 0.9761469307848845, + "grad_norm": 0.003803535597398877, + "learning_rate": 2.396022869363504e-06, + "loss": 0.0031682711094617845, + "step": 227370 + }, + { + "epoch": 0.9761898628749045, + "grad_norm": 0.008440026082098484, + "learning_rate": 2.39171114924588e-06, + "loss": 0.207161021232605, + "step": 227380 + }, + { + "epoch": 0.9762327949649244, + "grad_norm": 112.1073989868164, + "learning_rate": 2.3873994291282566e-06, + "loss": 0.3629873275756836, + "step": 227390 + }, + { + "epoch": 0.9762757270549445, + "grad_norm": 0.013895579613745213, + "learning_rate": 2.3830877090106327e-06, + "loss": 0.09868028163909912, + "step": 227400 + }, + { + "epoch": 0.9763186591449645, + "grad_norm": 0.8464810848236084, + "learning_rate": 2.378775988893009e-06, + "loss": 0.2205869197845459, + "step": 227410 + }, + { + "epoch": 0.9763615912349844, + "grad_norm": 0.12771189212799072, + "learning_rate": 2.3744642687753857e-06, + "loss": 0.07797903418540955, + "step": 227420 + }, + { + "epoch": 0.9764045233250045, + "grad_norm": 0.3783540427684784, + "learning_rate": 2.3701525486577618e-06, + "loss": 0.1723588228225708, + "step": 227430 + }, + { + "epoch": 0.9764474554150245, + "grad_norm": 0.02006363309919834, + "learning_rate": 2.365840828540138e-06, + "loss": 0.2192901849746704, + "step": 227440 + }, + { + "epoch": 0.9764903875050445, + "grad_norm": 0.024119436740875244, + "learning_rate": 2.3615291084225143e-06, + "loss": 0.14305353164672852, + "step": 227450 + }, + { + "epoch": 0.9765333195950645, + "grad_norm": 7.444068908691406, + "learning_rate": 2.3572173883048904e-06, + "loss": 0.31871623992919923, + "step": 227460 + }, + { + "epoch": 0.9765762516850846, + "grad_norm": 0.6991757154464722, + "learning_rate": 2.3529056681872665e-06, + "loss": 0.10647947788238525, + "step": 227470 + }, + { + "epoch": 0.9766191837751046, + "grad_norm": 1.1979470252990723, + "learning_rate": 2.348593948069643e-06, + "loss": 0.17139954566955568, + "step": 227480 + }, + { + "epoch": 0.9766621158651245, + "grad_norm": 0.047781504690647125, + "learning_rate": 2.344282227952019e-06, + "loss": 0.24302124977111816, + "step": 227490 + }, + { + "epoch": 0.9767050479551446, + "grad_norm": 43.781436920166016, + "learning_rate": 2.3399705078343955e-06, + "loss": 0.09933934807777405, + "step": 227500 + }, + { + "epoch": 0.9767479800451646, + "grad_norm": 0.06575685739517212, + "learning_rate": 2.335658787716772e-06, + "loss": 0.3872169017791748, + "step": 227510 + }, + { + "epoch": 0.9767909121351845, + "grad_norm": 1.0957386493682861, + "learning_rate": 2.331347067599148e-06, + "loss": 0.17452040910720826, + "step": 227520 + }, + { + "epoch": 0.9768338442252046, + "grad_norm": 1.1414989233016968, + "learning_rate": 2.3270353474815246e-06, + "loss": 0.32037715911865233, + "step": 227530 + }, + { + "epoch": 0.9768767763152246, + "grad_norm": 0.021362558007240295, + "learning_rate": 2.3227236273639007e-06, + "loss": 0.15376040935516358, + "step": 227540 + }, + { + "epoch": 0.9769197084052446, + "grad_norm": 3.3690531253814697, + "learning_rate": 2.3184119072462767e-06, + "loss": 0.35323572158813477, + "step": 227550 + }, + { + "epoch": 0.9769626404952646, + "grad_norm": 0.03152088448405266, + "learning_rate": 2.3141001871286532e-06, + "loss": 0.15891146659851074, + "step": 227560 + }, + { + "epoch": 0.9770055725852846, + "grad_norm": 0.018579134717583656, + "learning_rate": 2.3097884670110293e-06, + "loss": 0.06911247372627258, + "step": 227570 + }, + { + "epoch": 0.9770485046753046, + "grad_norm": 0.0024237283505499363, + "learning_rate": 2.305476746893406e-06, + "loss": 0.02136930972337723, + "step": 227580 + }, + { + "epoch": 0.9770914367653246, + "grad_norm": 4.892970085144043, + "learning_rate": 2.3011650267757823e-06, + "loss": 0.24032959938049317, + "step": 227590 + }, + { + "epoch": 0.9771343688553447, + "grad_norm": 0.0077386279590427876, + "learning_rate": 2.2968533066581584e-06, + "loss": 0.12323408126831055, + "step": 227600 + }, + { + "epoch": 0.9771773009453646, + "grad_norm": 4.715496063232422, + "learning_rate": 2.292541586540535e-06, + "loss": 0.2505172252655029, + "step": 227610 + }, + { + "epoch": 0.9772202330353846, + "grad_norm": 0.002098772209137678, + "learning_rate": 2.288229866422911e-06, + "loss": 0.08634217381477356, + "step": 227620 + }, + { + "epoch": 0.9772631651254047, + "grad_norm": 0.9451601505279541, + "learning_rate": 2.283918146305287e-06, + "loss": 0.23014225959777831, + "step": 227630 + }, + { + "epoch": 0.9773060972154246, + "grad_norm": 0.08155321329832077, + "learning_rate": 2.2796064261876635e-06, + "loss": 0.15814539194107055, + "step": 227640 + }, + { + "epoch": 0.9773490293054446, + "grad_norm": 1.6224653720855713, + "learning_rate": 2.2752947060700396e-06, + "loss": 0.2659270763397217, + "step": 227650 + }, + { + "epoch": 0.9773919613954647, + "grad_norm": 0.10807585716247559, + "learning_rate": 2.2709829859524157e-06, + "loss": 0.17804590463638306, + "step": 227660 + }, + { + "epoch": 0.9774348934854846, + "grad_norm": 0.5821875929832458, + "learning_rate": 2.266671265834792e-06, + "loss": 0.368826699256897, + "step": 227670 + }, + { + "epoch": 0.9774778255755047, + "grad_norm": 0.0009552693809382617, + "learning_rate": 2.2623595457171686e-06, + "loss": 0.22152063846588135, + "step": 227680 + }, + { + "epoch": 0.9775207576655247, + "grad_norm": 0.00434734346345067, + "learning_rate": 2.2580478255995447e-06, + "loss": 0.22306718826293945, + "step": 227690 + }, + { + "epoch": 0.9775636897555446, + "grad_norm": 6.497489929199219, + "learning_rate": 2.253736105481921e-06, + "loss": 0.2369317054748535, + "step": 227700 + }, + { + "epoch": 0.9776066218455647, + "grad_norm": 0.027641694992780685, + "learning_rate": 2.2494243853642973e-06, + "loss": 0.22835359573364258, + "step": 227710 + }, + { + "epoch": 0.9776495539355847, + "grad_norm": 0.12441679835319519, + "learning_rate": 2.2451126652466738e-06, + "loss": 0.06525906324386596, + "step": 227720 + }, + { + "epoch": 0.9776924860256047, + "grad_norm": 1.1552482843399048, + "learning_rate": 2.24080094512905e-06, + "loss": 0.24290194511413574, + "step": 227730 + }, + { + "epoch": 0.9777354181156247, + "grad_norm": 0.45403793454170227, + "learning_rate": 2.236489225011426e-06, + "loss": 0.17993674278259278, + "step": 227740 + }, + { + "epoch": 0.9777783502056447, + "grad_norm": 2.896038293838501, + "learning_rate": 2.2321775048938024e-06, + "loss": 0.2308863639831543, + "step": 227750 + }, + { + "epoch": 0.9778212822956647, + "grad_norm": 0.5456775426864624, + "learning_rate": 2.2278657847761785e-06, + "loss": 0.20217106342315674, + "step": 227760 + }, + { + "epoch": 0.9778642143856847, + "grad_norm": 3.1207029819488525, + "learning_rate": 2.223554064658555e-06, + "loss": 0.33245410919189455, + "step": 227770 + }, + { + "epoch": 0.9779071464757048, + "grad_norm": 2.0536296367645264, + "learning_rate": 2.2192423445409315e-06, + "loss": 0.14470397233963012, + "step": 227780 + }, + { + "epoch": 0.9779500785657247, + "grad_norm": 0.0007920139469206333, + "learning_rate": 2.2149306244233076e-06, + "loss": 0.20721204280853273, + "step": 227790 + }, + { + "epoch": 0.9779930106557447, + "grad_norm": 0.09720521420240402, + "learning_rate": 2.210618904305684e-06, + "loss": 0.14968944787979127, + "step": 227800 + }, + { + "epoch": 0.9780359427457648, + "grad_norm": 0.5558373332023621, + "learning_rate": 2.20630718418806e-06, + "loss": 0.08571412563323974, + "step": 227810 + }, + { + "epoch": 0.9780788748357847, + "grad_norm": 0.005012707784771919, + "learning_rate": 2.201995464070436e-06, + "loss": 0.23334116935729982, + "step": 227820 + }, + { + "epoch": 0.9781218069258047, + "grad_norm": 0.002742292359471321, + "learning_rate": 2.1976837439528127e-06, + "loss": 0.3009469032287598, + "step": 227830 + }, + { + "epoch": 0.9781647390158248, + "grad_norm": 0.18742695450782776, + "learning_rate": 2.1933720238351888e-06, + "loss": 0.1338501214981079, + "step": 227840 + }, + { + "epoch": 0.9782076711058447, + "grad_norm": 0.001777024706825614, + "learning_rate": 2.1890603037175653e-06, + "loss": 0.2050083875656128, + "step": 227850 + }, + { + "epoch": 0.9782506031958648, + "grad_norm": 1.8684906959533691, + "learning_rate": 2.1847485835999413e-06, + "loss": 0.44011287689208983, + "step": 227860 + }, + { + "epoch": 0.9782935352858848, + "grad_norm": 0.13816608488559723, + "learning_rate": 2.180436863482318e-06, + "loss": 0.023053205013275145, + "step": 227870 + }, + { + "epoch": 0.9783364673759047, + "grad_norm": 0.063438281416893, + "learning_rate": 2.176125143364694e-06, + "loss": 0.32267377376556394, + "step": 227880 + }, + { + "epoch": 0.9783793994659248, + "grad_norm": 0.0014309908729046583, + "learning_rate": 2.1718134232470704e-06, + "loss": 0.10749776363372802, + "step": 227890 + }, + { + "epoch": 0.9784223315559448, + "grad_norm": 0.02877630479633808, + "learning_rate": 2.1675017031294465e-06, + "loss": 0.2096189498901367, + "step": 227900 + }, + { + "epoch": 0.9784652636459649, + "grad_norm": 0.0027715996839106083, + "learning_rate": 2.163189983011823e-06, + "loss": 0.17635757923126222, + "step": 227910 + }, + { + "epoch": 0.9785081957359848, + "grad_norm": 0.06941622495651245, + "learning_rate": 2.158878262894199e-06, + "loss": 0.006739767640829087, + "step": 227920 + }, + { + "epoch": 0.9785511278260048, + "grad_norm": 0.0025414933916181326, + "learning_rate": 2.154566542776575e-06, + "loss": 0.002655784413218498, + "step": 227930 + }, + { + "epoch": 0.9785940599160249, + "grad_norm": 3.310178518295288, + "learning_rate": 2.1502548226589516e-06, + "loss": 0.30617356300354004, + "step": 227940 + }, + { + "epoch": 0.9786369920060448, + "grad_norm": 1.4081175327301025, + "learning_rate": 2.1459431025413277e-06, + "loss": 0.2300107955932617, + "step": 227950 + }, + { + "epoch": 0.9786799240960649, + "grad_norm": 0.009740284644067287, + "learning_rate": 2.141631382423704e-06, + "loss": 0.07422532439231873, + "step": 227960 + }, + { + "epoch": 0.9787228561860849, + "grad_norm": 1.0734556913375854, + "learning_rate": 2.1373196623060807e-06, + "loss": 0.21981630325317383, + "step": 227970 + }, + { + "epoch": 0.9787657882761048, + "grad_norm": 0.22914394736289978, + "learning_rate": 2.1330079421884567e-06, + "loss": 0.11649729013442993, + "step": 227980 + }, + { + "epoch": 0.9788087203661249, + "grad_norm": 0.034725673496723175, + "learning_rate": 2.1286962220708332e-06, + "loss": 0.2758263349533081, + "step": 227990 + }, + { + "epoch": 0.9788516524561449, + "grad_norm": 0.03354437276721001, + "learning_rate": 2.1243845019532093e-06, + "loss": 0.18928390741348267, + "step": 228000 + }, + { + "epoch": 0.9788516524561449, + "eval_loss": 0.3664787709712982, + "eval_runtime": 27.507, + "eval_samples_per_second": 3.635, + "eval_steps_per_second": 3.635, + "step": 228000 + }, + { + "epoch": 0.9788945845461648, + "grad_norm": 0.3679724335670471, + "learning_rate": 2.1200727818355854e-06, + "loss": 0.09740736484527587, + "step": 228010 + }, + { + "epoch": 0.9789375166361849, + "grad_norm": 0.17959082126617432, + "learning_rate": 2.115761061717962e-06, + "loss": 0.11371631622314453, + "step": 228020 + }, + { + "epoch": 0.9789804487262049, + "grad_norm": 0.14090149104595184, + "learning_rate": 2.111449341600338e-06, + "loss": 0.1154598593711853, + "step": 228030 + }, + { + "epoch": 0.9790233808162249, + "grad_norm": 0.14479191601276398, + "learning_rate": 2.1071376214827144e-06, + "loss": 0.2554008960723877, + "step": 228040 + }, + { + "epoch": 0.9790663129062449, + "grad_norm": 0.08767344802618027, + "learning_rate": 2.102825901365091e-06, + "loss": 0.17707765102386475, + "step": 228050 + }, + { + "epoch": 0.979109244996265, + "grad_norm": 0.00018606445519253612, + "learning_rate": 2.098514181247467e-06, + "loss": 0.0368975430727005, + "step": 228060 + }, + { + "epoch": 0.9791521770862849, + "grad_norm": 0.026990529149770737, + "learning_rate": 2.0942024611298435e-06, + "loss": 0.16851993799209594, + "step": 228070 + }, + { + "epoch": 0.9791951091763049, + "grad_norm": 0.02671569027006626, + "learning_rate": 2.0898907410122196e-06, + "loss": 0.19040234088897706, + "step": 228080 + }, + { + "epoch": 0.979238041266325, + "grad_norm": 0.0703616812825203, + "learning_rate": 2.0855790208945957e-06, + "loss": 0.14678169488906861, + "step": 228090 + }, + { + "epoch": 0.9792809733563449, + "grad_norm": 0.011678442358970642, + "learning_rate": 2.081267300776972e-06, + "loss": 0.14489219188690186, + "step": 228100 + }, + { + "epoch": 0.9793239054463649, + "grad_norm": 0.03233587369322777, + "learning_rate": 2.0769555806593482e-06, + "loss": 0.0989188551902771, + "step": 228110 + }, + { + "epoch": 0.979366837536385, + "grad_norm": 0.010937056504189968, + "learning_rate": 2.0726438605417243e-06, + "loss": 0.2196566343307495, + "step": 228120 + }, + { + "epoch": 0.9794097696264049, + "grad_norm": 0.43799400329589844, + "learning_rate": 2.068332140424101e-06, + "loss": 0.43077888488769533, + "step": 228130 + }, + { + "epoch": 0.979452701716425, + "grad_norm": 0.2214488834142685, + "learning_rate": 2.0640204203064773e-06, + "loss": 0.22788968086242675, + "step": 228140 + }, + { + "epoch": 0.979495633806445, + "grad_norm": 0.004069837741553783, + "learning_rate": 2.0597087001888534e-06, + "loss": 0.12345916032791138, + "step": 228150 + }, + { + "epoch": 0.9795385658964649, + "grad_norm": 0.025346960872411728, + "learning_rate": 2.05539698007123e-06, + "loss": 0.2612889051437378, + "step": 228160 + }, + { + "epoch": 0.979581497986485, + "grad_norm": 0.016129495576024055, + "learning_rate": 2.051085259953606e-06, + "loss": 0.3327603816986084, + "step": 228170 + }, + { + "epoch": 0.979624430076505, + "grad_norm": 0.668687105178833, + "learning_rate": 2.0467735398359824e-06, + "loss": 0.2924813270568848, + "step": 228180 + }, + { + "epoch": 0.979667362166525, + "grad_norm": 3.980806827545166, + "learning_rate": 2.0424618197183585e-06, + "loss": 0.27712819576263426, + "step": 228190 + }, + { + "epoch": 0.979710294256545, + "grad_norm": 4.9260783195495605, + "learning_rate": 2.0381500996007346e-06, + "loss": 0.18192075490951537, + "step": 228200 + }, + { + "epoch": 0.979753226346565, + "grad_norm": 5.185247898101807, + "learning_rate": 2.033838379483111e-06, + "loss": 0.22626786231994628, + "step": 228210 + }, + { + "epoch": 0.979796158436585, + "grad_norm": 0.0023593876976519823, + "learning_rate": 2.029526659365487e-06, + "loss": 0.22910573482513427, + "step": 228220 + }, + { + "epoch": 0.979839090526605, + "grad_norm": 0.22327831387519836, + "learning_rate": 2.0252149392478636e-06, + "loss": 0.16981350183486937, + "step": 228230 + }, + { + "epoch": 0.979882022616625, + "grad_norm": 0.478909432888031, + "learning_rate": 2.02090321913024e-06, + "loss": 0.12508680820465087, + "step": 228240 + }, + { + "epoch": 0.979924954706645, + "grad_norm": 0.007253910880535841, + "learning_rate": 2.016591499012616e-06, + "loss": 0.34746356010437013, + "step": 228250 + }, + { + "epoch": 0.979967886796665, + "grad_norm": 0.684379518032074, + "learning_rate": 2.0122797788949927e-06, + "loss": 0.25901849269866944, + "step": 228260 + }, + { + "epoch": 0.9800108188866851, + "grad_norm": 0.6113347411155701, + "learning_rate": 2.0079680587773688e-06, + "loss": 0.04099811613559723, + "step": 228270 + }, + { + "epoch": 0.980053750976705, + "grad_norm": 2.26021409034729, + "learning_rate": 2.003656338659745e-06, + "loss": 0.1390989303588867, + "step": 228280 + }, + { + "epoch": 0.980096683066725, + "grad_norm": 0.06597410887479782, + "learning_rate": 1.9993446185421213e-06, + "loss": 0.04822684526443481, + "step": 228290 + }, + { + "epoch": 0.9801396151567451, + "grad_norm": 0.3486667573451996, + "learning_rate": 1.9950328984244974e-06, + "loss": 0.27595837116241456, + "step": 228300 + }, + { + "epoch": 0.980182547246765, + "grad_norm": 0.01786203868687153, + "learning_rate": 1.9907211783068735e-06, + "loss": 0.1529044032096863, + "step": 228310 + }, + { + "epoch": 0.980225479336785, + "grad_norm": 2.636488437652588, + "learning_rate": 1.9864094581892504e-06, + "loss": 0.15176565647125245, + "step": 228320 + }, + { + "epoch": 0.9802684114268051, + "grad_norm": 0.136933371424675, + "learning_rate": 1.9820977380716265e-06, + "loss": 0.23685121536254883, + "step": 228330 + }, + { + "epoch": 0.9803113435168251, + "grad_norm": 0.007709937170147896, + "learning_rate": 1.9777860179540025e-06, + "loss": 0.11736712455749512, + "step": 228340 + }, + { + "epoch": 0.9803542756068451, + "grad_norm": 0.028216032311320305, + "learning_rate": 1.973474297836379e-06, + "loss": 0.2329657793045044, + "step": 228350 + }, + { + "epoch": 0.9803972076968651, + "grad_norm": 0.03977706655859947, + "learning_rate": 1.969162577718755e-06, + "loss": 0.24770543575286866, + "step": 228360 + }, + { + "epoch": 0.9804401397868852, + "grad_norm": 0.0018368628807365894, + "learning_rate": 1.9648508576011316e-06, + "loss": 0.1355770230293274, + "step": 228370 + }, + { + "epoch": 0.9804830718769051, + "grad_norm": 1.5883880853652954, + "learning_rate": 1.9605391374835077e-06, + "loss": 0.3594334840774536, + "step": 228380 + }, + { + "epoch": 0.9805260039669251, + "grad_norm": 0.007805908564478159, + "learning_rate": 1.9562274173658838e-06, + "loss": 0.3174657583236694, + "step": 228390 + }, + { + "epoch": 0.9805689360569452, + "grad_norm": 0.01015984546393156, + "learning_rate": 1.9519156972482603e-06, + "loss": 0.08694056272506714, + "step": 228400 + }, + { + "epoch": 0.9806118681469651, + "grad_norm": 1.0115506649017334, + "learning_rate": 1.9476039771306367e-06, + "loss": 0.08374444246292115, + "step": 228410 + }, + { + "epoch": 0.9806548002369851, + "grad_norm": 0.0012809137115254998, + "learning_rate": 1.943292257013013e-06, + "loss": 0.2278204917907715, + "step": 228420 + }, + { + "epoch": 0.9806977323270052, + "grad_norm": 0.02701873891055584, + "learning_rate": 1.9389805368953893e-06, + "loss": 0.21419262886047363, + "step": 228430 + }, + { + "epoch": 0.9807406644170251, + "grad_norm": 0.000725551275536418, + "learning_rate": 1.9346688167777654e-06, + "loss": 0.38900730609893797, + "step": 228440 + }, + { + "epoch": 0.9807835965070452, + "grad_norm": 0.09149477630853653, + "learning_rate": 1.930357096660142e-06, + "loss": 0.13757355213165284, + "step": 228450 + }, + { + "epoch": 0.9808265285970652, + "grad_norm": 0.22916379570960999, + "learning_rate": 1.926045376542518e-06, + "loss": 0.18153246641159057, + "step": 228460 + }, + { + "epoch": 0.9808694606870851, + "grad_norm": 0.0007235017255879939, + "learning_rate": 1.921733656424894e-06, + "loss": 0.12875158786773683, + "step": 228470 + }, + { + "epoch": 0.9809123927771052, + "grad_norm": 0.01834966614842415, + "learning_rate": 1.9174219363072705e-06, + "loss": 0.1255298972129822, + "step": 228480 + }, + { + "epoch": 0.9809553248671252, + "grad_norm": 0.016327425837516785, + "learning_rate": 1.9131102161896466e-06, + "loss": 0.17698729038238525, + "step": 228490 + }, + { + "epoch": 0.9809982569571452, + "grad_norm": 0.001244712620973587, + "learning_rate": 1.908798496072023e-06, + "loss": 0.14560694694519044, + "step": 228500 + }, + { + "epoch": 0.9810411890471652, + "grad_norm": 0.011022629216313362, + "learning_rate": 1.9044867759543994e-06, + "loss": 0.4523012638092041, + "step": 228510 + }, + { + "epoch": 0.9810841211371852, + "grad_norm": 2.6876344680786133, + "learning_rate": 1.9001750558367757e-06, + "loss": 0.14019935131072997, + "step": 228520 + }, + { + "epoch": 0.9811270532272052, + "grad_norm": 0.038009800016880035, + "learning_rate": 1.895863335719152e-06, + "loss": 0.22825298309326172, + "step": 228530 + }, + { + "epoch": 0.9811699853172252, + "grad_norm": 0.0013615781208500266, + "learning_rate": 1.8915516156015282e-06, + "loss": 0.29034783840179446, + "step": 228540 + }, + { + "epoch": 0.9812129174072453, + "grad_norm": 1.2209573984146118, + "learning_rate": 1.8872398954839045e-06, + "loss": 0.12078677415847779, + "step": 228550 + }, + { + "epoch": 0.9812558494972652, + "grad_norm": 4.229518890380859, + "learning_rate": 1.8829281753662806e-06, + "loss": 0.276714563369751, + "step": 228560 + }, + { + "epoch": 0.9812987815872852, + "grad_norm": 0.014234269969165325, + "learning_rate": 1.8786164552486569e-06, + "loss": 0.1266363263130188, + "step": 228570 + }, + { + "epoch": 0.9813417136773053, + "grad_norm": 0.09993533790111542, + "learning_rate": 1.8743047351310332e-06, + "loss": 0.23038718700408936, + "step": 228580 + }, + { + "epoch": 0.9813846457673252, + "grad_norm": 19.95162582397461, + "learning_rate": 1.8699930150134097e-06, + "loss": 0.045977193117141726, + "step": 228590 + }, + { + "epoch": 0.9814275778573452, + "grad_norm": 66.88458251953125, + "learning_rate": 1.865681294895786e-06, + "loss": 0.17263211011886598, + "step": 228600 + }, + { + "epoch": 0.9814705099473653, + "grad_norm": 0.0014390175929293036, + "learning_rate": 1.8613695747781622e-06, + "loss": 0.004979272186756134, + "step": 228610 + }, + { + "epoch": 0.9815134420373852, + "grad_norm": 0.035006795078516006, + "learning_rate": 1.8570578546605385e-06, + "loss": 0.08537711501121521, + "step": 228620 + }, + { + "epoch": 0.9815563741274053, + "grad_norm": 1.645871639251709, + "learning_rate": 1.8527461345429146e-06, + "loss": 0.09488987922668457, + "step": 228630 + }, + { + "epoch": 0.9815993062174253, + "grad_norm": 0.05962035059928894, + "learning_rate": 1.8484344144252909e-06, + "loss": 0.3014033794403076, + "step": 228640 + }, + { + "epoch": 0.9816422383074452, + "grad_norm": 6.166053295135498, + "learning_rate": 1.8441226943076671e-06, + "loss": 0.152974534034729, + "step": 228650 + }, + { + "epoch": 0.9816851703974653, + "grad_norm": 4.5735392570495605, + "learning_rate": 1.8398109741900434e-06, + "loss": 0.43787474632263185, + "step": 228660 + }, + { + "epoch": 0.9817281024874853, + "grad_norm": 1.1520566940307617, + "learning_rate": 1.8354992540724195e-06, + "loss": 0.27316927909851074, + "step": 228670 + }, + { + "epoch": 0.9817710345775053, + "grad_norm": 0.025254230946302414, + "learning_rate": 1.8311875339547962e-06, + "loss": 0.08152788877487183, + "step": 228680 + }, + { + "epoch": 0.9818139666675253, + "grad_norm": 0.04058744013309479, + "learning_rate": 1.8268758138371725e-06, + "loss": 0.022492873668670654, + "step": 228690 + }, + { + "epoch": 0.9818568987575453, + "grad_norm": 0.07055263966321945, + "learning_rate": 1.8225640937195486e-06, + "loss": 0.1144339919090271, + "step": 228700 + }, + { + "epoch": 0.9818998308475653, + "grad_norm": 0.009441941976547241, + "learning_rate": 1.8182523736019249e-06, + "loss": 0.22424943447113038, + "step": 228710 + }, + { + "epoch": 0.9819427629375853, + "grad_norm": 0.001161554828286171, + "learning_rate": 1.8139406534843011e-06, + "loss": 0.21555452346801757, + "step": 228720 + }, + { + "epoch": 0.9819856950276054, + "grad_norm": 0.0002816063060890883, + "learning_rate": 1.8096289333666774e-06, + "loss": 0.18701348304748536, + "step": 228730 + }, + { + "epoch": 0.9820286271176253, + "grad_norm": 0.5413615703582764, + "learning_rate": 1.8053172132490537e-06, + "loss": 0.15520825386047363, + "step": 228740 + }, + { + "epoch": 0.9820715592076453, + "grad_norm": 0.20203712582588196, + "learning_rate": 1.8010054931314298e-06, + "loss": 0.1745295763015747, + "step": 228750 + }, + { + "epoch": 0.9821144912976654, + "grad_norm": 0.0021156531292945147, + "learning_rate": 1.796693773013806e-06, + "loss": 0.055399179458618164, + "step": 228760 + }, + { + "epoch": 0.9821574233876854, + "grad_norm": 0.01750045455992222, + "learning_rate": 1.7923820528961828e-06, + "loss": 0.18583918809890748, + "step": 228770 + }, + { + "epoch": 0.9822003554777053, + "grad_norm": 0.007187838666141033, + "learning_rate": 1.7880703327785588e-06, + "loss": 0.060408055782318115, + "step": 228780 + }, + { + "epoch": 0.9822432875677254, + "grad_norm": 0.003980322275310755, + "learning_rate": 1.7837586126609351e-06, + "loss": 0.1356638789176941, + "step": 228790 + }, + { + "epoch": 0.9822862196577454, + "grad_norm": 1.3486459255218506, + "learning_rate": 1.7794468925433114e-06, + "loss": 0.15132099390029907, + "step": 228800 + }, + { + "epoch": 0.9823291517477654, + "grad_norm": 0.3155653178691864, + "learning_rate": 1.7751351724256877e-06, + "loss": 0.1574857473373413, + "step": 228810 + }, + { + "epoch": 0.9823720838377854, + "grad_norm": 0.0023163023870438337, + "learning_rate": 1.7708234523080638e-06, + "loss": 0.24329051971435547, + "step": 228820 + }, + { + "epoch": 0.9824150159278054, + "grad_norm": 0.0015276200138032436, + "learning_rate": 1.76651173219044e-06, + "loss": 0.3124807357788086, + "step": 228830 + }, + { + "epoch": 0.9824579480178254, + "grad_norm": 0.006269668694585562, + "learning_rate": 1.7622000120728163e-06, + "loss": 0.1125043511390686, + "step": 228840 + }, + { + "epoch": 0.9825008801078454, + "grad_norm": 0.7968956232070923, + "learning_rate": 1.7578882919551926e-06, + "loss": 0.14964487552642822, + "step": 228850 + }, + { + "epoch": 0.9825438121978655, + "grad_norm": 0.000462773023173213, + "learning_rate": 1.7535765718375687e-06, + "loss": 0.08822548389434814, + "step": 228860 + }, + { + "epoch": 0.9825867442878854, + "grad_norm": 0.04455536976456642, + "learning_rate": 1.7492648517199454e-06, + "loss": 0.21988744735717775, + "step": 228870 + }, + { + "epoch": 0.9826296763779054, + "grad_norm": 0.14884206652641296, + "learning_rate": 1.7449531316023217e-06, + "loss": 0.06686034798622131, + "step": 228880 + }, + { + "epoch": 0.9826726084679255, + "grad_norm": 0.18106015026569366, + "learning_rate": 1.7406414114846978e-06, + "loss": 0.0585746705532074, + "step": 228890 + }, + { + "epoch": 0.9827155405579454, + "grad_norm": 1.4197570085525513, + "learning_rate": 1.736329691367074e-06, + "loss": 0.045840752124786374, + "step": 228900 + }, + { + "epoch": 0.9827584726479655, + "grad_norm": 1.01323401927948, + "learning_rate": 1.7320179712494503e-06, + "loss": 0.13620872497558595, + "step": 228910 + }, + { + "epoch": 0.9828014047379855, + "grad_norm": 1.0370769500732422, + "learning_rate": 1.7277062511318266e-06, + "loss": 0.32361865043640137, + "step": 228920 + }, + { + "epoch": 0.9828443368280054, + "grad_norm": 0.024156205356121063, + "learning_rate": 1.7233945310142029e-06, + "loss": 0.13605793714523315, + "step": 228930 + }, + { + "epoch": 0.9828872689180255, + "grad_norm": 0.034660086035728455, + "learning_rate": 1.719082810896579e-06, + "loss": 0.27935500144958497, + "step": 228940 + }, + { + "epoch": 0.9829302010080455, + "grad_norm": 3.552581548690796, + "learning_rate": 1.7147710907789552e-06, + "loss": 0.2805205821990967, + "step": 228950 + }, + { + "epoch": 0.9829731330980654, + "grad_norm": 0.0014244000194594264, + "learning_rate": 1.710459370661332e-06, + "loss": 0.1224864363670349, + "step": 228960 + }, + { + "epoch": 0.9830160651880855, + "grad_norm": 0.1946384459733963, + "learning_rate": 1.706147650543708e-06, + "loss": 0.24968397617340088, + "step": 228970 + }, + { + "epoch": 0.9830589972781055, + "grad_norm": 0.009285945445299149, + "learning_rate": 1.7018359304260843e-06, + "loss": 0.023589310050010682, + "step": 228980 + }, + { + "epoch": 0.9831019293681255, + "grad_norm": 0.004341872408986092, + "learning_rate": 1.6975242103084606e-06, + "loss": 0.08136585950851441, + "step": 228990 + }, + { + "epoch": 0.9831448614581455, + "grad_norm": 0.016660042107105255, + "learning_rate": 1.6932124901908369e-06, + "loss": 0.2626636266708374, + "step": 229000 + }, + { + "epoch": 0.9831448614581455, + "eval_loss": 0.36609160900115967, + "eval_runtime": 27.3907, + "eval_samples_per_second": 3.651, + "eval_steps_per_second": 3.651, + "step": 229000 + }, + { + "epoch": 0.9831877935481655, + "grad_norm": 2.750297784805298, + "learning_rate": 1.688900770073213e-06, + "loss": 0.2714789152145386, + "step": 229010 + }, + { + "epoch": 0.9832307256381855, + "grad_norm": 1.1807098388671875, + "learning_rate": 1.6845890499555892e-06, + "loss": 0.09504563212394715, + "step": 229020 + }, + { + "epoch": 0.9832736577282055, + "grad_norm": 0.00848210509866476, + "learning_rate": 1.6802773298379655e-06, + "loss": 0.20887374877929688, + "step": 229030 + }, + { + "epoch": 0.9833165898182256, + "grad_norm": 0.053657423704862595, + "learning_rate": 1.6759656097203418e-06, + "loss": 0.31155006885528563, + "step": 229040 + }, + { + "epoch": 0.9833595219082455, + "grad_norm": 0.0011517629027366638, + "learning_rate": 1.6716538896027183e-06, + "loss": 0.4373622894287109, + "step": 229050 + }, + { + "epoch": 0.9834024539982655, + "grad_norm": 0.021499576047062874, + "learning_rate": 1.6673421694850946e-06, + "loss": 0.28340489864349366, + "step": 229060 + }, + { + "epoch": 0.9834453860882856, + "grad_norm": 1.2539126873016357, + "learning_rate": 1.6630304493674709e-06, + "loss": 0.29106993675231935, + "step": 229070 + }, + { + "epoch": 0.9834883181783055, + "grad_norm": 3.4739508628845215, + "learning_rate": 1.658718729249847e-06, + "loss": 0.0644143283367157, + "step": 229080 + }, + { + "epoch": 0.9835312502683256, + "grad_norm": 3.182511806488037, + "learning_rate": 1.6544070091322232e-06, + "loss": 0.3640718936920166, + "step": 229090 + }, + { + "epoch": 0.9835741823583456, + "grad_norm": 1.5243083238601685, + "learning_rate": 1.6500952890145995e-06, + "loss": 0.20260207653045653, + "step": 229100 + }, + { + "epoch": 0.9836171144483655, + "grad_norm": 0.06290149688720703, + "learning_rate": 1.6457835688969758e-06, + "loss": 0.32620935440063475, + "step": 229110 + }, + { + "epoch": 0.9836600465383856, + "grad_norm": 0.03638211637735367, + "learning_rate": 1.641471848779352e-06, + "loss": 0.06037415862083435, + "step": 229120 + }, + { + "epoch": 0.9837029786284056, + "grad_norm": 0.030279604718089104, + "learning_rate": 1.6371601286617281e-06, + "loss": 0.17438408136367797, + "step": 229130 + }, + { + "epoch": 0.9837459107184255, + "grad_norm": 0.0020007449202239513, + "learning_rate": 1.6328484085441049e-06, + "loss": 0.220800518989563, + "step": 229140 + }, + { + "epoch": 0.9837888428084456, + "grad_norm": 0.115732841193676, + "learning_rate": 1.6285366884264811e-06, + "loss": 0.22637467384338378, + "step": 229150 + }, + { + "epoch": 0.9838317748984656, + "grad_norm": 0.05772421509027481, + "learning_rate": 1.6242249683088572e-06, + "loss": 0.13500807285308838, + "step": 229160 + }, + { + "epoch": 0.9838747069884856, + "grad_norm": 0.0166893620043993, + "learning_rate": 1.6199132481912335e-06, + "loss": 0.1712018847465515, + "step": 229170 + }, + { + "epoch": 0.9839176390785056, + "grad_norm": 0.01135605201125145, + "learning_rate": 1.6156015280736098e-06, + "loss": 0.06851279139518737, + "step": 229180 + }, + { + "epoch": 0.9839605711685256, + "grad_norm": 0.06863057613372803, + "learning_rate": 1.611289807955986e-06, + "loss": 0.10661604404449462, + "step": 229190 + }, + { + "epoch": 0.9840035032585457, + "grad_norm": 3.3983232975006104, + "learning_rate": 1.6069780878383621e-06, + "loss": 0.21318387985229492, + "step": 229200 + }, + { + "epoch": 0.9840464353485656, + "grad_norm": 0.024844232946634293, + "learning_rate": 1.6026663677207384e-06, + "loss": 0.1229941725730896, + "step": 229210 + }, + { + "epoch": 0.9840893674385857, + "grad_norm": 2.1055259704589844, + "learning_rate": 1.5983546476031147e-06, + "loss": 0.276519250869751, + "step": 229220 + }, + { + "epoch": 0.9841322995286057, + "grad_norm": 0.04002037271857262, + "learning_rate": 1.5940429274854912e-06, + "loss": 0.17145614624023436, + "step": 229230 + }, + { + "epoch": 0.9841752316186256, + "grad_norm": 0.00037594526656903327, + "learning_rate": 1.5897312073678675e-06, + "loss": 0.22023613452911378, + "step": 229240 + }, + { + "epoch": 0.9842181637086457, + "grad_norm": 1.376724362373352, + "learning_rate": 1.5854194872502438e-06, + "loss": 0.19156415462493898, + "step": 229250 + }, + { + "epoch": 0.9842610957986657, + "grad_norm": 0.0014392342418432236, + "learning_rate": 1.58110776713262e-06, + "loss": 0.21174421310424804, + "step": 229260 + }, + { + "epoch": 0.9843040278886857, + "grad_norm": 0.0014310575788840652, + "learning_rate": 1.5767960470149963e-06, + "loss": 0.021212969720363618, + "step": 229270 + }, + { + "epoch": 0.9843469599787057, + "grad_norm": 0.0008379554492421448, + "learning_rate": 1.5724843268973724e-06, + "loss": 0.16715192794799805, + "step": 229280 + }, + { + "epoch": 0.9843898920687257, + "grad_norm": 38.737857818603516, + "learning_rate": 1.5681726067797487e-06, + "loss": 0.12215352058410645, + "step": 229290 + }, + { + "epoch": 0.9844328241587457, + "grad_norm": 2.1133065223693848, + "learning_rate": 1.563860886662125e-06, + "loss": 0.20123109817504883, + "step": 229300 + }, + { + "epoch": 0.9844757562487657, + "grad_norm": 0.1494905948638916, + "learning_rate": 1.5595491665445013e-06, + "loss": 0.11521637439727783, + "step": 229310 + }, + { + "epoch": 0.9845186883387858, + "grad_norm": 0.08978375047445297, + "learning_rate": 1.5552374464268775e-06, + "loss": 0.17968294620513917, + "step": 229320 + }, + { + "epoch": 0.9845616204288057, + "grad_norm": 0.14778830111026764, + "learning_rate": 1.5509257263092538e-06, + "loss": 0.09077887535095215, + "step": 229330 + }, + { + "epoch": 0.9846045525188257, + "grad_norm": 1.6615095138549805, + "learning_rate": 1.5466140061916303e-06, + "loss": 0.10497822761535644, + "step": 229340 + }, + { + "epoch": 0.9846474846088458, + "grad_norm": 0.014423206448554993, + "learning_rate": 1.5423022860740064e-06, + "loss": 0.005706658959388733, + "step": 229350 + }, + { + "epoch": 0.9846904166988657, + "grad_norm": 0.8330245018005371, + "learning_rate": 1.5379905659563827e-06, + "loss": 0.1308761477470398, + "step": 229360 + }, + { + "epoch": 0.9847333487888857, + "grad_norm": 3.2889111042022705, + "learning_rate": 1.533678845838759e-06, + "loss": 0.43849334716796873, + "step": 229370 + }, + { + "epoch": 0.9847762808789058, + "grad_norm": 0.02888214774429798, + "learning_rate": 1.5293671257211353e-06, + "loss": 0.26519927978515623, + "step": 229380 + }, + { + "epoch": 0.9848192129689257, + "grad_norm": 0.22269928455352783, + "learning_rate": 1.5250554056035115e-06, + "loss": 0.19621092081069946, + "step": 229390 + }, + { + "epoch": 0.9848621450589458, + "grad_norm": 0.007023406680673361, + "learning_rate": 1.5207436854858878e-06, + "loss": 0.14593169689178467, + "step": 229400 + }, + { + "epoch": 0.9849050771489658, + "grad_norm": 0.22836387157440186, + "learning_rate": 1.516431965368264e-06, + "loss": 0.19318276643753052, + "step": 229410 + }, + { + "epoch": 0.9849480092389857, + "grad_norm": 0.010965757071971893, + "learning_rate": 1.5121202452506404e-06, + "loss": 0.08530879020690918, + "step": 229420 + }, + { + "epoch": 0.9849909413290058, + "grad_norm": 0.0280422605574131, + "learning_rate": 1.5078085251330167e-06, + "loss": 0.22887613773345947, + "step": 229430 + }, + { + "epoch": 0.9850338734190258, + "grad_norm": 0.0066929347813129425, + "learning_rate": 1.503496805015393e-06, + "loss": 0.3554296255111694, + "step": 229440 + }, + { + "epoch": 0.9850768055090457, + "grad_norm": 3.0852301120758057, + "learning_rate": 1.4991850848977692e-06, + "loss": 0.18785364627838136, + "step": 229450 + }, + { + "epoch": 0.9851197375990658, + "grad_norm": 6.435519218444824, + "learning_rate": 1.4948733647801455e-06, + "loss": 0.25583548545837403, + "step": 229460 + }, + { + "epoch": 0.9851626696890858, + "grad_norm": 6.700213432312012, + "learning_rate": 1.4905616446625216e-06, + "loss": 0.28615341186523435, + "step": 229470 + }, + { + "epoch": 0.9852056017791058, + "grad_norm": 1.2810556888580322, + "learning_rate": 1.486249924544898e-06, + "loss": 0.17413702011108398, + "step": 229480 + }, + { + "epoch": 0.9852485338691258, + "grad_norm": 0.018091721460223198, + "learning_rate": 1.4819382044272744e-06, + "loss": 0.12498599290847778, + "step": 229490 + }, + { + "epoch": 0.9852914659591459, + "grad_norm": 0.043885841965675354, + "learning_rate": 1.4776264843096505e-06, + "loss": 0.09390342235565186, + "step": 229500 + }, + { + "epoch": 0.9853343980491658, + "grad_norm": 7.9579176902771, + "learning_rate": 1.4733147641920267e-06, + "loss": 0.286728048324585, + "step": 229510 + }, + { + "epoch": 0.9853773301391858, + "grad_norm": 0.016106192022562027, + "learning_rate": 1.469003044074403e-06, + "loss": 0.05692702531814575, + "step": 229520 + }, + { + "epoch": 0.9854202622292059, + "grad_norm": 6.578338623046875, + "learning_rate": 1.4646913239567795e-06, + "loss": 0.2311037540435791, + "step": 229530 + }, + { + "epoch": 0.9854631943192258, + "grad_norm": 0.002258319640532136, + "learning_rate": 1.4603796038391556e-06, + "loss": 0.2164003372192383, + "step": 229540 + }, + { + "epoch": 0.9855061264092458, + "grad_norm": 0.015184340067207813, + "learning_rate": 1.4560678837215319e-06, + "loss": 0.05240732431411743, + "step": 229550 + }, + { + "epoch": 0.9855490584992659, + "grad_norm": 0.1849614530801773, + "learning_rate": 1.4517561636039082e-06, + "loss": 0.07531896233558655, + "step": 229560 + }, + { + "epoch": 0.9855919905892858, + "grad_norm": 0.0011321872007101774, + "learning_rate": 1.4474444434862847e-06, + "loss": 0.3143985509872437, + "step": 229570 + }, + { + "epoch": 0.9856349226793059, + "grad_norm": 0.0027777322102338076, + "learning_rate": 1.4431327233686607e-06, + "loss": 0.17318645715713502, + "step": 229580 + }, + { + "epoch": 0.9856778547693259, + "grad_norm": 4.1844611167907715, + "learning_rate": 1.438821003251037e-06, + "loss": 0.09338077902793884, + "step": 229590 + }, + { + "epoch": 0.9857207868593458, + "grad_norm": 0.7365671992301941, + "learning_rate": 1.4345092831334133e-06, + "loss": 0.12292720079421997, + "step": 229600 + }, + { + "epoch": 0.9857637189493659, + "grad_norm": 0.04432155191898346, + "learning_rate": 1.4301975630157896e-06, + "loss": 0.1919556140899658, + "step": 229610 + }, + { + "epoch": 0.9858066510393859, + "grad_norm": 0.1292286068201065, + "learning_rate": 1.4258858428981659e-06, + "loss": 0.2025829315185547, + "step": 229620 + }, + { + "epoch": 0.985849583129406, + "grad_norm": 0.43458276987075806, + "learning_rate": 1.4215741227805421e-06, + "loss": 0.19809385538101196, + "step": 229630 + }, + { + "epoch": 0.9858925152194259, + "grad_norm": 4.930924892425537, + "learning_rate": 1.4172624026629184e-06, + "loss": 0.3780056953430176, + "step": 229640 + }, + { + "epoch": 0.9859354473094459, + "grad_norm": 0.002644116058945656, + "learning_rate": 1.4129506825452947e-06, + "loss": 0.12666265964508056, + "step": 229650 + }, + { + "epoch": 0.985978379399466, + "grad_norm": 0.002665426814928651, + "learning_rate": 1.408638962427671e-06, + "loss": 0.025838717818260193, + "step": 229660 + }, + { + "epoch": 0.9860213114894859, + "grad_norm": 0.9424791932106018, + "learning_rate": 1.4043272423100473e-06, + "loss": 0.07721037864685058, + "step": 229670 + }, + { + "epoch": 0.986064243579506, + "grad_norm": 0.03978941589593887, + "learning_rate": 1.4000155221924236e-06, + "loss": 0.10961095094680787, + "step": 229680 + }, + { + "epoch": 0.986107175669526, + "grad_norm": 0.0040367403998970985, + "learning_rate": 1.3957038020747999e-06, + "loss": 0.13419954776763915, + "step": 229690 + }, + { + "epoch": 0.9861501077595459, + "grad_norm": 2.451991081237793, + "learning_rate": 1.391392081957176e-06, + "loss": 0.31821794509887696, + "step": 229700 + }, + { + "epoch": 0.986193039849566, + "grad_norm": 0.006343350745737553, + "learning_rate": 1.3870803618395524e-06, + "loss": 0.11663144826889038, + "step": 229710 + }, + { + "epoch": 0.986235971939586, + "grad_norm": 1.4940195083618164, + "learning_rate": 1.3827686417219287e-06, + "loss": 0.21371042728424072, + "step": 229720 + }, + { + "epoch": 0.9862789040296059, + "grad_norm": 0.7261769771575928, + "learning_rate": 1.3784569216043048e-06, + "loss": 0.19989676475524903, + "step": 229730 + }, + { + "epoch": 0.986321836119626, + "grad_norm": 1.1416183710098267, + "learning_rate": 1.374145201486681e-06, + "loss": 0.05739124417304993, + "step": 229740 + }, + { + "epoch": 0.986364768209646, + "grad_norm": 1.768046498298645, + "learning_rate": 1.3698334813690576e-06, + "loss": 0.16595814228057862, + "step": 229750 + }, + { + "epoch": 0.986407700299666, + "grad_norm": 0.37908732891082764, + "learning_rate": 1.3655217612514338e-06, + "loss": 0.5042348861694336, + "step": 229760 + }, + { + "epoch": 0.986450632389686, + "grad_norm": 0.004125974606722593, + "learning_rate": 1.36121004113381e-06, + "loss": 0.26359896659851073, + "step": 229770 + }, + { + "epoch": 0.986493564479706, + "grad_norm": 0.008460349403321743, + "learning_rate": 1.3568983210161862e-06, + "loss": 0.14525082111358642, + "step": 229780 + }, + { + "epoch": 0.986536496569726, + "grad_norm": 1.253076195716858, + "learning_rate": 1.3525866008985625e-06, + "loss": 0.16380696296691893, + "step": 229790 + }, + { + "epoch": 0.986579428659746, + "grad_norm": 0.3819549083709717, + "learning_rate": 1.348274880780939e-06, + "loss": 0.2786677598953247, + "step": 229800 + }, + { + "epoch": 0.9866223607497661, + "grad_norm": 0.02352546714246273, + "learning_rate": 1.343963160663315e-06, + "loss": 0.08861920833587647, + "step": 229810 + }, + { + "epoch": 0.986665292839786, + "grad_norm": 0.006553748622536659, + "learning_rate": 1.3396514405456913e-06, + "loss": 0.09064736366271972, + "step": 229820 + }, + { + "epoch": 0.986708224929806, + "grad_norm": 0.03975367173552513, + "learning_rate": 1.3353397204280676e-06, + "loss": 0.08393974304199218, + "step": 229830 + }, + { + "epoch": 0.9867511570198261, + "grad_norm": 0.01190192997455597, + "learning_rate": 1.331028000310444e-06, + "loss": 0.16109168529510498, + "step": 229840 + }, + { + "epoch": 0.986794089109846, + "grad_norm": 0.017632009461522102, + "learning_rate": 1.3267162801928202e-06, + "loss": 0.02562235891819, + "step": 229850 + }, + { + "epoch": 0.986837021199866, + "grad_norm": 0.003476449754089117, + "learning_rate": 1.3224045600751965e-06, + "loss": 0.04230686128139496, + "step": 229860 + }, + { + "epoch": 0.9868799532898861, + "grad_norm": 0.005417051259428263, + "learning_rate": 1.3180928399575728e-06, + "loss": 0.07968264818191528, + "step": 229870 + }, + { + "epoch": 0.986922885379906, + "grad_norm": 0.0005414534243755043, + "learning_rate": 1.313781119839949e-06, + "loss": 0.09873697757720948, + "step": 229880 + }, + { + "epoch": 0.9869658174699261, + "grad_norm": 3.2428600788116455, + "learning_rate": 1.3094693997223253e-06, + "loss": 0.11447465419769287, + "step": 229890 + }, + { + "epoch": 0.9870087495599461, + "grad_norm": 0.9381573796272278, + "learning_rate": 1.3051576796047016e-06, + "loss": 0.2896867036819458, + "step": 229900 + }, + { + "epoch": 0.987051681649966, + "grad_norm": 0.0015979417366907, + "learning_rate": 1.3008459594870779e-06, + "loss": 0.16397885084152222, + "step": 229910 + }, + { + "epoch": 0.9870946137399861, + "grad_norm": 2.4915566444396973, + "learning_rate": 1.296534239369454e-06, + "loss": 0.4369049549102783, + "step": 229920 + }, + { + "epoch": 0.9871375458300061, + "grad_norm": 1.179463267326355, + "learning_rate": 1.2922225192518305e-06, + "loss": 0.1902614951133728, + "step": 229930 + }, + { + "epoch": 0.9871804779200261, + "grad_norm": 0.12325243651866913, + "learning_rate": 1.2879107991342067e-06, + "loss": 0.14833759069442748, + "step": 229940 + }, + { + "epoch": 0.9872234100100461, + "grad_norm": 0.050647057592868805, + "learning_rate": 1.283599079016583e-06, + "loss": 0.1878619074821472, + "step": 229950 + }, + { + "epoch": 0.9872663421000661, + "grad_norm": 0.003908630460500717, + "learning_rate": 1.279287358898959e-06, + "loss": 0.300618839263916, + "step": 229960 + }, + { + "epoch": 0.9873092741900861, + "grad_norm": 0.04351991042494774, + "learning_rate": 1.2749756387813354e-06, + "loss": 0.16139765977859497, + "step": 229970 + }, + { + "epoch": 0.9873522062801061, + "grad_norm": 0.992188036441803, + "learning_rate": 1.2706639186637119e-06, + "loss": 0.22291405200958253, + "step": 229980 + }, + { + "epoch": 0.9873951383701262, + "grad_norm": 6.605251789093018, + "learning_rate": 1.2663521985460882e-06, + "loss": 0.0996120810508728, + "step": 229990 + }, + { + "epoch": 0.9874380704601461, + "grad_norm": 0.0715533122420311, + "learning_rate": 1.2620404784284642e-06, + "loss": 0.14207704067230226, + "step": 230000 + }, + { + "epoch": 0.9874380704601461, + "eval_loss": 0.367033451795578, + "eval_runtime": 27.4805, + "eval_samples_per_second": 3.639, + "eval_steps_per_second": 3.639, + "step": 230000 + }, + { + "epoch": 0.9874810025501661, + "grad_norm": 0.060485634952783585, + "learning_rate": 1.2577287583108405e-06, + "loss": 0.0019446693360805512, + "step": 230010 + }, + { + "epoch": 0.9875239346401862, + "grad_norm": 0.0023204952012747526, + "learning_rate": 1.2534170381932168e-06, + "loss": 0.31025264263153074, + "step": 230020 + }, + { + "epoch": 0.9875668667302061, + "grad_norm": 2.3061840534210205, + "learning_rate": 1.249105318075593e-06, + "loss": 0.17957751750946044, + "step": 230030 + }, + { + "epoch": 0.9876097988202261, + "grad_norm": 0.0011746891541406512, + "learning_rate": 1.2447935979579694e-06, + "loss": 0.2284639835357666, + "step": 230040 + }, + { + "epoch": 0.9876527309102462, + "grad_norm": 0.05280338600277901, + "learning_rate": 1.2404818778403457e-06, + "loss": 0.30267183780670165, + "step": 230050 + }, + { + "epoch": 0.9876956630002662, + "grad_norm": 0.1548868864774704, + "learning_rate": 1.236170157722722e-06, + "loss": 0.11789828538894653, + "step": 230060 + }, + { + "epoch": 0.9877385950902862, + "grad_norm": 0.7587932348251343, + "learning_rate": 1.2318584376050982e-06, + "loss": 0.07684867978096008, + "step": 230070 + }, + { + "epoch": 0.9877815271803062, + "grad_norm": 1.2028552293777466, + "learning_rate": 1.2275467174874745e-06, + "loss": 0.24567055702209473, + "step": 230080 + }, + { + "epoch": 0.9878244592703262, + "grad_norm": 0.00883434247225523, + "learning_rate": 1.2232349973698508e-06, + "loss": 0.10406320095062256, + "step": 230090 + }, + { + "epoch": 0.9878673913603462, + "grad_norm": 0.02592717483639717, + "learning_rate": 1.218923277252227e-06, + "loss": 0.25481562614440917, + "step": 230100 + }, + { + "epoch": 0.9879103234503662, + "grad_norm": 6.862185001373291, + "learning_rate": 1.2146115571346034e-06, + "loss": 0.3252408981323242, + "step": 230110 + }, + { + "epoch": 0.9879532555403863, + "grad_norm": 0.0014842419186607003, + "learning_rate": 1.2102998370169796e-06, + "loss": 0.19278960227966307, + "step": 230120 + }, + { + "epoch": 0.9879961876304062, + "grad_norm": 0.38045448064804077, + "learning_rate": 1.205988116899356e-06, + "loss": 0.09647968411445618, + "step": 230130 + }, + { + "epoch": 0.9880391197204262, + "grad_norm": 0.0013819060986861587, + "learning_rate": 1.2016763967817322e-06, + "loss": 0.30128400325775145, + "step": 230140 + }, + { + "epoch": 0.9880820518104463, + "grad_norm": 0.012147623114287853, + "learning_rate": 1.1973646766641083e-06, + "loss": 0.24420483112335206, + "step": 230150 + }, + { + "epoch": 0.9881249839004662, + "grad_norm": 0.0804034173488617, + "learning_rate": 1.1930529565464848e-06, + "loss": 0.31123244762420654, + "step": 230160 + }, + { + "epoch": 0.9881679159904863, + "grad_norm": 0.006644760724157095, + "learning_rate": 1.188741236428861e-06, + "loss": 0.18235220909118652, + "step": 230170 + }, + { + "epoch": 0.9882108480805063, + "grad_norm": 0.8137630224227905, + "learning_rate": 1.1844295163112373e-06, + "loss": 0.10330394506454468, + "step": 230180 + }, + { + "epoch": 0.9882537801705262, + "grad_norm": 0.013571128249168396, + "learning_rate": 1.1801177961936134e-06, + "loss": 0.08659087419509888, + "step": 230190 + }, + { + "epoch": 0.9882967122605463, + "grad_norm": 0.011523943394422531, + "learning_rate": 1.1758060760759897e-06, + "loss": 0.2964245557785034, + "step": 230200 + }, + { + "epoch": 0.9883396443505663, + "grad_norm": 0.1999071091413498, + "learning_rate": 1.1714943559583662e-06, + "loss": 0.27459328174591063, + "step": 230210 + }, + { + "epoch": 0.9883825764405862, + "grad_norm": 0.0024741236120462418, + "learning_rate": 1.1671826358407425e-06, + "loss": 0.03986948430538177, + "step": 230220 + }, + { + "epoch": 0.9884255085306063, + "grad_norm": 0.007317548152059317, + "learning_rate": 1.1628709157231186e-06, + "loss": 0.1312456488609314, + "step": 230230 + }, + { + "epoch": 0.9884684406206263, + "grad_norm": 0.020278507843613625, + "learning_rate": 1.1585591956054948e-06, + "loss": 0.07149394154548645, + "step": 230240 + }, + { + "epoch": 0.9885113727106463, + "grad_norm": 0.03085501119494438, + "learning_rate": 1.1542474754878713e-06, + "loss": 0.30056056976318357, + "step": 230250 + }, + { + "epoch": 0.9885543048006663, + "grad_norm": 2.383174180984497, + "learning_rate": 1.1499357553702474e-06, + "loss": 0.19107441902160643, + "step": 230260 + }, + { + "epoch": 0.9885972368906863, + "grad_norm": 0.20591601729393005, + "learning_rate": 1.1456240352526237e-06, + "loss": 0.028360658884048463, + "step": 230270 + }, + { + "epoch": 0.9886401689807063, + "grad_norm": 0.023873161524534225, + "learning_rate": 1.141312315135e-06, + "loss": 0.24436991214752196, + "step": 230280 + }, + { + "epoch": 0.9886831010707263, + "grad_norm": 1.3845298290252686, + "learning_rate": 1.1370005950173763e-06, + "loss": 0.3007499217987061, + "step": 230290 + }, + { + "epoch": 0.9887260331607464, + "grad_norm": 0.015084992162883282, + "learning_rate": 1.1326888748997525e-06, + "loss": 0.22297992706298828, + "step": 230300 + }, + { + "epoch": 0.9887689652507663, + "grad_norm": 0.5837798714637756, + "learning_rate": 1.1283771547821288e-06, + "loss": 0.26232342720031737, + "step": 230310 + }, + { + "epoch": 0.9888118973407863, + "grad_norm": 0.14470583200454712, + "learning_rate": 1.1240654346645051e-06, + "loss": 0.19923378229141236, + "step": 230320 + }, + { + "epoch": 0.9888548294308064, + "grad_norm": 0.2220803201198578, + "learning_rate": 1.1197537145468814e-06, + "loss": 0.1936778426170349, + "step": 230330 + }, + { + "epoch": 0.9888977615208263, + "grad_norm": 3.847043752670288, + "learning_rate": 1.1154419944292577e-06, + "loss": 0.07521622180938721, + "step": 230340 + }, + { + "epoch": 0.9889406936108464, + "grad_norm": 0.016686517745256424, + "learning_rate": 1.111130274311634e-06, + "loss": 0.11862159967422485, + "step": 230350 + }, + { + "epoch": 0.9889836257008664, + "grad_norm": 3.9269278049468994, + "learning_rate": 1.1068185541940103e-06, + "loss": 0.2966684579849243, + "step": 230360 + }, + { + "epoch": 0.9890265577908863, + "grad_norm": 0.10296718031167984, + "learning_rate": 1.1025068340763865e-06, + "loss": 0.16811978816986084, + "step": 230370 + }, + { + "epoch": 0.9890694898809064, + "grad_norm": 0.07834142446517944, + "learning_rate": 1.0981951139587626e-06, + "loss": 0.07888695001602172, + "step": 230380 + }, + { + "epoch": 0.9891124219709264, + "grad_norm": 1.7769652605056763, + "learning_rate": 1.093883393841139e-06, + "loss": 0.287352180480957, + "step": 230390 + }, + { + "epoch": 0.9891553540609463, + "grad_norm": 0.0602647066116333, + "learning_rate": 1.0895716737235154e-06, + "loss": 0.16694862842559816, + "step": 230400 + }, + { + "epoch": 0.9891982861509664, + "grad_norm": 0.7788509726524353, + "learning_rate": 1.0852599536058917e-06, + "loss": 0.18424661159515382, + "step": 230410 + }, + { + "epoch": 0.9892412182409864, + "grad_norm": 2.877631664276123, + "learning_rate": 1.0809482334882677e-06, + "loss": 0.12469482421875, + "step": 230420 + }, + { + "epoch": 0.9892841503310064, + "grad_norm": 0.00023763379431329668, + "learning_rate": 1.0766365133706442e-06, + "loss": 0.10847523212432861, + "step": 230430 + }, + { + "epoch": 0.9893270824210264, + "grad_norm": 4.226632595062256, + "learning_rate": 1.0723247932530205e-06, + "loss": 0.08445930480957031, + "step": 230440 + }, + { + "epoch": 0.9893700145110464, + "grad_norm": 0.3350447118282318, + "learning_rate": 1.0680130731353966e-06, + "loss": 0.006454658508300781, + "step": 230450 + }, + { + "epoch": 0.9894129466010664, + "grad_norm": 6.135351657867432, + "learning_rate": 1.0637013530177729e-06, + "loss": 0.22950005531311035, + "step": 230460 + }, + { + "epoch": 0.9894558786910864, + "grad_norm": 0.987662672996521, + "learning_rate": 1.0593896329001492e-06, + "loss": 0.12434533834457398, + "step": 230470 + }, + { + "epoch": 0.9894988107811065, + "grad_norm": 1.4658321142196655, + "learning_rate": 1.0550779127825257e-06, + "loss": 0.2138460874557495, + "step": 230480 + }, + { + "epoch": 0.9895417428711265, + "grad_norm": 0.0016754432581365108, + "learning_rate": 1.0507661926649017e-06, + "loss": 0.3065653324127197, + "step": 230490 + }, + { + "epoch": 0.9895846749611464, + "grad_norm": 0.002386566484346986, + "learning_rate": 1.046454472547278e-06, + "loss": 0.04717585146427154, + "step": 230500 + }, + { + "epoch": 0.9896276070511665, + "grad_norm": 0.03728202357888222, + "learning_rate": 1.0421427524296543e-06, + "loss": 0.026825031638145445, + "step": 230510 + }, + { + "epoch": 0.9896705391411865, + "grad_norm": 0.03018287941813469, + "learning_rate": 1.0378310323120306e-06, + "loss": 0.09836741089820862, + "step": 230520 + }, + { + "epoch": 0.9897134712312065, + "grad_norm": 0.006733125075697899, + "learning_rate": 1.0335193121944069e-06, + "loss": 0.3137779951095581, + "step": 230530 + }, + { + "epoch": 0.9897564033212265, + "grad_norm": 0.040674638003110886, + "learning_rate": 1.0292075920767832e-06, + "loss": 0.03551376461982727, + "step": 230540 + }, + { + "epoch": 0.9897993354112465, + "grad_norm": 0.3072684705257416, + "learning_rate": 1.0248958719591594e-06, + "loss": 0.2572211265563965, + "step": 230550 + }, + { + "epoch": 0.9898422675012665, + "grad_norm": 2.135777473449707, + "learning_rate": 1.0205841518415357e-06, + "loss": 0.06069689989089966, + "step": 230560 + }, + { + "epoch": 0.9898851995912865, + "grad_norm": 3.5530755519866943, + "learning_rate": 1.016272431723912e-06, + "loss": 0.1646146297454834, + "step": 230570 + }, + { + "epoch": 0.9899281316813066, + "grad_norm": 2.017244815826416, + "learning_rate": 1.0119607116062883e-06, + "loss": 0.17254488468170165, + "step": 230580 + }, + { + "epoch": 0.9899710637713265, + "grad_norm": 0.00539855333045125, + "learning_rate": 1.0076489914886646e-06, + "loss": 0.24719760417938233, + "step": 230590 + }, + { + "epoch": 0.9900139958613465, + "grad_norm": 2.330122947692871, + "learning_rate": 1.0033372713710409e-06, + "loss": 0.15231788158416748, + "step": 230600 + }, + { + "epoch": 0.9900569279513666, + "grad_norm": 0.00920083187520504, + "learning_rate": 9.99025551253417e-07, + "loss": 0.3541694641113281, + "step": 230610 + }, + { + "epoch": 0.9900998600413865, + "grad_norm": 2.9414072036743164, + "learning_rate": 9.947138311357934e-07, + "loss": 0.0661549985408783, + "step": 230620 + }, + { + "epoch": 0.9901427921314065, + "grad_norm": 1.7807673215866089, + "learning_rate": 9.904021110181697e-07, + "loss": 0.20318183898925782, + "step": 230630 + }, + { + "epoch": 0.9901857242214266, + "grad_norm": 0.013886654749512672, + "learning_rate": 9.860903909005458e-07, + "loss": 0.14823532104492188, + "step": 230640 + }, + { + "epoch": 0.9902286563114465, + "grad_norm": 0.048982731997966766, + "learning_rate": 9.81778670782922e-07, + "loss": 0.0046006467193365095, + "step": 230650 + }, + { + "epoch": 0.9902715884014666, + "grad_norm": 0.2466362863779068, + "learning_rate": 9.774669506652986e-07, + "loss": 0.23479697704315186, + "step": 230660 + }, + { + "epoch": 0.9903145204914866, + "grad_norm": 0.19999799132347107, + "learning_rate": 9.731552305476748e-07, + "loss": 0.1353333353996277, + "step": 230670 + }, + { + "epoch": 0.9903574525815065, + "grad_norm": 0.005991742480546236, + "learning_rate": 9.68843510430051e-07, + "loss": 0.07191218137741089, + "step": 230680 + }, + { + "epoch": 0.9904003846715266, + "grad_norm": 0.3562621474266052, + "learning_rate": 9.645317903124272e-07, + "loss": 0.3172381162643433, + "step": 230690 + }, + { + "epoch": 0.9904433167615466, + "grad_norm": 0.008976894430816174, + "learning_rate": 9.602200701948035e-07, + "loss": 0.1105201005935669, + "step": 230700 + }, + { + "epoch": 0.9904862488515666, + "grad_norm": 35.98250198364258, + "learning_rate": 9.5590835007718e-07, + "loss": 0.2461772680282593, + "step": 230710 + }, + { + "epoch": 0.9905291809415866, + "grad_norm": 0.4796089231967926, + "learning_rate": 9.515966299595562e-07, + "loss": 0.07134444117546082, + "step": 230720 + }, + { + "epoch": 0.9905721130316066, + "grad_norm": 0.3961131274700165, + "learning_rate": 9.472849098419323e-07, + "loss": 0.16740727424621582, + "step": 230730 + }, + { + "epoch": 0.9906150451216266, + "grad_norm": 0.03454848751425743, + "learning_rate": 9.429731897243086e-07, + "loss": 0.20739531517028809, + "step": 230740 + }, + { + "epoch": 0.9906579772116466, + "grad_norm": 0.015018555335700512, + "learning_rate": 9.38661469606685e-07, + "loss": 0.2724655866622925, + "step": 230750 + }, + { + "epoch": 0.9907009093016667, + "grad_norm": 1.8921265602111816, + "learning_rate": 9.343497494890613e-07, + "loss": 0.3332388162612915, + "step": 230760 + }, + { + "epoch": 0.9907438413916866, + "grad_norm": 0.10715219378471375, + "learning_rate": 9.300380293714375e-07, + "loss": 0.004315024986863136, + "step": 230770 + }, + { + "epoch": 0.9907867734817066, + "grad_norm": 4.257637023925781, + "learning_rate": 9.257263092538138e-07, + "loss": 0.09003528356552123, + "step": 230780 + }, + { + "epoch": 0.9908297055717267, + "grad_norm": 1.0070799589157104, + "learning_rate": 9.214145891361899e-07, + "loss": 0.1680148720741272, + "step": 230790 + }, + { + "epoch": 0.9908726376617466, + "grad_norm": 1.6183319091796875, + "learning_rate": 9.171028690185663e-07, + "loss": 0.1481905460357666, + "step": 230800 + }, + { + "epoch": 0.9909155697517666, + "grad_norm": 0.005772717762738466, + "learning_rate": 9.127911489009426e-07, + "loss": 0.05922789573669433, + "step": 230810 + }, + { + "epoch": 0.9909585018417867, + "grad_norm": 0.2136821448802948, + "learning_rate": 9.084794287833189e-07, + "loss": 0.143665611743927, + "step": 230820 + }, + { + "epoch": 0.9910014339318066, + "grad_norm": 0.7301328778266907, + "learning_rate": 9.041677086656951e-07, + "loss": 0.03996670842170715, + "step": 230830 + }, + { + "epoch": 0.9910443660218267, + "grad_norm": 0.0018787942826747894, + "learning_rate": 8.998559885480715e-07, + "loss": 0.22617673873901367, + "step": 230840 + }, + { + "epoch": 0.9910872981118467, + "grad_norm": 0.10825284570455551, + "learning_rate": 8.955442684304478e-07, + "loss": 0.1180370569229126, + "step": 230850 + }, + { + "epoch": 0.9911302302018666, + "grad_norm": 16.12891387939453, + "learning_rate": 8.912325483128239e-07, + "loss": 0.2612591743469238, + "step": 230860 + }, + { + "epoch": 0.9911731622918867, + "grad_norm": 0.0020049717277288437, + "learning_rate": 8.869208281952002e-07, + "loss": 0.09563700556755066, + "step": 230870 + }, + { + "epoch": 0.9912160943819067, + "grad_norm": 0.002001575892791152, + "learning_rate": 8.826091080775764e-07, + "loss": 0.39770505428314207, + "step": 230880 + }, + { + "epoch": 0.9912590264719267, + "grad_norm": 0.7403496503829956, + "learning_rate": 8.782973879599529e-07, + "loss": 0.019684380292892455, + "step": 230890 + }, + { + "epoch": 0.9913019585619467, + "grad_norm": 0.001754825352691114, + "learning_rate": 8.739856678423291e-07, + "loss": 0.16134670972824097, + "step": 230900 + }, + { + "epoch": 0.9913448906519667, + "grad_norm": 0.3392312824726105, + "learning_rate": 8.696739477247054e-07, + "loss": 0.16810760498046876, + "step": 230910 + }, + { + "epoch": 0.9913878227419868, + "grad_norm": 3.1423470973968506, + "learning_rate": 8.653622276070815e-07, + "loss": 0.24040436744689941, + "step": 230920 + }, + { + "epoch": 0.9914307548320067, + "grad_norm": 0.0010441187769174576, + "learning_rate": 8.610505074894578e-07, + "loss": 0.1794663906097412, + "step": 230930 + }, + { + "epoch": 0.9914736869220268, + "grad_norm": 1.1992433071136475, + "learning_rate": 8.567387873718342e-07, + "loss": 0.13717471361160277, + "step": 230940 + }, + { + "epoch": 0.9915166190120468, + "grad_norm": 0.11305972188711166, + "learning_rate": 8.524270672542105e-07, + "loss": 0.1720863938331604, + "step": 230950 + }, + { + "epoch": 0.9915595511020667, + "grad_norm": 0.01105725672096014, + "learning_rate": 8.481153471365867e-07, + "loss": 0.16816022396087646, + "step": 230960 + }, + { + "epoch": 0.9916024831920868, + "grad_norm": 6.651047229766846, + "learning_rate": 8.43803627018963e-07, + "loss": 0.22323966026306152, + "step": 230970 + }, + { + "epoch": 0.9916454152821068, + "grad_norm": 0.0015010848874226213, + "learning_rate": 8.394919069013393e-07, + "loss": 0.2742466926574707, + "step": 230980 + }, + { + "epoch": 0.9916883473721267, + "grad_norm": 0.009254826232790947, + "learning_rate": 8.351801867837155e-07, + "loss": 0.19576044082641603, + "step": 230990 + }, + { + "epoch": 0.9917312794621468, + "grad_norm": 0.001263033365830779, + "learning_rate": 8.308684666660918e-07, + "loss": 0.09472488164901734, + "step": 231000 + }, + { + "epoch": 0.9917312794621468, + "eval_loss": 0.36783263087272644, + "eval_runtime": 27.4534, + "eval_samples_per_second": 3.643, + "eval_steps_per_second": 3.643, + "step": 231000 + }, + { + "epoch": 0.9917742115521668, + "grad_norm": 1.9732149839401245, + "learning_rate": 8.265567465484681e-07, + "loss": 0.23163745403289795, + "step": 231010 + }, + { + "epoch": 0.9918171436421868, + "grad_norm": 7.527266025543213, + "learning_rate": 8.222450264308443e-07, + "loss": 0.36402325630187987, + "step": 231020 + }, + { + "epoch": 0.9918600757322068, + "grad_norm": 0.02747558429837227, + "learning_rate": 8.179333063132207e-07, + "loss": 0.16112041473388672, + "step": 231030 + }, + { + "epoch": 0.9919030078222268, + "grad_norm": 0.014340748079121113, + "learning_rate": 8.136215861955969e-07, + "loss": 0.22366724014282227, + "step": 231040 + }, + { + "epoch": 0.9919459399122468, + "grad_norm": 1.6919987201690674, + "learning_rate": 8.093098660779731e-07, + "loss": 0.3583649158477783, + "step": 231050 + }, + { + "epoch": 0.9919888720022668, + "grad_norm": 0.006823307368904352, + "learning_rate": 8.049981459603494e-07, + "loss": 0.30974841117858887, + "step": 231060 + }, + { + "epoch": 0.9920318040922869, + "grad_norm": 0.005828527733683586, + "learning_rate": 8.006864258427258e-07, + "loss": 0.1370411992073059, + "step": 231070 + }, + { + "epoch": 0.9920747361823068, + "grad_norm": 0.008763202466070652, + "learning_rate": 7.963747057251021e-07, + "loss": 0.1355002760887146, + "step": 231080 + }, + { + "epoch": 0.9921176682723268, + "grad_norm": 0.003385061165317893, + "learning_rate": 7.920629856074783e-07, + "loss": 0.25493133068084717, + "step": 231090 + }, + { + "epoch": 0.9921606003623469, + "grad_norm": 0.03868903964757919, + "learning_rate": 7.877512654898545e-07, + "loss": 0.22841250896453857, + "step": 231100 + }, + { + "epoch": 0.9922035324523668, + "grad_norm": 4.405091285705566, + "learning_rate": 7.834395453722307e-07, + "loss": 0.14411439895629882, + "step": 231110 + }, + { + "epoch": 0.9922464645423869, + "grad_norm": 0.006356885191053152, + "learning_rate": 7.791278252546071e-07, + "loss": 0.24147932529449462, + "step": 231120 + }, + { + "epoch": 0.9922893966324069, + "grad_norm": 0.943030059337616, + "learning_rate": 7.748161051369834e-07, + "loss": 0.2868966579437256, + "step": 231130 + }, + { + "epoch": 0.9923323287224268, + "grad_norm": 4.7073588371276855, + "learning_rate": 7.705043850193597e-07, + "loss": 0.22108216285705568, + "step": 231140 + }, + { + "epoch": 0.9923752608124469, + "grad_norm": 0.01346675492823124, + "learning_rate": 7.66192664901736e-07, + "loss": 0.22234489917755126, + "step": 231150 + }, + { + "epoch": 0.9924181929024669, + "grad_norm": 1.4010380506515503, + "learning_rate": 7.618809447841122e-07, + "loss": 0.23675973415374757, + "step": 231160 + }, + { + "epoch": 0.9924611249924868, + "grad_norm": 1.4407527446746826, + "learning_rate": 7.575692246664885e-07, + "loss": 0.21478147506713868, + "step": 231170 + }, + { + "epoch": 0.9925040570825069, + "grad_norm": 0.014923127368092537, + "learning_rate": 7.532575045488648e-07, + "loss": 0.018306614458560945, + "step": 231180 + }, + { + "epoch": 0.9925469891725269, + "grad_norm": 0.010917030274868011, + "learning_rate": 7.48945784431241e-07, + "loss": 0.20243346691131592, + "step": 231190 + }, + { + "epoch": 0.9925899212625469, + "grad_norm": 5.202339172363281, + "learning_rate": 7.446340643136173e-07, + "loss": 0.48285846710205077, + "step": 231200 + }, + { + "epoch": 0.9926328533525669, + "grad_norm": 0.31020039319992065, + "learning_rate": 7.403223441959936e-07, + "loss": 0.2021394968032837, + "step": 231210 + }, + { + "epoch": 0.9926757854425869, + "grad_norm": 0.019344815984368324, + "learning_rate": 7.360106240783698e-07, + "loss": 0.21105799674987794, + "step": 231220 + }, + { + "epoch": 0.9927187175326069, + "grad_norm": 0.8539208173751831, + "learning_rate": 7.316989039607461e-07, + "loss": 0.2706784725189209, + "step": 231230 + }, + { + "epoch": 0.9927616496226269, + "grad_norm": 1.9695597887039185, + "learning_rate": 7.273871838431224e-07, + "loss": 0.15657505989074708, + "step": 231240 + }, + { + "epoch": 0.992804581712647, + "grad_norm": 7.32958459854126, + "learning_rate": 7.230754637254987e-07, + "loss": 0.22953457832336427, + "step": 231250 + }, + { + "epoch": 0.9928475138026669, + "grad_norm": 0.0042612794786691666, + "learning_rate": 7.187637436078749e-07, + "loss": 0.026990744471549987, + "step": 231260 + }, + { + "epoch": 0.9928904458926869, + "grad_norm": 0.039048999547958374, + "learning_rate": 7.144520234902513e-07, + "loss": 0.10248461961746216, + "step": 231270 + }, + { + "epoch": 0.992933377982707, + "grad_norm": 1.1401162147521973, + "learning_rate": 7.101403033726274e-07, + "loss": 0.2755697965621948, + "step": 231280 + }, + { + "epoch": 0.9929763100727269, + "grad_norm": 0.04793756455183029, + "learning_rate": 7.058285832550038e-07, + "loss": 0.0071813158690929415, + "step": 231290 + }, + { + "epoch": 0.993019242162747, + "grad_norm": 0.010070315562188625, + "learning_rate": 7.0151686313738e-07, + "loss": 0.14669071435928344, + "step": 231300 + }, + { + "epoch": 0.993062174252767, + "grad_norm": 1.1468803882598877, + "learning_rate": 6.972051430197564e-07, + "loss": 0.057636570930480954, + "step": 231310 + }, + { + "epoch": 0.9931051063427869, + "grad_norm": 0.005894109606742859, + "learning_rate": 6.928934229021326e-07, + "loss": 0.3529577016830444, + "step": 231320 + }, + { + "epoch": 0.993148038432807, + "grad_norm": 0.027687201276421547, + "learning_rate": 6.88581702784509e-07, + "loss": 0.13618324995040892, + "step": 231330 + }, + { + "epoch": 0.993190970522827, + "grad_norm": 0.02283935621380806, + "learning_rate": 6.842699826668851e-07, + "loss": 0.17922990322113036, + "step": 231340 + }, + { + "epoch": 0.993233902612847, + "grad_norm": 0.06553700566291809, + "learning_rate": 6.799582625492614e-07, + "loss": 0.05254532098770141, + "step": 231350 + }, + { + "epoch": 0.993276834702867, + "grad_norm": 0.2980232238769531, + "learning_rate": 6.756465424316377e-07, + "loss": 0.3425051927566528, + "step": 231360 + }, + { + "epoch": 0.993319766792887, + "grad_norm": 1.3741534948349, + "learning_rate": 6.71334822314014e-07, + "loss": 0.32896277904510496, + "step": 231370 + }, + { + "epoch": 0.9933626988829071, + "grad_norm": 2.6923539638519287, + "learning_rate": 6.670231021963903e-07, + "loss": 0.19083750247955322, + "step": 231380 + }, + { + "epoch": 0.993405630972927, + "grad_norm": 4.866817474365234, + "learning_rate": 6.627113820787666e-07, + "loss": 0.23523907661437987, + "step": 231390 + }, + { + "epoch": 0.993448563062947, + "grad_norm": 0.19903796911239624, + "learning_rate": 6.583996619611429e-07, + "loss": 0.12940009832382202, + "step": 231400 + }, + { + "epoch": 0.9934914951529671, + "grad_norm": 1.1251431703567505, + "learning_rate": 6.54087941843519e-07, + "loss": 0.13517963886260986, + "step": 231410 + }, + { + "epoch": 0.993534427242987, + "grad_norm": 0.09284210950136185, + "learning_rate": 6.497762217258954e-07, + "loss": 0.019798481464385988, + "step": 231420 + }, + { + "epoch": 0.9935773593330071, + "grad_norm": 16.369192123413086, + "learning_rate": 6.454645016082716e-07, + "loss": 0.2830683708190918, + "step": 231430 + }, + { + "epoch": 0.9936202914230271, + "grad_norm": 0.0024391154292970896, + "learning_rate": 6.411527814906479e-07, + "loss": 0.17113139629364013, + "step": 231440 + }, + { + "epoch": 0.993663223513047, + "grad_norm": 0.0015551660908386111, + "learning_rate": 6.368410613730242e-07, + "loss": 0.10649877786636353, + "step": 231450 + }, + { + "epoch": 0.9937061556030671, + "grad_norm": 5.168276309967041, + "learning_rate": 6.325293412554004e-07, + "loss": 0.19731935262680053, + "step": 231460 + }, + { + "epoch": 0.9937490876930871, + "grad_norm": 1.3026635646820068, + "learning_rate": 6.282176211377767e-07, + "loss": 0.24371423721313476, + "step": 231470 + }, + { + "epoch": 0.993792019783107, + "grad_norm": 4.633128643035889, + "learning_rate": 6.23905901020153e-07, + "loss": 0.21536602973937988, + "step": 231480 + }, + { + "epoch": 0.9938349518731271, + "grad_norm": 1.4385215044021606, + "learning_rate": 6.195941809025293e-07, + "loss": 0.21898224353790283, + "step": 231490 + }, + { + "epoch": 0.9938778839631471, + "grad_norm": 3.3495383262634277, + "learning_rate": 6.152824607849056e-07, + "loss": 0.14156938791275026, + "step": 231500 + }, + { + "epoch": 0.9939208160531671, + "grad_norm": 0.0007417344022542238, + "learning_rate": 6.109707406672818e-07, + "loss": 0.11812833547592164, + "step": 231510 + }, + { + "epoch": 0.9939637481431871, + "grad_norm": 0.034221477806568146, + "learning_rate": 6.066590205496582e-07, + "loss": 0.18498988151550294, + "step": 231520 + }, + { + "epoch": 0.9940066802332072, + "grad_norm": 0.01323069166392088, + "learning_rate": 6.023473004320343e-07, + "loss": 0.194827401638031, + "step": 231530 + }, + { + "epoch": 0.9940496123232271, + "grad_norm": 0.571534276008606, + "learning_rate": 5.980355803144107e-07, + "loss": 0.19657092094421386, + "step": 231540 + }, + { + "epoch": 0.9940925444132471, + "grad_norm": 0.00028535982710309327, + "learning_rate": 5.937238601967869e-07, + "loss": 0.10385684967041016, + "step": 231550 + }, + { + "epoch": 0.9941354765032672, + "grad_norm": 0.05803811177611351, + "learning_rate": 5.894121400791632e-07, + "loss": 0.29644711017608644, + "step": 231560 + }, + { + "epoch": 0.9941784085932871, + "grad_norm": 0.004579696338623762, + "learning_rate": 5.851004199615395e-07, + "loss": 0.20959279537200928, + "step": 231570 + }, + { + "epoch": 0.9942213406833071, + "grad_norm": 0.0018657728796824813, + "learning_rate": 5.807886998439158e-07, + "loss": 0.08454373478889465, + "step": 231580 + }, + { + "epoch": 0.9942642727733272, + "grad_norm": 0.008013848215341568, + "learning_rate": 5.76476979726292e-07, + "loss": 0.28561060428619384, + "step": 231590 + }, + { + "epoch": 0.9943072048633471, + "grad_norm": 4.572896957397461, + "learning_rate": 5.721652596086683e-07, + "loss": 0.20241074562072753, + "step": 231600 + }, + { + "epoch": 0.9943501369533672, + "grad_norm": 0.0008746059611439705, + "learning_rate": 5.678535394910446e-07, + "loss": 0.16773425340652465, + "step": 231610 + }, + { + "epoch": 0.9943930690433872, + "grad_norm": 0.04221741482615471, + "learning_rate": 5.635418193734208e-07, + "loss": 0.07259194850921631, + "step": 231620 + }, + { + "epoch": 0.9944360011334071, + "grad_norm": 2.6986231803894043, + "learning_rate": 5.592300992557972e-07, + "loss": 0.19206438064575196, + "step": 231630 + }, + { + "epoch": 0.9944789332234272, + "grad_norm": 9.255887985229492, + "learning_rate": 5.549183791381734e-07, + "loss": 0.5112367630004883, + "step": 231640 + }, + { + "epoch": 0.9945218653134472, + "grad_norm": 0.061653558164834976, + "learning_rate": 5.506066590205497e-07, + "loss": 0.09895297288894653, + "step": 231650 + }, + { + "epoch": 0.9945647974034671, + "grad_norm": 5.419751167297363, + "learning_rate": 5.462949389029259e-07, + "loss": 0.07865924835205078, + "step": 231660 + }, + { + "epoch": 0.9946077294934872, + "grad_norm": 1.7274482250213623, + "learning_rate": 5.419832187853023e-07, + "loss": 0.09330617189407349, + "step": 231670 + }, + { + "epoch": 0.9946506615835072, + "grad_norm": 0.0029562627896666527, + "learning_rate": 5.376714986676785e-07, + "loss": 0.2507413625717163, + "step": 231680 + }, + { + "epoch": 0.9946935936735272, + "grad_norm": 1.1139379739761353, + "learning_rate": 5.333597785500548e-07, + "loss": 0.3691600799560547, + "step": 231690 + }, + { + "epoch": 0.9947365257635472, + "grad_norm": 0.1695391833782196, + "learning_rate": 5.290480584324311e-07, + "loss": 0.22681543827056885, + "step": 231700 + }, + { + "epoch": 0.9947794578535673, + "grad_norm": 3.3068721294403076, + "learning_rate": 5.247363383148073e-07, + "loss": 0.23673622608184813, + "step": 231710 + }, + { + "epoch": 0.9948223899435872, + "grad_norm": 1.692142128944397, + "learning_rate": 5.204246181971836e-07, + "loss": 0.11870994567871093, + "step": 231720 + }, + { + "epoch": 0.9948653220336072, + "grad_norm": 0.03946472331881523, + "learning_rate": 5.161128980795599e-07, + "loss": 0.0988048791885376, + "step": 231730 + }, + { + "epoch": 0.9949082541236273, + "grad_norm": 2.345656394958496, + "learning_rate": 5.118011779619362e-07, + "loss": 0.12542537450790406, + "step": 231740 + }, + { + "epoch": 0.9949511862136472, + "grad_norm": 0.2746301591396332, + "learning_rate": 5.074894578443125e-07, + "loss": 0.18555349111557007, + "step": 231750 + }, + { + "epoch": 0.9949941183036672, + "grad_norm": 1.1292948722839355, + "learning_rate": 5.031777377266887e-07, + "loss": 0.3993506908416748, + "step": 231760 + }, + { + "epoch": 0.9950370503936873, + "grad_norm": 0.00699152797460556, + "learning_rate": 4.988660176090649e-07, + "loss": 0.12862871885299682, + "step": 231770 + }, + { + "epoch": 0.9950799824837073, + "grad_norm": 0.029387371614575386, + "learning_rate": 4.945542974914412e-07, + "loss": 0.33222131729125975, + "step": 231780 + }, + { + "epoch": 0.9951229145737273, + "grad_norm": 0.010533314198255539, + "learning_rate": 4.902425773738175e-07, + "loss": 0.37362515926361084, + "step": 231790 + }, + { + "epoch": 0.9951658466637473, + "grad_norm": 0.024894610047340393, + "learning_rate": 4.859308572561938e-07, + "loss": 0.18944785594940186, + "step": 231800 + }, + { + "epoch": 0.9952087787537673, + "grad_norm": 0.07286173105239868, + "learning_rate": 4.816191371385701e-07, + "loss": 0.14887828826904298, + "step": 231810 + }, + { + "epoch": 0.9952517108437873, + "grad_norm": 0.009799705818295479, + "learning_rate": 4.773074170209464e-07, + "loss": 0.08595054745674133, + "step": 231820 + }, + { + "epoch": 0.9952946429338073, + "grad_norm": 0.010909819975495338, + "learning_rate": 4.729956969033227e-07, + "loss": 0.13564144372940062, + "step": 231830 + }, + { + "epoch": 0.9953375750238274, + "grad_norm": 0.21695959568023682, + "learning_rate": 4.6868397678569893e-07, + "loss": 0.207657790184021, + "step": 231840 + }, + { + "epoch": 0.9953805071138473, + "grad_norm": 0.007266578730195761, + "learning_rate": 4.6437225666807516e-07, + "loss": 0.15762712955474853, + "step": 231850 + }, + { + "epoch": 0.9954234392038673, + "grad_norm": 0.08585427701473236, + "learning_rate": 4.6006053655045145e-07, + "loss": 0.1914979934692383, + "step": 231860 + }, + { + "epoch": 0.9954663712938874, + "grad_norm": 9.776519775390625, + "learning_rate": 4.5574881643282773e-07, + "loss": 0.3182793617248535, + "step": 231870 + }, + { + "epoch": 0.9955093033839073, + "grad_norm": 0.026343174278736115, + "learning_rate": 4.51437096315204e-07, + "loss": 0.19399741888046265, + "step": 231880 + }, + { + "epoch": 0.9955522354739273, + "grad_norm": 11.97410774230957, + "learning_rate": 4.4712537619758024e-07, + "loss": 0.3325981616973877, + "step": 231890 + }, + { + "epoch": 0.9955951675639474, + "grad_norm": 8.708888053894043, + "learning_rate": 4.428136560799566e-07, + "loss": 0.21990721225738524, + "step": 231900 + }, + { + "epoch": 0.9956380996539673, + "grad_norm": 0.031915098428726196, + "learning_rate": 4.385019359623328e-07, + "loss": 0.12815059423446656, + "step": 231910 + }, + { + "epoch": 0.9956810317439874, + "grad_norm": 0.05291389301419258, + "learning_rate": 4.3419021584470915e-07, + "loss": 0.1420881152153015, + "step": 231920 + }, + { + "epoch": 0.9957239638340074, + "grad_norm": 0.519956111907959, + "learning_rate": 4.298784957270854e-07, + "loss": 0.014936311542987824, + "step": 231930 + }, + { + "epoch": 0.9957668959240273, + "grad_norm": 0.05501473695039749, + "learning_rate": 4.255667756094616e-07, + "loss": 0.18154823780059814, + "step": 231940 + }, + { + "epoch": 0.9958098280140474, + "grad_norm": 0.0032672970555722713, + "learning_rate": 4.2125505549183795e-07, + "loss": 0.12591198682785035, + "step": 231950 + }, + { + "epoch": 0.9958527601040674, + "grad_norm": 1.8218486309051514, + "learning_rate": 4.169433353742142e-07, + "loss": 0.18262457847595215, + "step": 231960 + }, + { + "epoch": 0.9958956921940874, + "grad_norm": 0.014850892126560211, + "learning_rate": 4.126316152565905e-07, + "loss": 0.241205096244812, + "step": 231970 + }, + { + "epoch": 0.9959386242841074, + "grad_norm": 0.00398431159555912, + "learning_rate": 4.0831989513896675e-07, + "loss": 0.22642951011657714, + "step": 231980 + }, + { + "epoch": 0.9959815563741274, + "grad_norm": 0.25197991728782654, + "learning_rate": 4.040081750213431e-07, + "loss": 0.17500852346420287, + "step": 231990 + }, + { + "epoch": 0.9960244884641474, + "grad_norm": 0.0007317436393350363, + "learning_rate": 3.996964549037193e-07, + "loss": 0.2329272985458374, + "step": 232000 + }, + { + "epoch": 0.9960244884641474, + "eval_loss": 0.36665475368499756, + "eval_runtime": 27.639, + "eval_samples_per_second": 3.618, + "eval_steps_per_second": 3.618, + "step": 232000 + }, + { + "epoch": 0.9960674205541674, + "grad_norm": 1.9117028713226318, + "learning_rate": 3.9538473478609555e-07, + "loss": 0.25112009048461914, + "step": 232010 + }, + { + "epoch": 0.9961103526441875, + "grad_norm": 0.007310639601200819, + "learning_rate": 3.910730146684719e-07, + "loss": 0.11872807741165162, + "step": 232020 + }, + { + "epoch": 0.9961532847342074, + "grad_norm": 1.4803849458694458, + "learning_rate": 3.8676129455084817e-07, + "loss": 0.15340189933776854, + "step": 232030 + }, + { + "epoch": 0.9961962168242274, + "grad_norm": 2.3438141345977783, + "learning_rate": 3.824495744332244e-07, + "loss": 0.12148548364639282, + "step": 232040 + }, + { + "epoch": 0.9962391489142475, + "grad_norm": 1.9131057262420654, + "learning_rate": 3.781378543156007e-07, + "loss": 0.2916119575500488, + "step": 232050 + }, + { + "epoch": 0.9962820810042674, + "grad_norm": 2.943357467651367, + "learning_rate": 3.7382613419797697e-07, + "loss": 0.12406005859375, + "step": 232060 + }, + { + "epoch": 0.9963250130942874, + "grad_norm": 0.8212069272994995, + "learning_rate": 3.695144140803532e-07, + "loss": 0.09711835384368897, + "step": 232070 + }, + { + "epoch": 0.9963679451843075, + "grad_norm": 0.00522095849737525, + "learning_rate": 3.652026939627295e-07, + "loss": 0.4548477649688721, + "step": 232080 + }, + { + "epoch": 0.9964108772743274, + "grad_norm": 7.26854944229126, + "learning_rate": 3.6089097384510577e-07, + "loss": 0.2613101005554199, + "step": 232090 + }, + { + "epoch": 0.9964538093643475, + "grad_norm": 0.9109877347946167, + "learning_rate": 3.5657925372748205e-07, + "loss": 0.2800257444381714, + "step": 232100 + }, + { + "epoch": 0.9964967414543675, + "grad_norm": 1.436590313911438, + "learning_rate": 3.5226753360985834e-07, + "loss": 0.1499798536300659, + "step": 232110 + }, + { + "epoch": 0.9965396735443874, + "grad_norm": 0.006864710710942745, + "learning_rate": 3.479558134922346e-07, + "loss": 0.196934974193573, + "step": 232120 + }, + { + "epoch": 0.9965826056344075, + "grad_norm": 0.13871745765209198, + "learning_rate": 3.436440933746109e-07, + "loss": 0.1414160370826721, + "step": 232130 + }, + { + "epoch": 0.9966255377244275, + "grad_norm": 0.7631210684776306, + "learning_rate": 3.3933237325698714e-07, + "loss": 0.2055798053741455, + "step": 232140 + }, + { + "epoch": 0.9966684698144475, + "grad_norm": 1.24130380153656, + "learning_rate": 3.350206531393634e-07, + "loss": 0.16898894309997559, + "step": 232150 + }, + { + "epoch": 0.9967114019044675, + "grad_norm": 1.4094983339309692, + "learning_rate": 3.307089330217397e-07, + "loss": 0.18343677520751953, + "step": 232160 + }, + { + "epoch": 0.9967543339944875, + "grad_norm": 0.2644950747489929, + "learning_rate": 3.26397212904116e-07, + "loss": 0.1670290470123291, + "step": 232170 + }, + { + "epoch": 0.9967972660845075, + "grad_norm": 1.0128084421157837, + "learning_rate": 3.2208549278649227e-07, + "loss": 0.17235329151153564, + "step": 232180 + }, + { + "epoch": 0.9968401981745275, + "grad_norm": 21.385019302368164, + "learning_rate": 3.1777377266886856e-07, + "loss": 0.2850952625274658, + "step": 232190 + }, + { + "epoch": 0.9968831302645476, + "grad_norm": 0.0020975386723876, + "learning_rate": 3.1346205255124484e-07, + "loss": 0.24136075973510743, + "step": 232200 + }, + { + "epoch": 0.9969260623545676, + "grad_norm": 2.3244147300720215, + "learning_rate": 3.091503324336211e-07, + "loss": 0.1292970895767212, + "step": 232210 + }, + { + "epoch": 0.9969689944445875, + "grad_norm": 0.013622512109577656, + "learning_rate": 3.0483861231599736e-07, + "loss": 0.2299511432647705, + "step": 232220 + }, + { + "epoch": 0.9970119265346076, + "grad_norm": 0.010262306779623032, + "learning_rate": 3.0052689219837364e-07, + "loss": 0.3331043004989624, + "step": 232230 + }, + { + "epoch": 0.9970548586246276, + "grad_norm": 0.4883683919906616, + "learning_rate": 2.962151720807499e-07, + "loss": 0.11569994688034058, + "step": 232240 + }, + { + "epoch": 0.9970977907146475, + "grad_norm": 1.1489534378051758, + "learning_rate": 2.9190345196312616e-07, + "loss": 0.2363152265548706, + "step": 232250 + }, + { + "epoch": 0.9971407228046676, + "grad_norm": 0.018088897690176964, + "learning_rate": 2.8759173184550244e-07, + "loss": 0.27835333347320557, + "step": 232260 + }, + { + "epoch": 0.9971836548946876, + "grad_norm": 0.0005512001807801425, + "learning_rate": 2.832800117278787e-07, + "loss": 0.14523568153381347, + "step": 232270 + }, + { + "epoch": 0.9972265869847076, + "grad_norm": 2.278998851776123, + "learning_rate": 2.78968291610255e-07, + "loss": 0.04154463410377503, + "step": 232280 + }, + { + "epoch": 0.9972695190747276, + "grad_norm": 0.003172652330249548, + "learning_rate": 2.746565714926313e-07, + "loss": 0.12838269472122193, + "step": 232290 + }, + { + "epoch": 0.9973124511647476, + "grad_norm": 2.7414071559906006, + "learning_rate": 2.703448513750075e-07, + "loss": 0.2711420297622681, + "step": 232300 + }, + { + "epoch": 0.9973553832547676, + "grad_norm": 0.043592531234025955, + "learning_rate": 2.660331312573838e-07, + "loss": 0.2562830448150635, + "step": 232310 + }, + { + "epoch": 0.9973983153447876, + "grad_norm": 0.039814338088035583, + "learning_rate": 2.617214111397601e-07, + "loss": 0.1167643666267395, + "step": 232320 + }, + { + "epoch": 0.9974412474348077, + "grad_norm": 0.021249786019325256, + "learning_rate": 2.574096910221364e-07, + "loss": 0.14535123109817505, + "step": 232330 + }, + { + "epoch": 0.9974841795248276, + "grad_norm": 0.017002159729599953, + "learning_rate": 2.5309797090451266e-07, + "loss": 0.1427559733390808, + "step": 232340 + }, + { + "epoch": 0.9975271116148476, + "grad_norm": 0.10289645195007324, + "learning_rate": 2.4878625078688894e-07, + "loss": 0.3648236751556396, + "step": 232350 + }, + { + "epoch": 0.9975700437048677, + "grad_norm": 0.5845692753791809, + "learning_rate": 2.4447453066926523e-07, + "loss": 0.13263989686965943, + "step": 232360 + }, + { + "epoch": 0.9976129757948876, + "grad_norm": 0.176754891872406, + "learning_rate": 2.401628105516415e-07, + "loss": 0.09868491888046264, + "step": 232370 + }, + { + "epoch": 0.9976559078849077, + "grad_norm": 1.6485669612884521, + "learning_rate": 2.3585109043401774e-07, + "loss": 0.36487441062927245, + "step": 232380 + }, + { + "epoch": 0.9976988399749277, + "grad_norm": 0.001132033416070044, + "learning_rate": 2.3153937031639403e-07, + "loss": 0.24000678062438965, + "step": 232390 + }, + { + "epoch": 0.9977417720649476, + "grad_norm": 0.039538487792015076, + "learning_rate": 2.2722765019877029e-07, + "loss": 0.19298700094223023, + "step": 232400 + }, + { + "epoch": 0.9977847041549677, + "grad_norm": 0.7006238102912903, + "learning_rate": 2.2291593008114657e-07, + "loss": 0.10505164861679077, + "step": 232410 + }, + { + "epoch": 0.9978276362449877, + "grad_norm": 0.03369970619678497, + "learning_rate": 2.1860420996352285e-07, + "loss": 0.22971062660217284, + "step": 232420 + }, + { + "epoch": 0.9978705683350076, + "grad_norm": 0.20301634073257446, + "learning_rate": 2.1429248984589914e-07, + "loss": 0.32322068214416505, + "step": 232430 + }, + { + "epoch": 0.9979135004250277, + "grad_norm": 0.1694352924823761, + "learning_rate": 2.0998076972827542e-07, + "loss": 0.27818710803985597, + "step": 232440 + }, + { + "epoch": 0.9979564325150477, + "grad_norm": 0.4383462071418762, + "learning_rate": 2.056690496106517e-07, + "loss": 0.08511244058609009, + "step": 232450 + }, + { + "epoch": 0.9979993646050677, + "grad_norm": 0.9433944821357727, + "learning_rate": 2.01357329493028e-07, + "loss": 0.06735751032829285, + "step": 232460 + }, + { + "epoch": 0.9980422966950877, + "grad_norm": 0.004472394939512014, + "learning_rate": 1.9704560937540422e-07, + "loss": 0.06144242286682129, + "step": 232470 + }, + { + "epoch": 0.9980852287851077, + "grad_norm": 0.040545202791690826, + "learning_rate": 1.927338892577805e-07, + "loss": 0.1260249972343445, + "step": 232480 + }, + { + "epoch": 0.9981281608751277, + "grad_norm": 0.5861721038818359, + "learning_rate": 1.8842216914015676e-07, + "loss": 0.164767849445343, + "step": 232490 + }, + { + "epoch": 0.9981710929651477, + "grad_norm": 1.4621071815490723, + "learning_rate": 1.8411044902253305e-07, + "loss": 0.16802266836166382, + "step": 232500 + }, + { + "epoch": 0.9982140250551678, + "grad_norm": 0.0011348744155839086, + "learning_rate": 1.7979872890490933e-07, + "loss": 0.312008261680603, + "step": 232510 + }, + { + "epoch": 0.9982569571451877, + "grad_norm": 7.14293098449707, + "learning_rate": 1.7548700878728562e-07, + "loss": 0.4737683296203613, + "step": 232520 + }, + { + "epoch": 0.9982998892352077, + "grad_norm": 0.03787514939904213, + "learning_rate": 1.7117528866966187e-07, + "loss": 0.19208227396011351, + "step": 232530 + }, + { + "epoch": 0.9983428213252278, + "grad_norm": 0.021616164594888687, + "learning_rate": 1.6686356855203816e-07, + "loss": 0.3106940269470215, + "step": 232540 + }, + { + "epoch": 0.9983857534152477, + "grad_norm": 0.21478670835494995, + "learning_rate": 1.6255184843441444e-07, + "loss": 0.1717553734779358, + "step": 232550 + }, + { + "epoch": 0.9984286855052678, + "grad_norm": 0.005685772746801376, + "learning_rate": 1.5824012831679073e-07, + "loss": 0.13391354084014892, + "step": 232560 + }, + { + "epoch": 0.9984716175952878, + "grad_norm": 2.9434475898742676, + "learning_rate": 1.5392840819916698e-07, + "loss": 0.19125187397003174, + "step": 232570 + }, + { + "epoch": 0.9985145496853077, + "grad_norm": 0.04271689057350159, + "learning_rate": 1.4961668808154324e-07, + "loss": 0.19992996454238893, + "step": 232580 + }, + { + "epoch": 0.9985574817753278, + "grad_norm": 0.047845933586359024, + "learning_rate": 1.4530496796391953e-07, + "loss": 0.08684280514717102, + "step": 232590 + }, + { + "epoch": 0.9986004138653478, + "grad_norm": 2.6441221237182617, + "learning_rate": 1.409932478462958e-07, + "loss": 0.2548548698425293, + "step": 232600 + }, + { + "epoch": 0.9986433459553677, + "grad_norm": 0.08814195543527603, + "learning_rate": 1.3668152772867207e-07, + "loss": 0.09096928238868714, + "step": 232610 + }, + { + "epoch": 0.9986862780453878, + "grad_norm": 0.9759665131568909, + "learning_rate": 1.3236980761104835e-07, + "loss": 0.058413803577423096, + "step": 232620 + }, + { + "epoch": 0.9987292101354078, + "grad_norm": 1.652724027633667, + "learning_rate": 1.2805808749342464e-07, + "loss": 0.41336398124694823, + "step": 232630 + }, + { + "epoch": 0.9987721422254279, + "grad_norm": 0.04262397810816765, + "learning_rate": 1.2374636737580092e-07, + "loss": 0.2795504093170166, + "step": 232640 + }, + { + "epoch": 0.9988150743154478, + "grad_norm": 0.0015567491063848138, + "learning_rate": 1.1943464725817718e-07, + "loss": 0.15489885807037354, + "step": 232650 + }, + { + "epoch": 0.9988580064054678, + "grad_norm": 0.030675673857331276, + "learning_rate": 1.1512292714055345e-07, + "loss": 0.1793353796005249, + "step": 232660 + }, + { + "epoch": 0.9989009384954879, + "grad_norm": 1.6193761825561523, + "learning_rate": 1.1081120702292973e-07, + "loss": 0.20298526287078858, + "step": 232670 + }, + { + "epoch": 0.9989438705855078, + "grad_norm": 0.36367109417915344, + "learning_rate": 1.0649948690530602e-07, + "loss": 0.0037272989749908446, + "step": 232680 + }, + { + "epoch": 0.9989868026755279, + "grad_norm": 2.131378650665283, + "learning_rate": 1.0218776678768229e-07, + "loss": 0.14402194023132325, + "step": 232690 + }, + { + "epoch": 0.9990297347655479, + "grad_norm": 1.7471225261688232, + "learning_rate": 9.787604667005855e-08, + "loss": 0.16321039199829102, + "step": 232700 + }, + { + "epoch": 0.9990726668555678, + "grad_norm": 1.7944810390472412, + "learning_rate": 9.356432655243483e-08, + "loss": 0.12686684131622314, + "step": 232710 + }, + { + "epoch": 0.9991155989455879, + "grad_norm": 3.471855401992798, + "learning_rate": 8.925260643481111e-08, + "loss": 0.16587791442871094, + "step": 232720 + }, + { + "epoch": 0.9991585310356079, + "grad_norm": 0.028937939554452896, + "learning_rate": 8.494088631718738e-08, + "loss": 0.2687675952911377, + "step": 232730 + }, + { + "epoch": 0.9992014631256279, + "grad_norm": 6.40365743637085, + "learning_rate": 8.062916619956366e-08, + "loss": 0.36156454086303713, + "step": 232740 + }, + { + "epoch": 0.9992443952156479, + "grad_norm": 0.044099632650613785, + "learning_rate": 7.631744608193993e-08, + "loss": 0.2157670736312866, + "step": 232750 + }, + { + "epoch": 0.9992873273056679, + "grad_norm": 0.016408240422606468, + "learning_rate": 7.200572596431621e-08, + "loss": 0.08841435313224792, + "step": 232760 + }, + { + "epoch": 0.9993302593956879, + "grad_norm": 2.911112070083618, + "learning_rate": 6.769400584669248e-08, + "loss": 0.061773651838302614, + "step": 232770 + }, + { + "epoch": 0.9993731914857079, + "grad_norm": 0.005322351586073637, + "learning_rate": 6.338228572906877e-08, + "loss": 0.16446354389190673, + "step": 232780 + }, + { + "epoch": 0.999416123575728, + "grad_norm": 1.429836630821228, + "learning_rate": 5.907056561144503e-08, + "loss": 0.1530466079711914, + "step": 232790 + }, + { + "epoch": 0.9994590556657479, + "grad_norm": 0.12732602655887604, + "learning_rate": 5.475884549382131e-08, + "loss": 0.2938541412353516, + "step": 232800 + }, + { + "epoch": 0.9995019877557679, + "grad_norm": 0.004034379031509161, + "learning_rate": 5.044712537619758e-08, + "loss": 0.22802155017852782, + "step": 232810 + }, + { + "epoch": 0.999544919845788, + "grad_norm": 0.01561362762004137, + "learning_rate": 4.613540525857386e-08, + "loss": 0.1871453642845154, + "step": 232820 + }, + { + "epoch": 0.9995878519358079, + "grad_norm": 0.09907402098178864, + "learning_rate": 4.1823685140950134e-08, + "loss": 0.15518252849578856, + "step": 232830 + }, + { + "epoch": 0.999630784025828, + "grad_norm": 0.3256697654724121, + "learning_rate": 3.751196502332641e-08, + "loss": 0.34419205188751223, + "step": 232840 + }, + { + "epoch": 0.999673716115848, + "grad_norm": 0.03469710424542427, + "learning_rate": 3.320024490570268e-08, + "loss": 0.13642860651016236, + "step": 232850 + }, + { + "epoch": 0.9997166482058679, + "grad_norm": 0.0023571979254484177, + "learning_rate": 2.888852478807896e-08, + "loss": 0.2521304368972778, + "step": 232860 + }, + { + "epoch": 0.999759580295888, + "grad_norm": 0.0008532292558811605, + "learning_rate": 2.4576804670455234e-08, + "loss": 0.34290714263916017, + "step": 232870 + }, + { + "epoch": 0.999802512385908, + "grad_norm": 0.10287713259458542, + "learning_rate": 2.0265084552831505e-08, + "loss": 0.23883533477783203, + "step": 232880 + }, + { + "epoch": 0.9998454444759279, + "grad_norm": 8.222575187683105, + "learning_rate": 1.5953364435207782e-08, + "loss": 0.26848292350769043, + "step": 232890 + }, + { + "epoch": 0.999888376565948, + "grad_norm": 1.8974652290344238, + "learning_rate": 1.1641644317584057e-08, + "loss": 0.2564098596572876, + "step": 232900 + }, + { + "epoch": 0.999931308655968, + "grad_norm": 0.014630243182182312, + "learning_rate": 7.329924199960332e-09, + "loss": 0.2814777851104736, + "step": 232910 + }, + { + "epoch": 0.999974240745988, + "grad_norm": 0.0010633636265993118, + "learning_rate": 3.0182040823366075e-09, + "loss": 0.13390289545059203, + "step": 232920 + } + ], + "logging_steps": 10, + "max_steps": 232926, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 1000, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 1.5803244156712313e+19, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +}