diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,5291 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.9925611052072263, + "eval_steps": 500, + "global_step": 7500, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.002656748140276302, + "grad_norm": 207.1781768798828, + "learning_rate": 1.9982288345731494e-05, + "loss": 6.5351, + "step": 10 + }, + { + "epoch": 0.005313496280552604, + "grad_norm": 265.0539855957031, + "learning_rate": 1.9964576691462986e-05, + "loss": 5.2822, + "step": 20 + }, + { + "epoch": 0.007970244420828906, + "grad_norm": 1933.7158203125, + "learning_rate": 1.9946865037194475e-05, + "loss": 4.7764, + "step": 30 + }, + { + "epoch": 0.010626992561105207, + "grad_norm": 896.5211791992188, + "learning_rate": 1.9929153382925967e-05, + "loss": 4.5617, + "step": 40 + }, + { + "epoch": 0.013283740701381509, + "grad_norm": 2147.634765625, + "learning_rate": 1.991144172865746e-05, + "loss": 4.5559, + "step": 50 + }, + { + "epoch": 0.015940488841657812, + "grad_norm": 1384.8623046875, + "learning_rate": 1.9893730074388952e-05, + "loss": 4.1671, + "step": 60 + }, + { + "epoch": 0.018597236981934114, + "grad_norm": 3381.87060546875, + "learning_rate": 1.987601842012044e-05, + "loss": 3.9988, + "step": 70 + }, + { + "epoch": 0.021253985122210415, + "grad_norm": 398.0505676269531, + "learning_rate": 1.985830676585193e-05, + "loss": 4.0844, + "step": 80 + }, + { + "epoch": 0.023910733262486716, + "grad_norm": 2040.54736328125, + "learning_rate": 1.9840595111583422e-05, + "loss": 3.9493, + "step": 90 + }, + { + "epoch": 0.026567481402763018, + "grad_norm": 8612.021484375, + "learning_rate": 1.9822883457314914e-05, + "loss": 3.6944, + "step": 100 + }, + { + "epoch": 0.02922422954303932, + "grad_norm": 22271.3125, + "learning_rate": 1.9805171803046406e-05, + "loss": 4.1335, + "step": 110 + }, + { + "epoch": 0.031880977683315624, + "grad_norm": 5334.6806640625, + "learning_rate": 1.97874601487779e-05, + "loss": 3.9284, + "step": 120 + }, + { + "epoch": 0.03453772582359192, + "grad_norm": 1616.4825439453125, + "learning_rate": 1.9769748494509388e-05, + "loss": 3.9407, + "step": 130 + }, + { + "epoch": 0.03719447396386823, + "grad_norm": 137.30589294433594, + "learning_rate": 1.975203684024088e-05, + "loss": 3.7372, + "step": 140 + }, + { + "epoch": 0.039851222104144525, + "grad_norm": 2417.81982421875, + "learning_rate": 1.9734325185972372e-05, + "loss": 3.6944, + "step": 150 + }, + { + "epoch": 0.04250797024442083, + "grad_norm": 7971.87451171875, + "learning_rate": 1.9716613531703864e-05, + "loss": 3.6615, + "step": 160 + }, + { + "epoch": 0.04516471838469713, + "grad_norm": 1645.13916015625, + "learning_rate": 1.9698901877435353e-05, + "loss": 3.4582, + "step": 170 + }, + { + "epoch": 0.04782146652497343, + "grad_norm": 2899.1162109375, + "learning_rate": 1.9681190223166846e-05, + "loss": 3.4193, + "step": 180 + }, + { + "epoch": 0.05047821466524974, + "grad_norm": 13782.0908203125, + "learning_rate": 1.9663478568898338e-05, + "loss": 3.577, + "step": 190 + }, + { + "epoch": 0.053134962805526036, + "grad_norm": 7818.07177734375, + "learning_rate": 1.964576691462983e-05, + "loss": 3.2082, + "step": 200 + }, + { + "epoch": 0.05579171094580234, + "grad_norm": 14882.34375, + "learning_rate": 1.962805526036132e-05, + "loss": 3.1947, + "step": 210 + }, + { + "epoch": 0.05844845908607864, + "grad_norm": 27526.642578125, + "learning_rate": 1.961034360609281e-05, + "loss": 3.23, + "step": 220 + }, + { + "epoch": 0.06110520722635494, + "grad_norm": 9511.650390625, + "learning_rate": 1.95926319518243e-05, + "loss": 3.0386, + "step": 230 + }, + { + "epoch": 0.06376195536663125, + "grad_norm": 2172.15625, + "learning_rate": 1.9574920297555792e-05, + "loss": 3.1756, + "step": 240 + }, + { + "epoch": 0.06641870350690754, + "grad_norm": 11950.30078125, + "learning_rate": 1.9557208643287285e-05, + "loss": 3.36, + "step": 250 + }, + { + "epoch": 0.06907545164718384, + "grad_norm": 17726.330078125, + "learning_rate": 1.9539496989018777e-05, + "loss": 3.0231, + "step": 260 + }, + { + "epoch": 0.07173219978746015, + "grad_norm": 4690.27587890625, + "learning_rate": 1.9521785334750266e-05, + "loss": 3.0029, + "step": 270 + }, + { + "epoch": 0.07438894792773645, + "grad_norm": 40308.61328125, + "learning_rate": 1.9504073680481758e-05, + "loss": 3.688, + "step": 280 + }, + { + "epoch": 0.07704569606801276, + "grad_norm": 27147.087890625, + "learning_rate": 1.948636202621325e-05, + "loss": 3.3881, + "step": 290 + }, + { + "epoch": 0.07970244420828905, + "grad_norm": 59977.046875, + "learning_rate": 1.9468650371944743e-05, + "loss": 3.4571, + "step": 300 + }, + { + "epoch": 0.08235919234856535, + "grad_norm": 66940.046875, + "learning_rate": 1.9450938717676235e-05, + "loss": 3.3864, + "step": 310 + }, + { + "epoch": 0.08501594048884166, + "grad_norm": 5094.89013671875, + "learning_rate": 1.9433227063407724e-05, + "loss": 3.3697, + "step": 320 + }, + { + "epoch": 0.08767268862911796, + "grad_norm": 4367.36474609375, + "learning_rate": 1.9415515409139216e-05, + "loss": 3.3016, + "step": 330 + }, + { + "epoch": 0.09032943676939426, + "grad_norm": 7941.5458984375, + "learning_rate": 1.9397803754870705e-05, + "loss": 3.0374, + "step": 340 + }, + { + "epoch": 0.09298618490967056, + "grad_norm": 3960.741943359375, + "learning_rate": 1.9380092100602197e-05, + "loss": 3.3324, + "step": 350 + }, + { + "epoch": 0.09564293304994687, + "grad_norm": 18565.732421875, + "learning_rate": 1.936238044633369e-05, + "loss": 3.2402, + "step": 360 + }, + { + "epoch": 0.09829968119022317, + "grad_norm": 66859.0, + "learning_rate": 1.9344668792065178e-05, + "loss": 3.3142, + "step": 370 + }, + { + "epoch": 0.10095642933049948, + "grad_norm": 1521.879638671875, + "learning_rate": 1.932695713779667e-05, + "loss": 3.0546, + "step": 380 + }, + { + "epoch": 0.10361317747077577, + "grad_norm": 12662.775390625, + "learning_rate": 1.9309245483528163e-05, + "loss": 3.5396, + "step": 390 + }, + { + "epoch": 0.10626992561105207, + "grad_norm": 105807.59375, + "learning_rate": 1.9291533829259655e-05, + "loss": 3.5301, + "step": 400 + }, + { + "epoch": 0.10892667375132838, + "grad_norm": 663547.875, + "learning_rate": 1.9273822174991147e-05, + "loss": 4.28, + "step": 410 + }, + { + "epoch": 0.11158342189160468, + "grad_norm": 8186676.0, + "learning_rate": 1.9256110520722636e-05, + "loss": 5.9807, + "step": 420 + }, + { + "epoch": 0.11424017003188097, + "grad_norm": 2142551.25, + "learning_rate": 1.923839886645413e-05, + "loss": 9.4764, + "step": 430 + }, + { + "epoch": 0.11689691817215728, + "grad_norm": 366486.1875, + "learning_rate": 1.922068721218562e-05, + "loss": 10.9151, + "step": 440 + }, + { + "epoch": 0.11955366631243358, + "grad_norm": 2276693.0, + "learning_rate": 1.9202975557917113e-05, + "loss": 12.5549, + "step": 450 + }, + { + "epoch": 0.12221041445270989, + "grad_norm": 2184425.5, + "learning_rate": 1.9185263903648602e-05, + "loss": 13.1915, + "step": 460 + }, + { + "epoch": 0.12486716259298619, + "grad_norm": 2937578.75, + "learning_rate": 1.9167552249380094e-05, + "loss": 14.2279, + "step": 470 + }, + { + "epoch": 0.1275239107332625, + "grad_norm": 10091141.0, + "learning_rate": 1.9149840595111583e-05, + "loss": 13.4766, + "step": 480 + }, + { + "epoch": 0.1301806588735388, + "grad_norm": 5426885.5, + "learning_rate": 1.9132128940843075e-05, + "loss": 14.8065, + "step": 490 + }, + { + "epoch": 0.13283740701381508, + "grad_norm": 2068535.25, + "learning_rate": 1.9114417286574568e-05, + "loss": 16.3781, + "step": 500 + }, + { + "epoch": 0.13549415515409138, + "grad_norm": 3599295.0, + "learning_rate": 1.909670563230606e-05, + "loss": 15.1519, + "step": 510 + }, + { + "epoch": 0.1381509032943677, + "grad_norm": 761431.875, + "learning_rate": 1.907899397803755e-05, + "loss": 15.1124, + "step": 520 + }, + { + "epoch": 0.140807651434644, + "grad_norm": 933641.375, + "learning_rate": 1.906128232376904e-05, + "loss": 14.1038, + "step": 530 + }, + { + "epoch": 0.1434643995749203, + "grad_norm": 423861.0625, + "learning_rate": 1.9043570669500533e-05, + "loss": 13.7131, + "step": 540 + }, + { + "epoch": 0.1461211477151966, + "grad_norm": 5383.50537109375, + "learning_rate": 1.9025859015232026e-05, + "loss": 12.8075, + "step": 550 + }, + { + "epoch": 0.1487778958554729, + "grad_norm": 3759.12548828125, + "learning_rate": 1.9008147360963514e-05, + "loss": 10.7237, + "step": 560 + }, + { + "epoch": 0.1514346439957492, + "grad_norm": 2150.089111328125, + "learning_rate": 1.8990435706695007e-05, + "loss": 7.2887, + "step": 570 + }, + { + "epoch": 0.15409139213602552, + "grad_norm": 3893.645751953125, + "learning_rate": 1.89727240524265e-05, + "loss": 4.8237, + "step": 580 + }, + { + "epoch": 0.1567481402763018, + "grad_norm": 11881.3046875, + "learning_rate": 1.895501239815799e-05, + "loss": 3.9525, + "step": 590 + }, + { + "epoch": 0.1594048884165781, + "grad_norm": 14820.740234375, + "learning_rate": 1.8937300743889483e-05, + "loss": 4.6401, + "step": 600 + }, + { + "epoch": 0.1620616365568544, + "grad_norm": 99031.640625, + "learning_rate": 1.8919589089620972e-05, + "loss": 4.9725, + "step": 610 + }, + { + "epoch": 0.1647183846971307, + "grad_norm": 47882.5859375, + "learning_rate": 1.890187743535246e-05, + "loss": 4.6917, + "step": 620 + }, + { + "epoch": 0.16737513283740701, + "grad_norm": 77129.8046875, + "learning_rate": 1.8884165781083953e-05, + "loss": 4.3883, + "step": 630 + }, + { + "epoch": 0.17003188097768332, + "grad_norm": 85341.125, + "learning_rate": 1.8866454126815446e-05, + "loss": 5.114, + "step": 640 + }, + { + "epoch": 0.17268862911795962, + "grad_norm": 34883.13671875, + "learning_rate": 1.8848742472546938e-05, + "loss": 4.9715, + "step": 650 + }, + { + "epoch": 0.17534537725823593, + "grad_norm": 22649.3359375, + "learning_rate": 1.8831030818278427e-05, + "loss": 4.9266, + "step": 660 + }, + { + "epoch": 0.17800212539851223, + "grad_norm": 59614.453125, + "learning_rate": 1.881331916400992e-05, + "loss": 4.3894, + "step": 670 + }, + { + "epoch": 0.1806588735387885, + "grad_norm": 13419.771484375, + "learning_rate": 1.879560750974141e-05, + "loss": 4.238, + "step": 680 + }, + { + "epoch": 0.18331562167906482, + "grad_norm": 26652.462890625, + "learning_rate": 1.8777895855472904e-05, + "loss": 4.5253, + "step": 690 + }, + { + "epoch": 0.18597236981934112, + "grad_norm": 37440.6015625, + "learning_rate": 1.8760184201204396e-05, + "loss": 4.0546, + "step": 700 + }, + { + "epoch": 0.18862911795961743, + "grad_norm": 43147.1796875, + "learning_rate": 1.8742472546935885e-05, + "loss": 4.4831, + "step": 710 + }, + { + "epoch": 0.19128586609989373, + "grad_norm": 143355.296875, + "learning_rate": 1.8724760892667377e-05, + "loss": 4.5257, + "step": 720 + }, + { + "epoch": 0.19394261424017004, + "grad_norm": 12484.8466796875, + "learning_rate": 1.870704923839887e-05, + "loss": 4.9662, + "step": 730 + }, + { + "epoch": 0.19659936238044634, + "grad_norm": 10305.0126953125, + "learning_rate": 1.868933758413036e-05, + "loss": 5.3629, + "step": 740 + }, + { + "epoch": 0.19925611052072265, + "grad_norm": 3247.491943359375, + "learning_rate": 1.867162592986185e-05, + "loss": 5.014, + "step": 750 + }, + { + "epoch": 0.20191285866099895, + "grad_norm": 2328.57470703125, + "learning_rate": 1.8653914275593343e-05, + "loss": 4.9864, + "step": 760 + }, + { + "epoch": 0.20456960680127523, + "grad_norm": 16007.7978515625, + "learning_rate": 1.863620262132483e-05, + "loss": 4.5492, + "step": 770 + }, + { + "epoch": 0.20722635494155153, + "grad_norm": 39521.5078125, + "learning_rate": 1.8618490967056324e-05, + "loss": 4.4608, + "step": 780 + }, + { + "epoch": 0.20988310308182784, + "grad_norm": 553922.0, + "learning_rate": 1.8600779312787816e-05, + "loss": 4.9998, + "step": 790 + }, + { + "epoch": 0.21253985122210414, + "grad_norm": 623164.25, + "learning_rate": 1.858306765851931e-05, + "loss": 4.6969, + "step": 800 + }, + { + "epoch": 0.21519659936238045, + "grad_norm": 849724.3125, + "learning_rate": 1.8565356004250797e-05, + "loss": 5.2992, + "step": 810 + }, + { + "epoch": 0.21785334750265675, + "grad_norm": 1883489.125, + "learning_rate": 1.854764434998229e-05, + "loss": 5.5446, + "step": 820 + }, + { + "epoch": 0.22051009564293306, + "grad_norm": 1473608.5, + "learning_rate": 1.8529932695713782e-05, + "loss": 5.6081, + "step": 830 + }, + { + "epoch": 0.22316684378320936, + "grad_norm": 6046079.5, + "learning_rate": 1.8512221041445274e-05, + "loss": 5.543, + "step": 840 + }, + { + "epoch": 0.22582359192348567, + "grad_norm": 3414641.75, + "learning_rate": 1.8494509387176763e-05, + "loss": 6.5477, + "step": 850 + }, + { + "epoch": 0.22848034006376194, + "grad_norm": 3107066.0, + "learning_rate": 1.8476797732908255e-05, + "loss": 6.6238, + "step": 860 + }, + { + "epoch": 0.23113708820403825, + "grad_norm": 2057658.75, + "learning_rate": 1.8459086078639748e-05, + "loss": 6.6566, + "step": 870 + }, + { + "epoch": 0.23379383634431455, + "grad_norm": 689954.125, + "learning_rate": 1.8441374424371236e-05, + "loss": 5.6908, + "step": 880 + }, + { + "epoch": 0.23645058448459086, + "grad_norm": 5757.73388671875, + "learning_rate": 1.842366277010273e-05, + "loss": 4.5477, + "step": 890 + }, + { + "epoch": 0.23910733262486716, + "grad_norm": 5359.6728515625, + "learning_rate": 1.840595111583422e-05, + "loss": 3.6785, + "step": 900 + }, + { + "epoch": 0.24176408076514347, + "grad_norm": 2013.8673095703125, + "learning_rate": 1.838823946156571e-05, + "loss": 3.519, + "step": 910 + }, + { + "epoch": 0.24442082890541977, + "grad_norm": 6289.10888671875, + "learning_rate": 1.8370527807297202e-05, + "loss": 3.6842, + "step": 920 + }, + { + "epoch": 0.24707757704569608, + "grad_norm": 3089.353759765625, + "learning_rate": 1.8352816153028694e-05, + "loss": 3.6535, + "step": 930 + }, + { + "epoch": 0.24973432518597238, + "grad_norm": 2002.3780517578125, + "learning_rate": 1.8335104498760187e-05, + "loss": 3.5385, + "step": 940 + }, + { + "epoch": 0.25239107332624866, + "grad_norm": 5194.0224609375, + "learning_rate": 1.8317392844491676e-05, + "loss": 3.4652, + "step": 950 + }, + { + "epoch": 0.255047821466525, + "grad_norm": 2200.886962890625, + "learning_rate": 1.8299681190223168e-05, + "loss": 3.6788, + "step": 960 + }, + { + "epoch": 0.25770456960680127, + "grad_norm": 10148.009765625, + "learning_rate": 1.828196953595466e-05, + "loss": 3.7478, + "step": 970 + }, + { + "epoch": 0.2603613177470776, + "grad_norm": 2540.3837890625, + "learning_rate": 1.8264257881686152e-05, + "loss": 3.4836, + "step": 980 + }, + { + "epoch": 0.2630180658873539, + "grad_norm": 2385.15625, + "learning_rate": 1.8246546227417645e-05, + "loss": 3.2733, + "step": 990 + }, + { + "epoch": 0.26567481402763016, + "grad_norm": 8635.650390625, + "learning_rate": 1.8228834573149134e-05, + "loss": 3.4935, + "step": 1000 + }, + { + "epoch": 0.2683315621679065, + "grad_norm": 17405.947265625, + "learning_rate": 1.8211122918880626e-05, + "loss": 3.3743, + "step": 1010 + }, + { + "epoch": 0.27098831030818277, + "grad_norm": 2616.988037109375, + "learning_rate": 1.8193411264612115e-05, + "loss": 4.0444, + "step": 1020 + }, + { + "epoch": 0.2736450584484591, + "grad_norm": 9487.044921875, + "learning_rate": 1.8175699610343607e-05, + "loss": 3.8644, + "step": 1030 + }, + { + "epoch": 0.2763018065887354, + "grad_norm": 681.0313110351562, + "learning_rate": 1.81579879560751e-05, + "loss": 3.2198, + "step": 1040 + }, + { + "epoch": 0.2789585547290117, + "grad_norm": 1654.2945556640625, + "learning_rate": 1.8140276301806588e-05, + "loss": 3.741, + "step": 1050 + }, + { + "epoch": 0.281615302869288, + "grad_norm": 2555.9970703125, + "learning_rate": 1.812256464753808e-05, + "loss": 3.5377, + "step": 1060 + }, + { + "epoch": 0.2842720510095643, + "grad_norm": 1187.751220703125, + "learning_rate": 1.8104852993269573e-05, + "loss": 3.6048, + "step": 1070 + }, + { + "epoch": 0.2869287991498406, + "grad_norm": 2747.8486328125, + "learning_rate": 1.8087141339001065e-05, + "loss": 3.6148, + "step": 1080 + }, + { + "epoch": 0.2895855472901169, + "grad_norm": 624.16650390625, + "learning_rate": 1.8069429684732557e-05, + "loss": 3.0917, + "step": 1090 + }, + { + "epoch": 0.2922422954303932, + "grad_norm": 283.41033935546875, + "learning_rate": 1.8051718030464046e-05, + "loss": 3.4423, + "step": 1100 + }, + { + "epoch": 0.2948990435706695, + "grad_norm": 563.9237670898438, + "learning_rate": 1.8034006376195538e-05, + "loss": 3.1134, + "step": 1110 + }, + { + "epoch": 0.2975557917109458, + "grad_norm": 419.8347473144531, + "learning_rate": 1.801629472192703e-05, + "loss": 3.3765, + "step": 1120 + }, + { + "epoch": 0.3002125398512221, + "grad_norm": 328.199462890625, + "learning_rate": 1.7998583067658523e-05, + "loss": 3.1981, + "step": 1130 + }, + { + "epoch": 0.3028692879914984, + "grad_norm": 1167.4515380859375, + "learning_rate": 1.7980871413390012e-05, + "loss": 2.9826, + "step": 1140 + }, + { + "epoch": 0.3055260361317747, + "grad_norm": 1590.5523681640625, + "learning_rate": 1.7963159759121504e-05, + "loss": 3.2378, + "step": 1150 + }, + { + "epoch": 0.30818278427205104, + "grad_norm": 1228.88037109375, + "learning_rate": 1.7945448104852993e-05, + "loss": 3.2167, + "step": 1160 + }, + { + "epoch": 0.3108395324123273, + "grad_norm": 866.290283203125, + "learning_rate": 1.7927736450584485e-05, + "loss": 2.9749, + "step": 1170 + }, + { + "epoch": 0.3134962805526036, + "grad_norm": 326.7938537597656, + "learning_rate": 1.7910024796315977e-05, + "loss": 3.111, + "step": 1180 + }, + { + "epoch": 0.3161530286928799, + "grad_norm": 603.0250854492188, + "learning_rate": 1.789231314204747e-05, + "loss": 3.1647, + "step": 1190 + }, + { + "epoch": 0.3188097768331562, + "grad_norm": 553.5940551757812, + "learning_rate": 1.787460148777896e-05, + "loss": 3.1094, + "step": 1200 + }, + { + "epoch": 0.32146652497343253, + "grad_norm": 417.6220703125, + "learning_rate": 1.785688983351045e-05, + "loss": 3.195, + "step": 1210 + }, + { + "epoch": 0.3241232731137088, + "grad_norm": 745.7908935546875, + "learning_rate": 1.7839178179241943e-05, + "loss": 2.8119, + "step": 1220 + }, + { + "epoch": 0.32678002125398514, + "grad_norm": 963.697021484375, + "learning_rate": 1.7821466524973435e-05, + "loss": 2.9828, + "step": 1230 + }, + { + "epoch": 0.3294367693942614, + "grad_norm": 3789.7373046875, + "learning_rate": 1.7803754870704924e-05, + "loss": 2.8971, + "step": 1240 + }, + { + "epoch": 0.33209351753453775, + "grad_norm": 1777.551025390625, + "learning_rate": 1.7786043216436416e-05, + "loss": 2.8533, + "step": 1250 + }, + { + "epoch": 0.33475026567481403, + "grad_norm": 725.1536254882812, + "learning_rate": 1.776833156216791e-05, + "loss": 2.6644, + "step": 1260 + }, + { + "epoch": 0.3374070138150903, + "grad_norm": 2410.62060546875, + "learning_rate": 1.77506199078994e-05, + "loss": 3.058, + "step": 1270 + }, + { + "epoch": 0.34006376195536664, + "grad_norm": 825.2067260742188, + "learning_rate": 1.7732908253630893e-05, + "loss": 2.7154, + "step": 1280 + }, + { + "epoch": 0.3427205100956429, + "grad_norm": 835.7099609375, + "learning_rate": 1.7715196599362382e-05, + "loss": 3.5358, + "step": 1290 + }, + { + "epoch": 0.34537725823591925, + "grad_norm": 2334.035888671875, + "learning_rate": 1.769748494509387e-05, + "loss": 3.2141, + "step": 1300 + }, + { + "epoch": 0.3480340063761955, + "grad_norm": 1089.702392578125, + "learning_rate": 1.7679773290825363e-05, + "loss": 2.8534, + "step": 1310 + }, + { + "epoch": 0.35069075451647186, + "grad_norm": 643.6981811523438, + "learning_rate": 1.7662061636556856e-05, + "loss": 3.14, + "step": 1320 + }, + { + "epoch": 0.35334750265674814, + "grad_norm": 927.3551025390625, + "learning_rate": 1.7644349982288348e-05, + "loss": 3.255, + "step": 1330 + }, + { + "epoch": 0.35600425079702447, + "grad_norm": 642.1421508789062, + "learning_rate": 1.7626638328019837e-05, + "loss": 2.9875, + "step": 1340 + }, + { + "epoch": 0.35866099893730075, + "grad_norm": 1514.4876708984375, + "learning_rate": 1.760892667375133e-05, + "loss": 2.7786, + "step": 1350 + }, + { + "epoch": 0.361317747077577, + "grad_norm": 2913.84912109375, + "learning_rate": 1.759121501948282e-05, + "loss": 2.83, + "step": 1360 + }, + { + "epoch": 0.36397449521785336, + "grad_norm": 1152.3695068359375, + "learning_rate": 1.7573503365214314e-05, + "loss": 3.316, + "step": 1370 + }, + { + "epoch": 0.36663124335812963, + "grad_norm": 2364.73876953125, + "learning_rate": 1.7555791710945806e-05, + "loss": 3.1473, + "step": 1380 + }, + { + "epoch": 0.36928799149840597, + "grad_norm": 1560.827392578125, + "learning_rate": 1.7538080056677295e-05, + "loss": 2.875, + "step": 1390 + }, + { + "epoch": 0.37194473963868224, + "grad_norm": 672.7749633789062, + "learning_rate": 1.7520368402408787e-05, + "loss": 3.3416, + "step": 1400 + }, + { + "epoch": 0.3746014877789586, + "grad_norm": 3212.583740234375, + "learning_rate": 1.750265674814028e-05, + "loss": 2.6347, + "step": 1410 + }, + { + "epoch": 0.37725823591923485, + "grad_norm": 9892.419921875, + "learning_rate": 1.7484945093871768e-05, + "loss": 2.9356, + "step": 1420 + }, + { + "epoch": 0.3799149840595112, + "grad_norm": 13098.6201171875, + "learning_rate": 1.746723343960326e-05, + "loss": 3.0818, + "step": 1430 + }, + { + "epoch": 0.38257173219978746, + "grad_norm": 33038.46484375, + "learning_rate": 1.7449521785334753e-05, + "loss": 3.4073, + "step": 1440 + }, + { + "epoch": 0.38522848034006374, + "grad_norm": 58945.421875, + "learning_rate": 1.743181013106624e-05, + "loss": 3.4505, + "step": 1450 + }, + { + "epoch": 0.38788522848034007, + "grad_norm": 53823.19921875, + "learning_rate": 1.7414098476797734e-05, + "loss": 3.4398, + "step": 1460 + }, + { + "epoch": 0.39054197662061635, + "grad_norm": 213358.46875, + "learning_rate": 1.7396386822529226e-05, + "loss": 3.1337, + "step": 1470 + }, + { + "epoch": 0.3931987247608927, + "grad_norm": 174113.078125, + "learning_rate": 1.7378675168260718e-05, + "loss": 3.6872, + "step": 1480 + }, + { + "epoch": 0.39585547290116896, + "grad_norm": 110265.9609375, + "learning_rate": 1.7360963513992207e-05, + "loss": 3.5268, + "step": 1490 + }, + { + "epoch": 0.3985122210414453, + "grad_norm": 125626.78125, + "learning_rate": 1.73432518597237e-05, + "loss": 3.8027, + "step": 1500 + }, + { + "epoch": 0.40116896918172157, + "grad_norm": 119383.8359375, + "learning_rate": 1.7325540205455192e-05, + "loss": 3.6381, + "step": 1510 + }, + { + "epoch": 0.4038257173219979, + "grad_norm": 78246.125, + "learning_rate": 1.7307828551186684e-05, + "loss": 3.6688, + "step": 1520 + }, + { + "epoch": 0.4064824654622742, + "grad_norm": 77016.8671875, + "learning_rate": 1.7290116896918173e-05, + "loss": 3.7796, + "step": 1530 + }, + { + "epoch": 0.40913921360255046, + "grad_norm": 471759.21875, + "learning_rate": 1.7272405242649665e-05, + "loss": 3.738, + "step": 1540 + }, + { + "epoch": 0.4117959617428268, + "grad_norm": 108969.1171875, + "learning_rate": 1.7254693588381157e-05, + "loss": 3.4583, + "step": 1550 + }, + { + "epoch": 0.41445270988310307, + "grad_norm": 44717.91015625, + "learning_rate": 1.7236981934112646e-05, + "loss": 3.0156, + "step": 1560 + }, + { + "epoch": 0.4171094580233794, + "grad_norm": 56418.765625, + "learning_rate": 1.721927027984414e-05, + "loss": 3.339, + "step": 1570 + }, + { + "epoch": 0.4197662061636557, + "grad_norm": 82086.234375, + "learning_rate": 1.720155862557563e-05, + "loss": 3.2477, + "step": 1580 + }, + { + "epoch": 0.422422954303932, + "grad_norm": 38437.12890625, + "learning_rate": 1.718384697130712e-05, + "loss": 3.0923, + "step": 1590 + }, + { + "epoch": 0.4250797024442083, + "grad_norm": 64070.26953125, + "learning_rate": 1.7166135317038612e-05, + "loss": 3.8784, + "step": 1600 + }, + { + "epoch": 0.4277364505844846, + "grad_norm": 96363.0078125, + "learning_rate": 1.7148423662770104e-05, + "loss": 3.1945, + "step": 1610 + }, + { + "epoch": 0.4303931987247609, + "grad_norm": 101021.7578125, + "learning_rate": 1.7130712008501596e-05, + "loss": 2.9785, + "step": 1620 + }, + { + "epoch": 0.43304994686503717, + "grad_norm": 33741.50390625, + "learning_rate": 1.7113000354233085e-05, + "loss": 3.0544, + "step": 1630 + }, + { + "epoch": 0.4357066950053135, + "grad_norm": 18486.07421875, + "learning_rate": 1.7095288699964578e-05, + "loss": 3.3951, + "step": 1640 + }, + { + "epoch": 0.4383634431455898, + "grad_norm": 141817.4375, + "learning_rate": 1.707757704569607e-05, + "loss": 3.8719, + "step": 1650 + }, + { + "epoch": 0.4410201912858661, + "grad_norm": 18356.125, + "learning_rate": 1.7059865391427562e-05, + "loss": 3.217, + "step": 1660 + }, + { + "epoch": 0.4436769394261424, + "grad_norm": 75286.890625, + "learning_rate": 1.7042153737159054e-05, + "loss": 3.2279, + "step": 1670 + }, + { + "epoch": 0.4463336875664187, + "grad_norm": 93692.8671875, + "learning_rate": 1.7024442082890543e-05, + "loss": 3.3421, + "step": 1680 + }, + { + "epoch": 0.448990435706695, + "grad_norm": 137171.109375, + "learning_rate": 1.7006730428622032e-05, + "loss": 3.4727, + "step": 1690 + }, + { + "epoch": 0.45164718384697133, + "grad_norm": 143812.296875, + "learning_rate": 1.6989018774353524e-05, + "loss": 3.24, + "step": 1700 + }, + { + "epoch": 0.4543039319872476, + "grad_norm": 35345.19921875, + "learning_rate": 1.6971307120085017e-05, + "loss": 3.2903, + "step": 1710 + }, + { + "epoch": 0.4569606801275239, + "grad_norm": 69917.4375, + "learning_rate": 1.695359546581651e-05, + "loss": 3.1309, + "step": 1720 + }, + { + "epoch": 0.4596174282678002, + "grad_norm": 71451.5859375, + "learning_rate": 1.6935883811547998e-05, + "loss": 3.8151, + "step": 1730 + }, + { + "epoch": 0.4622741764080765, + "grad_norm": 54897.4375, + "learning_rate": 1.691817215727949e-05, + "loss": 3.7961, + "step": 1740 + }, + { + "epoch": 0.46493092454835283, + "grad_norm": 42574.12109375, + "learning_rate": 1.6900460503010982e-05, + "loss": 3.3018, + "step": 1750 + }, + { + "epoch": 0.4675876726886291, + "grad_norm": 118568.609375, + "learning_rate": 1.6882748848742475e-05, + "loss": 3.4044, + "step": 1760 + }, + { + "epoch": 0.47024442082890544, + "grad_norm": 141536.96875, + "learning_rate": 1.6865037194473967e-05, + "loss": 3.5705, + "step": 1770 + }, + { + "epoch": 0.4729011689691817, + "grad_norm": 153274.9375, + "learning_rate": 1.6847325540205456e-05, + "loss": 3.7034, + "step": 1780 + }, + { + "epoch": 0.47555791710945805, + "grad_norm": 121872.7890625, + "learning_rate": 1.6829613885936948e-05, + "loss": 3.6836, + "step": 1790 + }, + { + "epoch": 0.4782146652497343, + "grad_norm": 101665.6640625, + "learning_rate": 1.681190223166844e-05, + "loss": 3.5983, + "step": 1800 + }, + { + "epoch": 0.4808714133900106, + "grad_norm": 212873.5, + "learning_rate": 1.6794190577399933e-05, + "loss": 3.3915, + "step": 1810 + }, + { + "epoch": 0.48352816153028694, + "grad_norm": 19234.345703125, + "learning_rate": 1.677647892313142e-05, + "loss": 3.1403, + "step": 1820 + }, + { + "epoch": 0.4861849096705632, + "grad_norm": 126968.46875, + "learning_rate": 1.6758767268862914e-05, + "loss": 3.3559, + "step": 1830 + }, + { + "epoch": 0.48884165781083955, + "grad_norm": 40483.28515625, + "learning_rate": 1.6741055614594403e-05, + "loss": 3.4042, + "step": 1840 + }, + { + "epoch": 0.4914984059511158, + "grad_norm": 281826.84375, + "learning_rate": 1.6723343960325895e-05, + "loss": 3.5656, + "step": 1850 + }, + { + "epoch": 0.49415515409139216, + "grad_norm": 112396.421875, + "learning_rate": 1.6705632306057387e-05, + "loss": 3.5217, + "step": 1860 + }, + { + "epoch": 0.49681190223166843, + "grad_norm": 430567.96875, + "learning_rate": 1.668792065178888e-05, + "loss": 3.784, + "step": 1870 + }, + { + "epoch": 0.49946865037194477, + "grad_norm": 19857.708984375, + "learning_rate": 1.667020899752037e-05, + "loss": 3.2844, + "step": 1880 + }, + { + "epoch": 0.502125398512221, + "grad_norm": 153824.828125, + "learning_rate": 1.665249734325186e-05, + "loss": 3.5734, + "step": 1890 + }, + { + "epoch": 0.5047821466524973, + "grad_norm": 555864.875, + "learning_rate": 1.6634785688983353e-05, + "loss": 3.5042, + "step": 1900 + }, + { + "epoch": 0.5074388947927736, + "grad_norm": 1425396.625, + "learning_rate": 1.6617074034714845e-05, + "loss": 3.8919, + "step": 1910 + }, + { + "epoch": 0.51009564293305, + "grad_norm": 1588321.5, + "learning_rate": 1.6599362380446334e-05, + "loss": 3.7013, + "step": 1920 + }, + { + "epoch": 0.5127523910733263, + "grad_norm": 843313.25, + "learning_rate": 1.6581650726177826e-05, + "loss": 4.0527, + "step": 1930 + }, + { + "epoch": 0.5154091392136025, + "grad_norm": 121270.0859375, + "learning_rate": 1.656393907190932e-05, + "loss": 3.6732, + "step": 1940 + }, + { + "epoch": 0.5180658873538788, + "grad_norm": 194603.609375, + "learning_rate": 1.654622741764081e-05, + "loss": 3.5416, + "step": 1950 + }, + { + "epoch": 0.5207226354941552, + "grad_norm": 103689.84375, + "learning_rate": 1.65285157633723e-05, + "loss": 3.6058, + "step": 1960 + }, + { + "epoch": 0.5233793836344315, + "grad_norm": 148743.953125, + "learning_rate": 1.6510804109103792e-05, + "loss": 3.4376, + "step": 1970 + }, + { + "epoch": 0.5260361317747078, + "grad_norm": 23079.94140625, + "learning_rate": 1.649309245483528e-05, + "loss": 3.524, + "step": 1980 + }, + { + "epoch": 0.528692879914984, + "grad_norm": 12263.953125, + "learning_rate": 1.6475380800566773e-05, + "loss": 3.1242, + "step": 1990 + }, + { + "epoch": 0.5313496280552603, + "grad_norm": 270958.5625, + "learning_rate": 1.6457669146298265e-05, + "loss": 3.8531, + "step": 2000 + }, + { + "epoch": 0.5340063761955367, + "grad_norm": 145561.640625, + "learning_rate": 1.6439957492029758e-05, + "loss": 3.104, + "step": 2010 + }, + { + "epoch": 0.536663124335813, + "grad_norm": 104717.5625, + "learning_rate": 1.6422245837761247e-05, + "loss": 3.3674, + "step": 2020 + }, + { + "epoch": 0.5393198724760893, + "grad_norm": 112249.3515625, + "learning_rate": 1.640453418349274e-05, + "loss": 3.2119, + "step": 2030 + }, + { + "epoch": 0.5419766206163655, + "grad_norm": 131700.71875, + "learning_rate": 1.638682252922423e-05, + "loss": 3.6448, + "step": 2040 + }, + { + "epoch": 0.5446333687566419, + "grad_norm": 119026.4140625, + "learning_rate": 1.6369110874955723e-05, + "loss": 3.0097, + "step": 2050 + }, + { + "epoch": 0.5472901168969182, + "grad_norm": 103121.09375, + "learning_rate": 1.6351399220687216e-05, + "loss": 3.4205, + "step": 2060 + }, + { + "epoch": 0.5499468650371945, + "grad_norm": 237787.03125, + "learning_rate": 1.6333687566418704e-05, + "loss": 3.349, + "step": 2070 + }, + { + "epoch": 0.5526036131774708, + "grad_norm": 49652.95703125, + "learning_rate": 1.6315975912150197e-05, + "loss": 3.1665, + "step": 2080 + }, + { + "epoch": 0.555260361317747, + "grad_norm": 262178.34375, + "learning_rate": 1.629826425788169e-05, + "loss": 3.4743, + "step": 2090 + }, + { + "epoch": 0.5579171094580234, + "grad_norm": 130814.703125, + "learning_rate": 1.6280552603613178e-05, + "loss": 3.4995, + "step": 2100 + }, + { + "epoch": 0.5605738575982997, + "grad_norm": 273671.09375, + "learning_rate": 1.626284094934467e-05, + "loss": 3.3983, + "step": 2110 + }, + { + "epoch": 0.563230605738576, + "grad_norm": 385060.25, + "learning_rate": 1.6245129295076162e-05, + "loss": 3.5215, + "step": 2120 + }, + { + "epoch": 0.5658873538788523, + "grad_norm": 165007.71875, + "learning_rate": 1.622741764080765e-05, + "loss": 3.1164, + "step": 2130 + }, + { + "epoch": 0.5685441020191286, + "grad_norm": 70266.53125, + "learning_rate": 1.6209705986539144e-05, + "loss": 3.1971, + "step": 2140 + }, + { + "epoch": 0.5712008501594049, + "grad_norm": 271687.3125, + "learning_rate": 1.6191994332270636e-05, + "loss": 3.4932, + "step": 2150 + }, + { + "epoch": 0.5738575982996812, + "grad_norm": 35143.67578125, + "learning_rate": 1.6174282678002128e-05, + "loss": 3.4214, + "step": 2160 + }, + { + "epoch": 0.5765143464399575, + "grad_norm": 1173879.625, + "learning_rate": 1.6156571023733617e-05, + "loss": 3.3194, + "step": 2170 + }, + { + "epoch": 0.5791710945802337, + "grad_norm": 306067.03125, + "learning_rate": 1.613885936946511e-05, + "loss": 3.1417, + "step": 2180 + }, + { + "epoch": 0.5818278427205101, + "grad_norm": 342329.0625, + "learning_rate": 1.61211477151966e-05, + "loss": 3.355, + "step": 2190 + }, + { + "epoch": 0.5844845908607864, + "grad_norm": 50600.97265625, + "learning_rate": 1.6103436060928094e-05, + "loss": 3.3974, + "step": 2200 + }, + { + "epoch": 0.5871413390010627, + "grad_norm": 360589.03125, + "learning_rate": 1.6085724406659583e-05, + "loss": 3.4514, + "step": 2210 + }, + { + "epoch": 0.589798087141339, + "grad_norm": 94335.3828125, + "learning_rate": 1.6068012752391075e-05, + "loss": 3.2719, + "step": 2220 + }, + { + "epoch": 0.5924548352816154, + "grad_norm": 53790.76953125, + "learning_rate": 1.6050301098122564e-05, + "loss": 3.3992, + "step": 2230 + }, + { + "epoch": 0.5951115834218916, + "grad_norm": 107421.421875, + "learning_rate": 1.6032589443854056e-05, + "loss": 3.3512, + "step": 2240 + }, + { + "epoch": 0.5977683315621679, + "grad_norm": 142487.859375, + "learning_rate": 1.601487778958555e-05, + "loss": 3.826, + "step": 2250 + }, + { + "epoch": 0.6004250797024442, + "grad_norm": 1261580.75, + "learning_rate": 1.599716613531704e-05, + "loss": 3.7385, + "step": 2260 + }, + { + "epoch": 0.6030818278427205, + "grad_norm": 648111.0, + "learning_rate": 1.597945448104853e-05, + "loss": 3.2839, + "step": 2270 + }, + { + "epoch": 0.6057385759829969, + "grad_norm": 326968.125, + "learning_rate": 1.5961742826780022e-05, + "loss": 3.7895, + "step": 2280 + }, + { + "epoch": 0.6083953241232731, + "grad_norm": 808961.5625, + "learning_rate": 1.5944031172511514e-05, + "loss": 3.7051, + "step": 2290 + }, + { + "epoch": 0.6110520722635494, + "grad_norm": 2958079.0, + "learning_rate": 1.5926319518243006e-05, + "loss": 3.8099, + "step": 2300 + }, + { + "epoch": 0.6137088204038257, + "grad_norm": 314874.03125, + "learning_rate": 1.5908607863974495e-05, + "loss": 3.6654, + "step": 2310 + }, + { + "epoch": 0.6163655685441021, + "grad_norm": 8078548.0, + "learning_rate": 1.5890896209705987e-05, + "loss": 4.018, + "step": 2320 + }, + { + "epoch": 0.6190223166843783, + "grad_norm": 135695.46875, + "learning_rate": 1.587318455543748e-05, + "loss": 3.6651, + "step": 2330 + }, + { + "epoch": 0.6216790648246546, + "grad_norm": 18501240.0, + "learning_rate": 1.5855472901168972e-05, + "loss": 4.0069, + "step": 2340 + }, + { + "epoch": 0.6243358129649309, + "grad_norm": 4980981.5, + "learning_rate": 1.5837761246900464e-05, + "loss": 3.9174, + "step": 2350 + }, + { + "epoch": 0.6269925611052072, + "grad_norm": 1297274.125, + "learning_rate": 1.5820049592631953e-05, + "loss": 3.3223, + "step": 2360 + }, + { + "epoch": 0.6296493092454836, + "grad_norm": 1378757.625, + "learning_rate": 1.5802337938363442e-05, + "loss": 3.7712, + "step": 2370 + }, + { + "epoch": 0.6323060573857598, + "grad_norm": 2027859.875, + "learning_rate": 1.5784626284094934e-05, + "loss": 3.5468, + "step": 2380 + }, + { + "epoch": 0.6349628055260361, + "grad_norm": 157107.65625, + "learning_rate": 1.5766914629826427e-05, + "loss": 3.5328, + "step": 2390 + }, + { + "epoch": 0.6376195536663124, + "grad_norm": 1103094.75, + "learning_rate": 1.574920297555792e-05, + "loss": 3.6031, + "step": 2400 + }, + { + "epoch": 0.6402763018065888, + "grad_norm": 725449.5, + "learning_rate": 1.5731491321289408e-05, + "loss": 3.9276, + "step": 2410 + }, + { + "epoch": 0.6429330499468651, + "grad_norm": 214425.640625, + "learning_rate": 1.57137796670209e-05, + "loss": 3.517, + "step": 2420 + }, + { + "epoch": 0.6455897980871413, + "grad_norm": 876419.625, + "learning_rate": 1.5696068012752392e-05, + "loss": 3.4675, + "step": 2430 + }, + { + "epoch": 0.6482465462274176, + "grad_norm": 1504300.25, + "learning_rate": 1.5678356358483884e-05, + "loss": 3.4772, + "step": 2440 + }, + { + "epoch": 0.6509032943676939, + "grad_norm": 144657.71875, + "learning_rate": 1.5660644704215377e-05, + "loss": 3.42, + "step": 2450 + }, + { + "epoch": 0.6535600425079703, + "grad_norm": 371512.40625, + "learning_rate": 1.5642933049946866e-05, + "loss": 3.6802, + "step": 2460 + }, + { + "epoch": 0.6562167906482466, + "grad_norm": 1322714.5, + "learning_rate": 1.5625221395678358e-05, + "loss": 3.805, + "step": 2470 + }, + { + "epoch": 0.6588735387885228, + "grad_norm": 218897.765625, + "learning_rate": 1.560750974140985e-05, + "loss": 3.252, + "step": 2480 + }, + { + "epoch": 0.6615302869287991, + "grad_norm": 1596077.0, + "learning_rate": 1.5589798087141342e-05, + "loss": 3.626, + "step": 2490 + }, + { + "epoch": 0.6641870350690755, + "grad_norm": 2922875.75, + "learning_rate": 1.557208643287283e-05, + "loss": 3.5045, + "step": 2500 + }, + { + "epoch": 0.6668437832093518, + "grad_norm": 96812.5859375, + "learning_rate": 1.5554374778604324e-05, + "loss": 3.7078, + "step": 2510 + }, + { + "epoch": 0.6695005313496281, + "grad_norm": 1580814.125, + "learning_rate": 1.5536663124335812e-05, + "loss": 3.615, + "step": 2520 + }, + { + "epoch": 0.6721572794899043, + "grad_norm": 235169.53125, + "learning_rate": 1.5518951470067305e-05, + "loss": 3.5076, + "step": 2530 + }, + { + "epoch": 0.6748140276301806, + "grad_norm": 816632.0, + "learning_rate": 1.5501239815798797e-05, + "loss": 4.0074, + "step": 2540 + }, + { + "epoch": 0.677470775770457, + "grad_norm": 3783126.5, + "learning_rate": 1.548352816153029e-05, + "loss": 3.7162, + "step": 2550 + }, + { + "epoch": 0.6801275239107333, + "grad_norm": 1676969.875, + "learning_rate": 1.5465816507261778e-05, + "loss": 3.9383, + "step": 2560 + }, + { + "epoch": 0.6827842720510096, + "grad_norm": 944205.0, + "learning_rate": 1.544810485299327e-05, + "loss": 3.6335, + "step": 2570 + }, + { + "epoch": 0.6854410201912858, + "grad_norm": 532299.0, + "learning_rate": 1.5430393198724763e-05, + "loss": 3.776, + "step": 2580 + }, + { + "epoch": 0.6880977683315622, + "grad_norm": 324683.46875, + "learning_rate": 1.5412681544456255e-05, + "loss": 4.0332, + "step": 2590 + }, + { + "epoch": 0.6907545164718385, + "grad_norm": 371158.6875, + "learning_rate": 1.5394969890187744e-05, + "loss": 3.2831, + "step": 2600 + }, + { + "epoch": 0.6934112646121148, + "grad_norm": 626177.8125, + "learning_rate": 1.5377258235919236e-05, + "loss": 3.7419, + "step": 2610 + }, + { + "epoch": 0.696068012752391, + "grad_norm": 489480.3125, + "learning_rate": 1.535954658165073e-05, + "loss": 3.9135, + "step": 2620 + }, + { + "epoch": 0.6987247608926673, + "grad_norm": 840057.5625, + "learning_rate": 1.534183492738222e-05, + "loss": 3.6214, + "step": 2630 + }, + { + "epoch": 0.7013815090329437, + "grad_norm": 641658.4375, + "learning_rate": 1.532412327311371e-05, + "loss": 3.9029, + "step": 2640 + }, + { + "epoch": 0.70403825717322, + "grad_norm": 1129191.0, + "learning_rate": 1.5306411618845202e-05, + "loss": 3.6271, + "step": 2650 + }, + { + "epoch": 0.7066950053134963, + "grad_norm": 758676.8125, + "learning_rate": 1.528869996457669e-05, + "loss": 3.8411, + "step": 2660 + }, + { + "epoch": 0.7093517534537725, + "grad_norm": 946755.25, + "learning_rate": 1.5270988310308183e-05, + "loss": 3.8184, + "step": 2670 + }, + { + "epoch": 0.7120085015940489, + "grad_norm": 1282365.625, + "learning_rate": 1.5253276656039675e-05, + "loss": 3.8393, + "step": 2680 + }, + { + "epoch": 0.7146652497343252, + "grad_norm": 1212575.875, + "learning_rate": 1.5235565001771166e-05, + "loss": 3.6106, + "step": 2690 + }, + { + "epoch": 0.7173219978746015, + "grad_norm": 2197153.75, + "learning_rate": 1.5217853347502658e-05, + "loss": 3.5554, + "step": 2700 + }, + { + "epoch": 0.7199787460148778, + "grad_norm": 621252.1875, + "learning_rate": 1.520014169323415e-05, + "loss": 3.3832, + "step": 2710 + }, + { + "epoch": 0.722635494155154, + "grad_norm": 243552.59375, + "learning_rate": 1.5182430038965641e-05, + "loss": 3.4785, + "step": 2720 + }, + { + "epoch": 0.7252922422954304, + "grad_norm": 3559921.0, + "learning_rate": 1.5164718384697133e-05, + "loss": 3.7972, + "step": 2730 + }, + { + "epoch": 0.7279489904357067, + "grad_norm": 8816077.0, + "learning_rate": 1.5147006730428624e-05, + "loss": 3.6698, + "step": 2740 + }, + { + "epoch": 0.730605738575983, + "grad_norm": 2959412.0, + "learning_rate": 1.5129295076160116e-05, + "loss": 3.9389, + "step": 2750 + }, + { + "epoch": 0.7332624867162593, + "grad_norm": 13276429.0, + "learning_rate": 1.5111583421891607e-05, + "loss": 3.6811, + "step": 2760 + }, + { + "epoch": 0.7359192348565357, + "grad_norm": 24583468.0, + "learning_rate": 1.5093871767623095e-05, + "loss": 3.9955, + "step": 2770 + }, + { + "epoch": 0.7385759829968119, + "grad_norm": 11388400.0, + "learning_rate": 1.5076160113354588e-05, + "loss": 3.4851, + "step": 2780 + }, + { + "epoch": 0.7412327311370882, + "grad_norm": 2901875.5, + "learning_rate": 1.5058448459086078e-05, + "loss": 4.0118, + "step": 2790 + }, + { + "epoch": 0.7438894792773645, + "grad_norm": 7893670.0, + "learning_rate": 1.504073680481757e-05, + "loss": 4.3674, + "step": 2800 + }, + { + "epoch": 0.7465462274176408, + "grad_norm": 13170602.0, + "learning_rate": 1.5023025150549063e-05, + "loss": 3.5882, + "step": 2810 + }, + { + "epoch": 0.7492029755579172, + "grad_norm": 12720932.0, + "learning_rate": 1.5005313496280553e-05, + "loss": 4.7013, + "step": 2820 + }, + { + "epoch": 0.7518597236981934, + "grad_norm": 7461363.0, + "learning_rate": 1.4987601842012046e-05, + "loss": 3.5194, + "step": 2830 + }, + { + "epoch": 0.7545164718384697, + "grad_norm": 3747000.25, + "learning_rate": 1.4969890187743536e-05, + "loss": 3.9811, + "step": 2840 + }, + { + "epoch": 0.757173219978746, + "grad_norm": 2111091.0, + "learning_rate": 1.4952178533475028e-05, + "loss": 3.3212, + "step": 2850 + }, + { + "epoch": 0.7598299681190224, + "grad_norm": 4919647.5, + "learning_rate": 1.4934466879206519e-05, + "loss": 4.0383, + "step": 2860 + }, + { + "epoch": 0.7624867162592986, + "grad_norm": 3595169.25, + "learning_rate": 1.4916755224938011e-05, + "loss": 3.7293, + "step": 2870 + }, + { + "epoch": 0.7651434643995749, + "grad_norm": 1647251.75, + "learning_rate": 1.4899043570669502e-05, + "loss": 4.166, + "step": 2880 + }, + { + "epoch": 0.7678002125398512, + "grad_norm": 4398145.0, + "learning_rate": 1.4881331916400994e-05, + "loss": 3.4454, + "step": 2890 + }, + { + "epoch": 0.7704569606801275, + "grad_norm": 3135213.0, + "learning_rate": 1.4863620262132485e-05, + "loss": 4.0135, + "step": 2900 + }, + { + "epoch": 0.7731137088204039, + "grad_norm": 7072787.0, + "learning_rate": 1.4845908607863975e-05, + "loss": 3.4145, + "step": 2910 + }, + { + "epoch": 0.7757704569606801, + "grad_norm": 2635511.75, + "learning_rate": 1.4828196953595466e-05, + "loss": 3.8201, + "step": 2920 + }, + { + "epoch": 0.7784272051009564, + "grad_norm": 4616754.5, + "learning_rate": 1.4810485299326958e-05, + "loss": 4.1764, + "step": 2930 + }, + { + "epoch": 0.7810839532412327, + "grad_norm": 877153.0, + "learning_rate": 1.4792773645058449e-05, + "loss": 3.9471, + "step": 2940 + }, + { + "epoch": 0.7837407013815091, + "grad_norm": 569671.3125, + "learning_rate": 1.4775061990789941e-05, + "loss": 3.7697, + "step": 2950 + }, + { + "epoch": 0.7863974495217854, + "grad_norm": 810236.125, + "learning_rate": 1.4757350336521432e-05, + "loss": 4.4753, + "step": 2960 + }, + { + "epoch": 0.7890541976620616, + "grad_norm": 877906.875, + "learning_rate": 1.4739638682252924e-05, + "loss": 3.6654, + "step": 2970 + }, + { + "epoch": 0.7917109458023379, + "grad_norm": 481885.46875, + "learning_rate": 1.4721927027984414e-05, + "loss": 4.1253, + "step": 2980 + }, + { + "epoch": 0.7943676939426142, + "grad_norm": 1338787.0, + "learning_rate": 1.4704215373715907e-05, + "loss": 4.0294, + "step": 2990 + }, + { + "epoch": 0.7970244420828906, + "grad_norm": 1250065.875, + "learning_rate": 1.4686503719447397e-05, + "loss": 4.7282, + "step": 3000 + }, + { + "epoch": 0.7996811902231669, + "grad_norm": 1604171.375, + "learning_rate": 1.466879206517889e-05, + "loss": 4.0439, + "step": 3010 + }, + { + "epoch": 0.8023379383634431, + "grad_norm": 512070.90625, + "learning_rate": 1.4651080410910382e-05, + "loss": 3.5779, + "step": 3020 + }, + { + "epoch": 0.8049946865037194, + "grad_norm": 312113.46875, + "learning_rate": 1.4633368756641872e-05, + "loss": 3.6514, + "step": 3030 + }, + { + "epoch": 0.8076514346439958, + "grad_norm": 23779.923828125, + "learning_rate": 1.4615657102373361e-05, + "loss": 3.8136, + "step": 3040 + }, + { + "epoch": 0.8103081827842721, + "grad_norm": 8204.794921875, + "learning_rate": 1.4597945448104854e-05, + "loss": 4.1336, + "step": 3050 + }, + { + "epoch": 0.8129649309245484, + "grad_norm": 76479.1640625, + "learning_rate": 1.4580233793836344e-05, + "loss": 3.4411, + "step": 3060 + }, + { + "epoch": 0.8156216790648246, + "grad_norm": 66624.71875, + "learning_rate": 1.4562522139567836e-05, + "loss": 3.8493, + "step": 3070 + }, + { + "epoch": 0.8182784272051009, + "grad_norm": 22607.904296875, + "learning_rate": 1.4544810485299327e-05, + "loss": 3.2428, + "step": 3080 + }, + { + "epoch": 0.8209351753453773, + "grad_norm": 119469.640625, + "learning_rate": 1.452709883103082e-05, + "loss": 3.4363, + "step": 3090 + }, + { + "epoch": 0.8235919234856536, + "grad_norm": 108868.203125, + "learning_rate": 1.4509387176762311e-05, + "loss": 3.5903, + "step": 3100 + }, + { + "epoch": 0.8262486716259299, + "grad_norm": 5543388.0, + "learning_rate": 1.4491675522493802e-05, + "loss": 3.7918, + "step": 3110 + }, + { + "epoch": 0.8289054197662061, + "grad_norm": 2565445.75, + "learning_rate": 1.4473963868225294e-05, + "loss": 3.8573, + "step": 3120 + }, + { + "epoch": 0.8315621679064825, + "grad_norm": 702086.4375, + "learning_rate": 1.4456252213956785e-05, + "loss": 3.3944, + "step": 3130 + }, + { + "epoch": 0.8342189160467588, + "grad_norm": 115243.6484375, + "learning_rate": 1.4438540559688277e-05, + "loss": 3.2222, + "step": 3140 + }, + { + "epoch": 0.8368756641870351, + "grad_norm": 476268.625, + "learning_rate": 1.4420828905419768e-05, + "loss": 3.6144, + "step": 3150 + }, + { + "epoch": 0.8395324123273114, + "grad_norm": 65992.0, + "learning_rate": 1.440311725115126e-05, + "loss": 3.1891, + "step": 3160 + }, + { + "epoch": 0.8421891604675876, + "grad_norm": 1161863.375, + "learning_rate": 1.438540559688275e-05, + "loss": 3.6714, + "step": 3170 + }, + { + "epoch": 0.844845908607864, + "grad_norm": 185466.84375, + "learning_rate": 1.4367693942614241e-05, + "loss": 3.4372, + "step": 3180 + }, + { + "epoch": 0.8475026567481403, + "grad_norm": 56940.96875, + "learning_rate": 1.4349982288345732e-05, + "loss": 3.7385, + "step": 3190 + }, + { + "epoch": 0.8501594048884166, + "grad_norm": 99763.78125, + "learning_rate": 1.4332270634077224e-05, + "loss": 3.5612, + "step": 3200 + }, + { + "epoch": 0.8528161530286928, + "grad_norm": 91525.1328125, + "learning_rate": 1.4314558979808715e-05, + "loss": 3.6116, + "step": 3210 + }, + { + "epoch": 0.8554729011689692, + "grad_norm": 23506.251953125, + "learning_rate": 1.4296847325540207e-05, + "loss": 3.4268, + "step": 3220 + }, + { + "epoch": 0.8581296493092455, + "grad_norm": 36794.52734375, + "learning_rate": 1.4279135671271697e-05, + "loss": 3.7912, + "step": 3230 + }, + { + "epoch": 0.8607863974495218, + "grad_norm": 14971.548828125, + "learning_rate": 1.426142401700319e-05, + "loss": 3.7623, + "step": 3240 + }, + { + "epoch": 0.8634431455897981, + "grad_norm": 29957.119140625, + "learning_rate": 1.424371236273468e-05, + "loss": 3.5765, + "step": 3250 + }, + { + "epoch": 0.8660998937300743, + "grad_norm": 24691.1796875, + "learning_rate": 1.4226000708466172e-05, + "loss": 3.4663, + "step": 3260 + }, + { + "epoch": 0.8687566418703507, + "grad_norm": 21935.2734375, + "learning_rate": 1.4208289054197663e-05, + "loss": 3.6494, + "step": 3270 + }, + { + "epoch": 0.871413390010627, + "grad_norm": 26350.591796875, + "learning_rate": 1.4190577399929155e-05, + "loss": 3.5611, + "step": 3280 + }, + { + "epoch": 0.8740701381509033, + "grad_norm": 30286.142578125, + "learning_rate": 1.4172865745660646e-05, + "loss": 3.7046, + "step": 3290 + }, + { + "epoch": 0.8767268862911796, + "grad_norm": 6965.02734375, + "learning_rate": 1.4155154091392138e-05, + "loss": 3.9012, + "step": 3300 + }, + { + "epoch": 0.879383634431456, + "grad_norm": 34496.1171875, + "learning_rate": 1.4137442437123627e-05, + "loss": 3.5102, + "step": 3310 + }, + { + "epoch": 0.8820403825717322, + "grad_norm": 15867.46875, + "learning_rate": 1.411973078285512e-05, + "loss": 3.9485, + "step": 3320 + }, + { + "epoch": 0.8846971307120085, + "grad_norm": 8408.2509765625, + "learning_rate": 1.410201912858661e-05, + "loss": 4.0955, + "step": 3330 + }, + { + "epoch": 0.8873538788522848, + "grad_norm": 12868.8935546875, + "learning_rate": 1.4084307474318102e-05, + "loss": 3.8902, + "step": 3340 + }, + { + "epoch": 0.8900106269925611, + "grad_norm": 39027.8125, + "learning_rate": 1.4066595820049593e-05, + "loss": 3.7809, + "step": 3350 + }, + { + "epoch": 0.8926673751328374, + "grad_norm": 30144.494140625, + "learning_rate": 1.4048884165781085e-05, + "loss": 3.8368, + "step": 3360 + }, + { + "epoch": 0.8953241232731137, + "grad_norm": 14916.984375, + "learning_rate": 1.4031172511512576e-05, + "loss": 3.8361, + "step": 3370 + }, + { + "epoch": 0.89798087141339, + "grad_norm": 10657.8974609375, + "learning_rate": 1.4013460857244068e-05, + "loss": 3.9388, + "step": 3380 + }, + { + "epoch": 0.9006376195536663, + "grad_norm": 20504.70703125, + "learning_rate": 1.399574920297556e-05, + "loss": 4.257, + "step": 3390 + }, + { + "epoch": 0.9032943676939427, + "grad_norm": 32460.078125, + "learning_rate": 1.397803754870705e-05, + "loss": 4.0817, + "step": 3400 + }, + { + "epoch": 0.905951115834219, + "grad_norm": 6730.14404296875, + "learning_rate": 1.3960325894438543e-05, + "loss": 4.2065, + "step": 3410 + }, + { + "epoch": 0.9086078639744952, + "grad_norm": 17531.017578125, + "learning_rate": 1.3942614240170034e-05, + "loss": 3.5729, + "step": 3420 + }, + { + "epoch": 0.9112646121147715, + "grad_norm": 17859.064453125, + "learning_rate": 1.3924902585901526e-05, + "loss": 4.3419, + "step": 3430 + }, + { + "epoch": 0.9139213602550478, + "grad_norm": 99839.4296875, + "learning_rate": 1.3907190931633016e-05, + "loss": 3.9653, + "step": 3440 + }, + { + "epoch": 0.9165781083953242, + "grad_norm": 13036.796875, + "learning_rate": 1.3889479277364505e-05, + "loss": 3.8463, + "step": 3450 + }, + { + "epoch": 0.9192348565356004, + "grad_norm": 54209.05859375, + "learning_rate": 1.3871767623095998e-05, + "loss": 3.9493, + "step": 3460 + }, + { + "epoch": 0.9218916046758767, + "grad_norm": 227248.34375, + "learning_rate": 1.385405596882749e-05, + "loss": 3.7791, + "step": 3470 + }, + { + "epoch": 0.924548352816153, + "grad_norm": 856476.3125, + "learning_rate": 1.383634431455898e-05, + "loss": 4.2208, + "step": 3480 + }, + { + "epoch": 0.9272051009564294, + "grad_norm": 373248.40625, + "learning_rate": 1.3818632660290473e-05, + "loss": 4.6665, + "step": 3490 + }, + { + "epoch": 0.9298618490967057, + "grad_norm": 476773.1875, + "learning_rate": 1.3800921006021963e-05, + "loss": 4.3373, + "step": 3500 + }, + { + "epoch": 0.9325185972369819, + "grad_norm": 3948952.0, + "learning_rate": 1.3783209351753455e-05, + "loss": 3.9872, + "step": 3510 + }, + { + "epoch": 0.9351753453772582, + "grad_norm": 131342.296875, + "learning_rate": 1.3765497697484946e-05, + "loss": 4.0315, + "step": 3520 + }, + { + "epoch": 0.9378320935175345, + "grad_norm": 1021533.8125, + "learning_rate": 1.3747786043216438e-05, + "loss": 3.898, + "step": 3530 + }, + { + "epoch": 0.9404888416578109, + "grad_norm": 70664288.0, + "learning_rate": 1.3730074388947929e-05, + "loss": 4.0261, + "step": 3540 + }, + { + "epoch": 0.9431455897980872, + "grad_norm": 1955257.25, + "learning_rate": 1.3712362734679421e-05, + "loss": 3.9837, + "step": 3550 + }, + { + "epoch": 0.9458023379383634, + "grad_norm": 10510368.0, + "learning_rate": 1.3694651080410912e-05, + "loss": 4.2089, + "step": 3560 + }, + { + "epoch": 0.9484590860786397, + "grad_norm": 4540049.0, + "learning_rate": 1.3676939426142404e-05, + "loss": 4.0757, + "step": 3570 + }, + { + "epoch": 0.9511158342189161, + "grad_norm": 1934832.5, + "learning_rate": 1.3659227771873893e-05, + "loss": 3.8116, + "step": 3580 + }, + { + "epoch": 0.9537725823591924, + "grad_norm": 721523.875, + "learning_rate": 1.3641516117605385e-05, + "loss": 3.8604, + "step": 3590 + }, + { + "epoch": 0.9564293304994687, + "grad_norm": 3694456.5, + "learning_rate": 1.3623804463336876e-05, + "loss": 4.3438, + "step": 3600 + }, + { + "epoch": 0.9590860786397449, + "grad_norm": 4130751.5, + "learning_rate": 1.3606092809068368e-05, + "loss": 3.7722, + "step": 3610 + }, + { + "epoch": 0.9617428267800212, + "grad_norm": 3232915.5, + "learning_rate": 1.3588381154799859e-05, + "loss": 4.1108, + "step": 3620 + }, + { + "epoch": 0.9643995749202976, + "grad_norm": 5608699.5, + "learning_rate": 1.357066950053135e-05, + "loss": 4.4695, + "step": 3630 + }, + { + "epoch": 0.9670563230605739, + "grad_norm": 37526024.0, + "learning_rate": 1.3552957846262841e-05, + "loss": 3.8838, + "step": 3640 + }, + { + "epoch": 0.9697130712008502, + "grad_norm": 11544401.0, + "learning_rate": 1.3535246191994334e-05, + "loss": 3.7949, + "step": 3650 + }, + { + "epoch": 0.9723698193411264, + "grad_norm": 1559264.75, + "learning_rate": 1.3517534537725824e-05, + "loss": 3.8236, + "step": 3660 + }, + { + "epoch": 0.9750265674814028, + "grad_norm": 10817994.0, + "learning_rate": 1.3499822883457316e-05, + "loss": 4.0035, + "step": 3670 + }, + { + "epoch": 0.9776833156216791, + "grad_norm": 20268342.0, + "learning_rate": 1.3482111229188807e-05, + "loss": 3.6612, + "step": 3680 + }, + { + "epoch": 0.9803400637619554, + "grad_norm": 51181968.0, + "learning_rate": 1.34643995749203e-05, + "loss": 3.7019, + "step": 3690 + }, + { + "epoch": 0.9829968119022316, + "grad_norm": 74098400.0, + "learning_rate": 1.3446687920651792e-05, + "loss": 3.779, + "step": 3700 + }, + { + "epoch": 0.9856535600425079, + "grad_norm": 48340468.0, + "learning_rate": 1.3428976266383282e-05, + "loss": 3.6759, + "step": 3710 + }, + { + "epoch": 0.9883103081827843, + "grad_norm": 8802756.0, + "learning_rate": 1.3411264612114771e-05, + "loss": 3.6238, + "step": 3720 + }, + { + "epoch": 0.9909670563230606, + "grad_norm": 3833086.75, + "learning_rate": 1.3393552957846263e-05, + "loss": 3.3759, + "step": 3730 + }, + { + "epoch": 0.9936238044633369, + "grad_norm": 29499648.0, + "learning_rate": 1.3375841303577754e-05, + "loss": 3.6134, + "step": 3740 + }, + { + "epoch": 0.9962805526036131, + "grad_norm": 6612167.0, + "learning_rate": 1.3358129649309246e-05, + "loss": 3.5491, + "step": 3750 + }, + { + "epoch": 0.9989373007438895, + "grad_norm": 21236494.0, + "learning_rate": 1.3340417995040737e-05, + "loss": 3.7831, + "step": 3760 + }, + { + "epoch": 1.0, + "eval_loss": 3.75178599357605, + "eval_runtime": 744.4128, + "eval_samples_per_second": 20.225, + "eval_steps_per_second": 5.056, + "step": 3764 + }, + { + "epoch": 1.0015940488841657, + "grad_norm": 40179844.0, + "learning_rate": 1.3322706340772229e-05, + "loss": 3.711, + "step": 3770 + }, + { + "epoch": 1.004250797024442, + "grad_norm": 17010662.0, + "learning_rate": 1.3304994686503721e-05, + "loss": 3.4946, + "step": 3780 + }, + { + "epoch": 1.0069075451647185, + "grad_norm": 19932106.0, + "learning_rate": 1.3287283032235212e-05, + "loss": 3.5648, + "step": 3790 + }, + { + "epoch": 1.0095642933049946, + "grad_norm": 5492312.0, + "learning_rate": 1.3269571377966704e-05, + "loss": 4.0635, + "step": 3800 + }, + { + "epoch": 1.012221041445271, + "grad_norm": 192937568.0, + "learning_rate": 1.3251859723698195e-05, + "loss": 3.4178, + "step": 3810 + }, + { + "epoch": 1.0148777895855472, + "grad_norm": 1293443.125, + "learning_rate": 1.3234148069429687e-05, + "loss": 3.9658, + "step": 3820 + }, + { + "epoch": 1.0175345377258236, + "grad_norm": 158162096.0, + "learning_rate": 1.3216436415161178e-05, + "loss": 3.6695, + "step": 3830 + }, + { + "epoch": 1.0201912858661, + "grad_norm": 207503072.0, + "learning_rate": 1.319872476089267e-05, + "loss": 4.1104, + "step": 3840 + }, + { + "epoch": 1.0228480340063761, + "grad_norm": 5859501.0, + "learning_rate": 1.3181013106624159e-05, + "loss": 3.7423, + "step": 3850 + }, + { + "epoch": 1.0255047821466525, + "grad_norm": 65099376.0, + "learning_rate": 1.3163301452355651e-05, + "loss": 3.8122, + "step": 3860 + }, + { + "epoch": 1.0281615302869287, + "grad_norm": 13768734.0, + "learning_rate": 1.3145589798087142e-05, + "loss": 3.8062, + "step": 3870 + }, + { + "epoch": 1.030818278427205, + "grad_norm": 24830612.0, + "learning_rate": 1.3127878143818634e-05, + "loss": 3.5577, + "step": 3880 + }, + { + "epoch": 1.0334750265674815, + "grad_norm": 109977040.0, + "learning_rate": 1.3110166489550124e-05, + "loss": 3.8904, + "step": 3890 + }, + { + "epoch": 1.0361317747077576, + "grad_norm": 22621510.0, + "learning_rate": 1.3092454835281617e-05, + "loss": 3.7924, + "step": 3900 + }, + { + "epoch": 1.038788522848034, + "grad_norm": 15618693.0, + "learning_rate": 1.3074743181013107e-05, + "loss": 3.9009, + "step": 3910 + }, + { + "epoch": 1.0414452709883104, + "grad_norm": 102296992.0, + "learning_rate": 1.30570315267446e-05, + "loss": 4.0488, + "step": 3920 + }, + { + "epoch": 1.0441020191285866, + "grad_norm": 180104320.0, + "learning_rate": 1.303931987247609e-05, + "loss": 4.0832, + "step": 3930 + }, + { + "epoch": 1.046758767268863, + "grad_norm": 8426886.0, + "learning_rate": 1.3021608218207582e-05, + "loss": 3.9811, + "step": 3940 + }, + { + "epoch": 1.0494155154091391, + "grad_norm": 23817282.0, + "learning_rate": 1.3003896563939073e-05, + "loss": 3.5573, + "step": 3950 + }, + { + "epoch": 1.0520722635494155, + "grad_norm": 34805012.0, + "learning_rate": 1.2986184909670565e-05, + "loss": 3.6933, + "step": 3960 + }, + { + "epoch": 1.054729011689692, + "grad_norm": 27546222.0, + "learning_rate": 1.2968473255402056e-05, + "loss": 3.826, + "step": 3970 + }, + { + "epoch": 1.057385759829968, + "grad_norm": 73101112.0, + "learning_rate": 1.2950761601133548e-05, + "loss": 4.3474, + "step": 3980 + }, + { + "epoch": 1.0600425079702445, + "grad_norm": 60012056.0, + "learning_rate": 1.2933049946865037e-05, + "loss": 3.4645, + "step": 3990 + }, + { + "epoch": 1.0626992561105206, + "grad_norm": 10204493.0, + "learning_rate": 1.2915338292596529e-05, + "loss": 3.7942, + "step": 4000 + }, + { + "epoch": 1.065356004250797, + "grad_norm": 67629928.0, + "learning_rate": 1.289762663832802e-05, + "loss": 3.6377, + "step": 4010 + }, + { + "epoch": 1.0680127523910734, + "grad_norm": 31746526.0, + "learning_rate": 1.2879914984059512e-05, + "loss": 3.7846, + "step": 4020 + }, + { + "epoch": 1.0706695005313496, + "grad_norm": 52992448.0, + "learning_rate": 1.2862203329791003e-05, + "loss": 3.2981, + "step": 4030 + }, + { + "epoch": 1.073326248671626, + "grad_norm": 36022592.0, + "learning_rate": 1.2844491675522495e-05, + "loss": 3.6733, + "step": 4040 + }, + { + "epoch": 1.0759829968119021, + "grad_norm": 11422725.0, + "learning_rate": 1.2826780021253985e-05, + "loss": 3.5682, + "step": 4050 + }, + { + "epoch": 1.0786397449521785, + "grad_norm": 77457192.0, + "learning_rate": 1.2809068366985478e-05, + "loss": 3.8538, + "step": 4060 + }, + { + "epoch": 1.081296493092455, + "grad_norm": 109772792.0, + "learning_rate": 1.279135671271697e-05, + "loss": 4.0151, + "step": 4070 + }, + { + "epoch": 1.083953241232731, + "grad_norm": 126942304.0, + "learning_rate": 1.277364505844846e-05, + "loss": 4.418, + "step": 4080 + }, + { + "epoch": 1.0866099893730075, + "grad_norm": 215005632.0, + "learning_rate": 1.2755933404179953e-05, + "loss": 3.6302, + "step": 4090 + }, + { + "epoch": 1.0892667375132838, + "grad_norm": 18895672.0, + "learning_rate": 1.2738221749911443e-05, + "loss": 4.2548, + "step": 4100 + }, + { + "epoch": 1.09192348565356, + "grad_norm": 20576284.0, + "learning_rate": 1.2720510095642936e-05, + "loss": 3.9913, + "step": 4110 + }, + { + "epoch": 1.0945802337938364, + "grad_norm": 90564424.0, + "learning_rate": 1.2702798441374424e-05, + "loss": 3.8335, + "step": 4120 + }, + { + "epoch": 1.0972369819341126, + "grad_norm": 136458144.0, + "learning_rate": 1.2685086787105915e-05, + "loss": 4.0485, + "step": 4130 + }, + { + "epoch": 1.099893730074389, + "grad_norm": 175102016.0, + "learning_rate": 1.2667375132837407e-05, + "loss": 4.1181, + "step": 4140 + }, + { + "epoch": 1.1025504782146653, + "grad_norm": 15060149.0, + "learning_rate": 1.26496634785689e-05, + "loss": 3.753, + "step": 4150 + }, + { + "epoch": 1.1052072263549415, + "grad_norm": 92020808.0, + "learning_rate": 1.263195182430039e-05, + "loss": 3.9935, + "step": 4160 + }, + { + "epoch": 1.107863974495218, + "grad_norm": 133574952.0, + "learning_rate": 1.2614240170031882e-05, + "loss": 4.0376, + "step": 4170 + }, + { + "epoch": 1.110520722635494, + "grad_norm": 69448336.0, + "learning_rate": 1.2596528515763373e-05, + "loss": 3.7264, + "step": 4180 + }, + { + "epoch": 1.1131774707757705, + "grad_norm": 24695358.0, + "learning_rate": 1.2578816861494865e-05, + "loss": 3.6435, + "step": 4190 + }, + { + "epoch": 1.1158342189160468, + "grad_norm": 26981000.0, + "learning_rate": 1.2561105207226356e-05, + "loss": 4.1867, + "step": 4200 + }, + { + "epoch": 1.118490967056323, + "grad_norm": 26429450.0, + "learning_rate": 1.2543393552957848e-05, + "loss": 4.2308, + "step": 4210 + }, + { + "epoch": 1.1211477151965994, + "grad_norm": 75864056.0, + "learning_rate": 1.2525681898689339e-05, + "loss": 4.1067, + "step": 4220 + }, + { + "epoch": 1.1238044633368758, + "grad_norm": 53176204.0, + "learning_rate": 1.2507970244420831e-05, + "loss": 4.3122, + "step": 4230 + }, + { + "epoch": 1.126461211477152, + "grad_norm": 27715404.0, + "learning_rate": 1.2490258590152322e-05, + "loss": 4.0918, + "step": 4240 + }, + { + "epoch": 1.1291179596174283, + "grad_norm": 6029370.0, + "learning_rate": 1.2472546935883814e-05, + "loss": 4.1725, + "step": 4250 + }, + { + "epoch": 1.1317747077577045, + "grad_norm": 26051718.0, + "learning_rate": 1.2454835281615303e-05, + "loss": 3.9757, + "step": 4260 + }, + { + "epoch": 1.134431455897981, + "grad_norm": 77973728.0, + "learning_rate": 1.2437123627346795e-05, + "loss": 3.989, + "step": 4270 + }, + { + "epoch": 1.1370882040382573, + "grad_norm": 11366385.0, + "learning_rate": 1.2419411973078286e-05, + "loss": 4.3978, + "step": 4280 + }, + { + "epoch": 1.1397449521785334, + "grad_norm": 19926490.0, + "learning_rate": 1.2401700318809778e-05, + "loss": 3.7446, + "step": 4290 + }, + { + "epoch": 1.1424017003188098, + "grad_norm": 66211068.0, + "learning_rate": 1.2383988664541268e-05, + "loss": 3.9591, + "step": 4300 + }, + { + "epoch": 1.145058448459086, + "grad_norm": 7617592.5, + "learning_rate": 1.236627701027276e-05, + "loss": 4.2812, + "step": 4310 + }, + { + "epoch": 1.1477151965993624, + "grad_norm": 47218612.0, + "learning_rate": 1.2348565356004251e-05, + "loss": 4.137, + "step": 4320 + }, + { + "epoch": 1.1503719447396388, + "grad_norm": 115950944.0, + "learning_rate": 1.2330853701735743e-05, + "loss": 4.1344, + "step": 4330 + }, + { + "epoch": 1.153028692879915, + "grad_norm": 27328380.0, + "learning_rate": 1.2313142047467234e-05, + "loss": 4.0865, + "step": 4340 + }, + { + "epoch": 1.1556854410201913, + "grad_norm": 8267316.5, + "learning_rate": 1.2295430393198726e-05, + "loss": 4.3048, + "step": 4350 + }, + { + "epoch": 1.1583421891604675, + "grad_norm": 18654644.0, + "learning_rate": 1.2277718738930217e-05, + "loss": 4.4512, + "step": 4360 + }, + { + "epoch": 1.1609989373007439, + "grad_norm": 123494120.0, + "learning_rate": 1.2260007084661709e-05, + "loss": 4.1863, + "step": 4370 + }, + { + "epoch": 1.1636556854410203, + "grad_norm": 87930224.0, + "learning_rate": 1.2242295430393201e-05, + "loss": 4.1395, + "step": 4380 + }, + { + "epoch": 1.1663124335812964, + "grad_norm": 60926568.0, + "learning_rate": 1.222458377612469e-05, + "loss": 3.9975, + "step": 4390 + }, + { + "epoch": 1.1689691817215728, + "grad_norm": 15561844.0, + "learning_rate": 1.2206872121856181e-05, + "loss": 4.1746, + "step": 4400 + }, + { + "epoch": 1.171625929861849, + "grad_norm": 14337786.0, + "learning_rate": 1.2189160467587673e-05, + "loss": 4.0762, + "step": 4410 + }, + { + "epoch": 1.1742826780021254, + "grad_norm": 27260074.0, + "learning_rate": 1.2171448813319164e-05, + "loss": 4.3436, + "step": 4420 + }, + { + "epoch": 1.1769394261424018, + "grad_norm": 14445331.0, + "learning_rate": 1.2153737159050656e-05, + "loss": 3.9788, + "step": 4430 + }, + { + "epoch": 1.179596174282678, + "grad_norm": 21041896.0, + "learning_rate": 1.2136025504782147e-05, + "loss": 4.3681, + "step": 4440 + }, + { + "epoch": 1.1822529224229543, + "grad_norm": 15333385.0, + "learning_rate": 1.2118313850513639e-05, + "loss": 4.1638, + "step": 4450 + }, + { + "epoch": 1.1849096705632305, + "grad_norm": 18882606.0, + "learning_rate": 1.2100602196245131e-05, + "loss": 3.9175, + "step": 4460 + }, + { + "epoch": 1.1875664187035069, + "grad_norm": 6002330.5, + "learning_rate": 1.2082890541976622e-05, + "loss": 4.0274, + "step": 4470 + }, + { + "epoch": 1.1902231668437833, + "grad_norm": 12174502.0, + "learning_rate": 1.2065178887708114e-05, + "loss": 4.0163, + "step": 4480 + }, + { + "epoch": 1.1928799149840594, + "grad_norm": 3046521.75, + "learning_rate": 1.2047467233439604e-05, + "loss": 4.2218, + "step": 4490 + }, + { + "epoch": 1.1955366631243358, + "grad_norm": 7046191.0, + "learning_rate": 1.2029755579171097e-05, + "loss": 3.8047, + "step": 4500 + }, + { + "epoch": 1.1981934112646122, + "grad_norm": 2158310.5, + "learning_rate": 1.2012043924902587e-05, + "loss": 4.0102, + "step": 4510 + }, + { + "epoch": 1.2008501594048884, + "grad_norm": 1953139.875, + "learning_rate": 1.199433227063408e-05, + "loss": 3.9815, + "step": 4520 + }, + { + "epoch": 1.2035069075451648, + "grad_norm": 10403948.0, + "learning_rate": 1.1976620616365568e-05, + "loss": 4.2106, + "step": 4530 + }, + { + "epoch": 1.206163655685441, + "grad_norm": 1701127.5, + "learning_rate": 1.195890896209706e-05, + "loss": 4.1719, + "step": 4540 + }, + { + "epoch": 1.2088204038257173, + "grad_norm": 1922839.625, + "learning_rate": 1.1941197307828551e-05, + "loss": 4.2, + "step": 4550 + }, + { + "epoch": 1.2114771519659937, + "grad_norm": 1249251.375, + "learning_rate": 1.1923485653560044e-05, + "loss": 4.3854, + "step": 4560 + }, + { + "epoch": 1.2141339001062699, + "grad_norm": 3677515.25, + "learning_rate": 1.1905773999291534e-05, + "loss": 4.1928, + "step": 4570 + }, + { + "epoch": 1.2167906482465463, + "grad_norm": 1778515.5, + "learning_rate": 1.1888062345023026e-05, + "loss": 4.282, + "step": 4580 + }, + { + "epoch": 1.2194473963868226, + "grad_norm": 2142989.75, + "learning_rate": 1.1870350690754517e-05, + "loss": 4.0862, + "step": 4590 + }, + { + "epoch": 1.2221041445270988, + "grad_norm": 3376149.5, + "learning_rate": 1.185263903648601e-05, + "loss": 4.9249, + "step": 4600 + }, + { + "epoch": 1.2247608926673752, + "grad_norm": 918137.0625, + "learning_rate": 1.18349273822175e-05, + "loss": 4.4397, + "step": 4610 + }, + { + "epoch": 1.2274176408076514, + "grad_norm": 5548887.5, + "learning_rate": 1.1817215727948992e-05, + "loss": 4.186, + "step": 4620 + }, + { + "epoch": 1.2300743889479278, + "grad_norm": 1206121.0, + "learning_rate": 1.1799504073680483e-05, + "loss": 4.4369, + "step": 4630 + }, + { + "epoch": 1.2327311370882041, + "grad_norm": 1302905.0, + "learning_rate": 1.1781792419411975e-05, + "loss": 4.2492, + "step": 4640 + }, + { + "epoch": 1.2353878852284803, + "grad_norm": 1243181.25, + "learning_rate": 1.1764080765143466e-05, + "loss": 4.3557, + "step": 4650 + }, + { + "epoch": 1.2380446333687567, + "grad_norm": 1636811.25, + "learning_rate": 1.1746369110874956e-05, + "loss": 4.4305, + "step": 4660 + }, + { + "epoch": 1.2407013815090329, + "grad_norm": 3252745.75, + "learning_rate": 1.1728657456606447e-05, + "loss": 4.4447, + "step": 4670 + }, + { + "epoch": 1.2433581296493093, + "grad_norm": 3218180.0, + "learning_rate": 1.1710945802337939e-05, + "loss": 4.1695, + "step": 4680 + }, + { + "epoch": 1.2460148777895856, + "grad_norm": 7251921.5, + "learning_rate": 1.169323414806943e-05, + "loss": 4.0679, + "step": 4690 + }, + { + "epoch": 1.2486716259298618, + "grad_norm": 3886631.0, + "learning_rate": 1.1675522493800922e-05, + "loss": 3.9159, + "step": 4700 + }, + { + "epoch": 1.2513283740701382, + "grad_norm": 2420017.75, + "learning_rate": 1.1657810839532412e-05, + "loss": 4.6458, + "step": 4710 + }, + { + "epoch": 1.2539851222104144, + "grad_norm": 1138159.875, + "learning_rate": 1.1640099185263905e-05, + "loss": 4.078, + "step": 4720 + }, + { + "epoch": 1.2566418703506907, + "grad_norm": 930125.875, + "learning_rate": 1.1622387530995395e-05, + "loss": 4.0812, + "step": 4730 + }, + { + "epoch": 1.2592986184909671, + "grad_norm": 3835148.25, + "learning_rate": 1.1604675876726887e-05, + "loss": 4.1012, + "step": 4740 + }, + { + "epoch": 1.2619553666312433, + "grad_norm": 6243373.5, + "learning_rate": 1.158696422245838e-05, + "loss": 3.8252, + "step": 4750 + }, + { + "epoch": 1.2646121147715197, + "grad_norm": 3021652.25, + "learning_rate": 1.156925256818987e-05, + "loss": 3.9515, + "step": 4760 + }, + { + "epoch": 1.2672688629117959, + "grad_norm": 4503118.5, + "learning_rate": 1.1551540913921363e-05, + "loss": 4.0478, + "step": 4770 + }, + { + "epoch": 1.2699256110520722, + "grad_norm": 5867597.5, + "learning_rate": 1.1533829259652853e-05, + "loss": 4.0726, + "step": 4780 + }, + { + "epoch": 1.2725823591923486, + "grad_norm": 23690828.0, + "learning_rate": 1.1516117605384345e-05, + "loss": 3.5037, + "step": 4790 + }, + { + "epoch": 1.2752391073326248, + "grad_norm": 5260964.5, + "learning_rate": 1.1498405951115834e-05, + "loss": 4.0774, + "step": 4800 + }, + { + "epoch": 1.2778958554729012, + "grad_norm": 4894551.5, + "learning_rate": 1.1480694296847325e-05, + "loss": 3.7113, + "step": 4810 + }, + { + "epoch": 1.2805526036131774, + "grad_norm": 4784902.0, + "learning_rate": 1.1462982642578817e-05, + "loss": 3.886, + "step": 4820 + }, + { + "epoch": 1.2832093517534537, + "grad_norm": 22511842.0, + "learning_rate": 1.144527098831031e-05, + "loss": 3.7413, + "step": 4830 + }, + { + "epoch": 1.2858660998937301, + "grad_norm": 13445524.0, + "learning_rate": 1.14275593340418e-05, + "loss": 4.3171, + "step": 4840 + }, + { + "epoch": 1.2885228480340063, + "grad_norm": 4879641.0, + "learning_rate": 1.1409847679773292e-05, + "loss": 4.0366, + "step": 4850 + }, + { + "epoch": 1.2911795961742827, + "grad_norm": 5458451.0, + "learning_rate": 1.1392136025504783e-05, + "loss": 4.0356, + "step": 4860 + }, + { + "epoch": 1.2938363443145589, + "grad_norm": 1152951.125, + "learning_rate": 1.1374424371236275e-05, + "loss": 3.9322, + "step": 4870 + }, + { + "epoch": 1.2964930924548352, + "grad_norm": 1573109.875, + "learning_rate": 1.1356712716967766e-05, + "loss": 3.5684, + "step": 4880 + }, + { + "epoch": 1.2991498405951116, + "grad_norm": 3557934.25, + "learning_rate": 1.1339001062699258e-05, + "loss": 3.8874, + "step": 4890 + }, + { + "epoch": 1.301806588735388, + "grad_norm": 2637183.5, + "learning_rate": 1.1321289408430748e-05, + "loss": 4.0737, + "step": 4900 + }, + { + "epoch": 1.3044633368756642, + "grad_norm": 1852644.25, + "learning_rate": 1.130357775416224e-05, + "loss": 4.4462, + "step": 4910 + }, + { + "epoch": 1.3071200850159406, + "grad_norm": 7577384.5, + "learning_rate": 1.1285866099893731e-05, + "loss": 3.8546, + "step": 4920 + }, + { + "epoch": 1.3097768331562167, + "grad_norm": 4401453.5, + "learning_rate": 1.1268154445625224e-05, + "loss": 4.0443, + "step": 4930 + }, + { + "epoch": 1.3124335812964931, + "grad_norm": 3643839.75, + "learning_rate": 1.1250442791356712e-05, + "loss": 3.678, + "step": 4940 + }, + { + "epoch": 1.3150903294367695, + "grad_norm": 27145024.0, + "learning_rate": 1.1232731137088205e-05, + "loss": 3.8589, + "step": 4950 + }, + { + "epoch": 1.3177470775770457, + "grad_norm": 1982266.875, + "learning_rate": 1.1215019482819695e-05, + "loss": 3.587, + "step": 4960 + }, + { + "epoch": 1.320403825717322, + "grad_norm": 2339293.25, + "learning_rate": 1.1197307828551188e-05, + "loss": 3.6116, + "step": 4970 + }, + { + "epoch": 1.3230605738575982, + "grad_norm": 21441204.0, + "learning_rate": 1.1179596174282678e-05, + "loss": 3.4365, + "step": 4980 + }, + { + "epoch": 1.3257173219978746, + "grad_norm": 3329228.0, + "learning_rate": 1.116188452001417e-05, + "loss": 4.184, + "step": 4990 + }, + { + "epoch": 1.328374070138151, + "grad_norm": 2602702.75, + "learning_rate": 1.1144172865745661e-05, + "loss": 3.6095, + "step": 5000 + }, + { + "epoch": 1.3310308182784272, + "grad_norm": 62917268.0, + "learning_rate": 1.1126461211477153e-05, + "loss": 3.4086, + "step": 5010 + }, + { + "epoch": 1.3336875664187036, + "grad_norm": 9320738.0, + "learning_rate": 1.1108749557208644e-05, + "loss": 3.8485, + "step": 5020 + }, + { + "epoch": 1.3363443145589797, + "grad_norm": 11171778.0, + "learning_rate": 1.1091037902940136e-05, + "loss": 3.5241, + "step": 5030 + }, + { + "epoch": 1.3390010626992561, + "grad_norm": 13504690.0, + "learning_rate": 1.1073326248671628e-05, + "loss": 3.7951, + "step": 5040 + }, + { + "epoch": 1.3416578108395325, + "grad_norm": 1940023.625, + "learning_rate": 1.1055614594403119e-05, + "loss": 3.938, + "step": 5050 + }, + { + "epoch": 1.3443145589798087, + "grad_norm": 9250230.0, + "learning_rate": 1.1037902940134611e-05, + "loss": 3.6501, + "step": 5060 + }, + { + "epoch": 1.346971307120085, + "grad_norm": 8658494.0, + "learning_rate": 1.10201912858661e-05, + "loss": 3.4101, + "step": 5070 + }, + { + "epoch": 1.3496280552603612, + "grad_norm": 24788584.0, + "learning_rate": 1.100247963159759e-05, + "loss": 3.2665, + "step": 5080 + }, + { + "epoch": 1.3522848034006376, + "grad_norm": 17288262.0, + "learning_rate": 1.0984767977329083e-05, + "loss": 3.9485, + "step": 5090 + }, + { + "epoch": 1.354941551540914, + "grad_norm": 1679803.0, + "learning_rate": 1.0967056323060574e-05, + "loss": 3.7726, + "step": 5100 + }, + { + "epoch": 1.3575982996811902, + "grad_norm": 14593549.0, + "learning_rate": 1.0949344668792066e-05, + "loss": 4.0024, + "step": 5110 + }, + { + "epoch": 1.3602550478214666, + "grad_norm": 4186409.75, + "learning_rate": 1.0931633014523556e-05, + "loss": 3.6818, + "step": 5120 + }, + { + "epoch": 1.3629117959617427, + "grad_norm": 747755.5625, + "learning_rate": 1.0913921360255049e-05, + "loss": 3.4717, + "step": 5130 + }, + { + "epoch": 1.365568544102019, + "grad_norm": 445103.3125, + "learning_rate": 1.0896209705986541e-05, + "loss": 3.4684, + "step": 5140 + }, + { + "epoch": 1.3682252922422955, + "grad_norm": 1250102.625, + "learning_rate": 1.0878498051718031e-05, + "loss": 3.2248, + "step": 5150 + }, + { + "epoch": 1.3708820403825717, + "grad_norm": 532045.3125, + "learning_rate": 1.0860786397449524e-05, + "loss": 3.3662, + "step": 5160 + }, + { + "epoch": 1.373538788522848, + "grad_norm": 454849.5625, + "learning_rate": 1.0843074743181014e-05, + "loss": 3.5507, + "step": 5170 + }, + { + "epoch": 1.3761955366631242, + "grad_norm": 3551179.5, + "learning_rate": 1.0825363088912507e-05, + "loss": 3.2755, + "step": 5180 + }, + { + "epoch": 1.3788522848034006, + "grad_norm": 6700418.0, + "learning_rate": 1.0807651434643997e-05, + "loss": 3.2751, + "step": 5190 + }, + { + "epoch": 1.381509032943677, + "grad_norm": 37462192.0, + "learning_rate": 1.078993978037549e-05, + "loss": 3.5327, + "step": 5200 + }, + { + "epoch": 1.3841657810839532, + "grad_norm": 9333666.0, + "learning_rate": 1.0772228126106978e-05, + "loss": 3.1278, + "step": 5210 + }, + { + "epoch": 1.3868225292242295, + "grad_norm": 16026876.0, + "learning_rate": 1.075451647183847e-05, + "loss": 3.5275, + "step": 5220 + }, + { + "epoch": 1.3894792773645057, + "grad_norm": 24360552.0, + "learning_rate": 1.0736804817569961e-05, + "loss": 3.6815, + "step": 5230 + }, + { + "epoch": 1.392136025504782, + "grad_norm": 12289483.0, + "learning_rate": 1.0719093163301453e-05, + "loss": 3.1039, + "step": 5240 + }, + { + "epoch": 1.3947927736450585, + "grad_norm": 1954500.625, + "learning_rate": 1.0701381509032944e-05, + "loss": 3.3327, + "step": 5250 + }, + { + "epoch": 1.3974495217853349, + "grad_norm": 5957172.5, + "learning_rate": 1.0683669854764436e-05, + "loss": 3.6985, + "step": 5260 + }, + { + "epoch": 1.400106269925611, + "grad_norm": 136582976.0, + "learning_rate": 1.0665958200495927e-05, + "loss": 3.4845, + "step": 5270 + }, + { + "epoch": 1.4027630180658874, + "grad_norm": 21799228.0, + "learning_rate": 1.0648246546227419e-05, + "loss": 3.4648, + "step": 5280 + }, + { + "epoch": 1.4054197662061636, + "grad_norm": 1183856.625, + "learning_rate": 1.063053489195891e-05, + "loss": 3.2929, + "step": 5290 + }, + { + "epoch": 1.40807651434644, + "grad_norm": 28349394.0, + "learning_rate": 1.0612823237690402e-05, + "loss": 3.611, + "step": 5300 + }, + { + "epoch": 1.4107332624867164, + "grad_norm": 1230487.75, + "learning_rate": 1.0595111583421892e-05, + "loss": 3.0602, + "step": 5310 + }, + { + "epoch": 1.4133900106269925, + "grad_norm": 29549574.0, + "learning_rate": 1.0577399929153385e-05, + "loss": 3.6129, + "step": 5320 + }, + { + "epoch": 1.416046758767269, + "grad_norm": 65607896.0, + "learning_rate": 1.0559688274884875e-05, + "loss": 3.305, + "step": 5330 + }, + { + "epoch": 1.418703506907545, + "grad_norm": 21593944.0, + "learning_rate": 1.0541976620616366e-05, + "loss": 4.182, + "step": 5340 + }, + { + "epoch": 1.4213602550478215, + "grad_norm": 9913192.0, + "learning_rate": 1.0524264966347856e-05, + "loss": 3.333, + "step": 5350 + }, + { + "epoch": 1.4240170031880979, + "grad_norm": 5600408.5, + "learning_rate": 1.0506553312079349e-05, + "loss": 3.2001, + "step": 5360 + }, + { + "epoch": 1.426673751328374, + "grad_norm": 4921900.0, + "learning_rate": 1.048884165781084e-05, + "loss": 3.8381, + "step": 5370 + }, + { + "epoch": 1.4293304994686504, + "grad_norm": 22669404.0, + "learning_rate": 1.0471130003542332e-05, + "loss": 3.438, + "step": 5380 + }, + { + "epoch": 1.4319872476089266, + "grad_norm": 11211402.0, + "learning_rate": 1.0453418349273822e-05, + "loss": 3.3608, + "step": 5390 + }, + { + "epoch": 1.434643995749203, + "grad_norm": 10033162.0, + "learning_rate": 1.0435706695005314e-05, + "loss": 3.2148, + "step": 5400 + }, + { + "epoch": 1.4373007438894794, + "grad_norm": 34627448.0, + "learning_rate": 1.0417995040736805e-05, + "loss": 3.3408, + "step": 5410 + }, + { + "epoch": 1.4399574920297555, + "grad_norm": 19163360.0, + "learning_rate": 1.0400283386468297e-05, + "loss": 3.0767, + "step": 5420 + }, + { + "epoch": 1.442614240170032, + "grad_norm": 11876396.0, + "learning_rate": 1.038257173219979e-05, + "loss": 3.8624, + "step": 5430 + }, + { + "epoch": 1.445270988310308, + "grad_norm": 6485251.5, + "learning_rate": 1.036486007793128e-05, + "loss": 3.4212, + "step": 5440 + }, + { + "epoch": 1.4479277364505845, + "grad_norm": 2855033.5, + "learning_rate": 1.0347148423662772e-05, + "loss": 3.5543, + "step": 5450 + }, + { + "epoch": 1.4505844845908609, + "grad_norm": 39419356.0, + "learning_rate": 1.0329436769394263e-05, + "loss": 3.6357, + "step": 5460 + }, + { + "epoch": 1.453241232731137, + "grad_norm": 8782708.0, + "learning_rate": 1.0311725115125755e-05, + "loss": 3.7995, + "step": 5470 + }, + { + "epoch": 1.4558979808714134, + "grad_norm": 32046924.0, + "learning_rate": 1.0294013460857244e-05, + "loss": 3.2472, + "step": 5480 + }, + { + "epoch": 1.4585547290116896, + "grad_norm": 30402538.0, + "learning_rate": 1.0276301806588735e-05, + "loss": 3.1715, + "step": 5490 + }, + { + "epoch": 1.461211477151966, + "grad_norm": 19326186.0, + "learning_rate": 1.0258590152320227e-05, + "loss": 3.9161, + "step": 5500 + }, + { + "epoch": 1.4638682252922424, + "grad_norm": 9990077.0, + "learning_rate": 1.024087849805172e-05, + "loss": 3.849, + "step": 5510 + }, + { + "epoch": 1.4665249734325185, + "grad_norm": 29835254.0, + "learning_rate": 1.022316684378321e-05, + "loss": 3.331, + "step": 5520 + }, + { + "epoch": 1.469181721572795, + "grad_norm": 84350656.0, + "learning_rate": 1.0205455189514702e-05, + "loss": 3.3592, + "step": 5530 + }, + { + "epoch": 1.471838469713071, + "grad_norm": 5173333.5, + "learning_rate": 1.0187743535246193e-05, + "loss": 3.3015, + "step": 5540 + }, + { + "epoch": 1.4744952178533475, + "grad_norm": 3443425.5, + "learning_rate": 1.0170031880977685e-05, + "loss": 3.5236, + "step": 5550 + }, + { + "epoch": 1.4771519659936239, + "grad_norm": 2188022.75, + "learning_rate": 1.0152320226709175e-05, + "loss": 3.5614, + "step": 5560 + }, + { + "epoch": 1.4798087141339, + "grad_norm": 16931794.0, + "learning_rate": 1.0134608572440668e-05, + "loss": 3.6685, + "step": 5570 + }, + { + "epoch": 1.4824654622741764, + "grad_norm": 10456564.0, + "learning_rate": 1.0116896918172158e-05, + "loss": 3.4864, + "step": 5580 + }, + { + "epoch": 1.4851222104144526, + "grad_norm": 27239420.0, + "learning_rate": 1.009918526390365e-05, + "loss": 3.5637, + "step": 5590 + }, + { + "epoch": 1.487778958554729, + "grad_norm": 16616771.0, + "learning_rate": 1.0081473609635141e-05, + "loss": 3.6085, + "step": 5600 + }, + { + "epoch": 1.4904357066950054, + "grad_norm": 10221569.0, + "learning_rate": 1.0063761955366632e-05, + "loss": 3.5812, + "step": 5610 + }, + { + "epoch": 1.4930924548352817, + "grad_norm": 1452260.75, + "learning_rate": 1.0046050301098122e-05, + "loss": 3.9326, + "step": 5620 + }, + { + "epoch": 1.495749202975558, + "grad_norm": 3546143.0, + "learning_rate": 1.0028338646829615e-05, + "loss": 3.2541, + "step": 5630 + }, + { + "epoch": 1.4984059511158343, + "grad_norm": 12791246.0, + "learning_rate": 1.0010626992561105e-05, + "loss": 3.4152, + "step": 5640 + }, + { + "epoch": 1.5010626992561105, + "grad_norm": 12529229.0, + "learning_rate": 9.992915338292597e-06, + "loss": 3.0508, + "step": 5650 + }, + { + "epoch": 1.5037194473963869, + "grad_norm": 9755405.0, + "learning_rate": 9.975203684024088e-06, + "loss": 3.5064, + "step": 5660 + }, + { + "epoch": 1.5063761955366632, + "grad_norm": 6901898.0, + "learning_rate": 9.95749202975558e-06, + "loss": 3.6654, + "step": 5670 + }, + { + "epoch": 1.5090329436769394, + "grad_norm": 9542270.0, + "learning_rate": 9.93978037548707e-06, + "loss": 3.3481, + "step": 5680 + }, + { + "epoch": 1.5116896918172156, + "grad_norm": 14570059.0, + "learning_rate": 9.922068721218563e-06, + "loss": 3.6342, + "step": 5690 + }, + { + "epoch": 1.514346439957492, + "grad_norm": 130252984.0, + "learning_rate": 9.904357066950054e-06, + "loss": 3.3275, + "step": 5700 + }, + { + "epoch": 1.5170031880977684, + "grad_norm": 12491921.0, + "learning_rate": 9.886645412681544e-06, + "loss": 3.1862, + "step": 5710 + }, + { + "epoch": 1.5196599362380447, + "grad_norm": 171955248.0, + "learning_rate": 9.868933758413036e-06, + "loss": 3.6, + "step": 5720 + }, + { + "epoch": 1.522316684378321, + "grad_norm": 67972536.0, + "learning_rate": 9.851222104144527e-06, + "loss": 3.5839, + "step": 5730 + }, + { + "epoch": 1.524973432518597, + "grad_norm": 19312536.0, + "learning_rate": 9.83351044987602e-06, + "loss": 3.3906, + "step": 5740 + }, + { + "epoch": 1.5276301806588735, + "grad_norm": 39636108.0, + "learning_rate": 9.81579879560751e-06, + "loss": 3.5388, + "step": 5750 + }, + { + "epoch": 1.5302869287991498, + "grad_norm": 54133548.0, + "learning_rate": 9.798087141339002e-06, + "loss": 3.2938, + "step": 5760 + }, + { + "epoch": 1.5329436769394262, + "grad_norm": 28021788.0, + "learning_rate": 9.780375487070494e-06, + "loss": 3.565, + "step": 5770 + }, + { + "epoch": 1.5356004250797024, + "grad_norm": 12500334.0, + "learning_rate": 9.762663832801983e-06, + "loss": 3.4099, + "step": 5780 + }, + { + "epoch": 1.5382571732199788, + "grad_norm": 20677724.0, + "learning_rate": 9.744952178533476e-06, + "loss": 3.8265, + "step": 5790 + }, + { + "epoch": 1.540913921360255, + "grad_norm": 25849000.0, + "learning_rate": 9.727240524264968e-06, + "loss": 3.5107, + "step": 5800 + }, + { + "epoch": 1.5435706695005313, + "grad_norm": 7106916.0, + "learning_rate": 9.709528869996458e-06, + "loss": 3.7538, + "step": 5810 + }, + { + "epoch": 1.5462274176408077, + "grad_norm": 78143128.0, + "learning_rate": 9.69181721572795e-06, + "loss": 3.8139, + "step": 5820 + }, + { + "epoch": 1.548884165781084, + "grad_norm": 124880632.0, + "learning_rate": 9.674105561459441e-06, + "loss": 3.4966, + "step": 5830 + }, + { + "epoch": 1.5515409139213603, + "grad_norm": 16674735.0, + "learning_rate": 9.656393907190934e-06, + "loss": 3.7779, + "step": 5840 + }, + { + "epoch": 1.5541976620616365, + "grad_norm": 36204444.0, + "learning_rate": 9.638682252922424e-06, + "loss": 3.5086, + "step": 5850 + }, + { + "epoch": 1.5568544102019128, + "grad_norm": 7019197.5, + "learning_rate": 9.620970598653915e-06, + "loss": 3.3062, + "step": 5860 + }, + { + "epoch": 1.5595111583421892, + "grad_norm": 14028569.0, + "learning_rate": 9.603258944385407e-06, + "loss": 3.4862, + "step": 5870 + }, + { + "epoch": 1.5621679064824656, + "grad_norm": 24143218.0, + "learning_rate": 9.585547290116898e-06, + "loss": 3.388, + "step": 5880 + }, + { + "epoch": 1.5648246546227418, + "grad_norm": 8635328.0, + "learning_rate": 9.56783563584839e-06, + "loss": 3.9959, + "step": 5890 + }, + { + "epoch": 1.567481402763018, + "grad_norm": 14461347.0, + "learning_rate": 9.55012398157988e-06, + "loss": 3.3619, + "step": 5900 + }, + { + "epoch": 1.5701381509032943, + "grad_norm": 45164232.0, + "learning_rate": 9.532412327311371e-06, + "loss": 3.7565, + "step": 5910 + }, + { + "epoch": 1.5727948990435707, + "grad_norm": 43768708.0, + "learning_rate": 9.514700673042863e-06, + "loss": 3.2873, + "step": 5920 + }, + { + "epoch": 1.5754516471838471, + "grad_norm": 102944216.0, + "learning_rate": 9.496989018774354e-06, + "loss": 3.5849, + "step": 5930 + }, + { + "epoch": 1.5781083953241233, + "grad_norm": 8864102.0, + "learning_rate": 9.479277364505846e-06, + "loss": 3.3615, + "step": 5940 + }, + { + "epoch": 1.5807651434643994, + "grad_norm": 17926040.0, + "learning_rate": 9.461565710237337e-06, + "loss": 3.3599, + "step": 5950 + }, + { + "epoch": 1.5834218916046758, + "grad_norm": 563806208.0, + "learning_rate": 9.443854055968829e-06, + "loss": 3.6726, + "step": 5960 + }, + { + "epoch": 1.5860786397449522, + "grad_norm": 4375813.5, + "learning_rate": 9.42614240170032e-06, + "loss": 3.5982, + "step": 5970 + }, + { + "epoch": 1.5887353878852286, + "grad_norm": 23817932.0, + "learning_rate": 9.40843074743181e-06, + "loss": 3.6873, + "step": 5980 + }, + { + "epoch": 1.5913921360255048, + "grad_norm": 3588041.25, + "learning_rate": 9.390719093163302e-06, + "loss": 3.8219, + "step": 5990 + }, + { + "epoch": 1.594048884165781, + "grad_norm": 97096224.0, + "learning_rate": 9.373007438894793e-06, + "loss": 3.5905, + "step": 6000 + }, + { + "epoch": 1.5967056323060573, + "grad_norm": 4066724.0, + "learning_rate": 9.355295784626285e-06, + "loss": 3.5762, + "step": 6010 + }, + { + "epoch": 1.5993623804463337, + "grad_norm": 44529008.0, + "learning_rate": 9.337584130357776e-06, + "loss": 3.821, + "step": 6020 + }, + { + "epoch": 1.60201912858661, + "grad_norm": 10141793.0, + "learning_rate": 9.319872476089268e-06, + "loss": 3.4989, + "step": 6030 + }, + { + "epoch": 1.6046758767268863, + "grad_norm": 22102744.0, + "learning_rate": 9.302160821820759e-06, + "loss": 3.4363, + "step": 6040 + }, + { + "epoch": 1.6073326248671624, + "grad_norm": 1421525.375, + "learning_rate": 9.284449167552249e-06, + "loss": 3.3543, + "step": 6050 + }, + { + "epoch": 1.6099893730074388, + "grad_norm": 17624050.0, + "learning_rate": 9.266737513283741e-06, + "loss": 3.5835, + "step": 6060 + }, + { + "epoch": 1.6126461211477152, + "grad_norm": 2787807.5, + "learning_rate": 9.249025859015232e-06, + "loss": 3.7715, + "step": 6070 + }, + { + "epoch": 1.6153028692879916, + "grad_norm": 36419916.0, + "learning_rate": 9.231314204746724e-06, + "loss": 3.2874, + "step": 6080 + }, + { + "epoch": 1.6179596174282678, + "grad_norm": 550304.0, + "learning_rate": 9.213602550478215e-06, + "loss": 3.775, + "step": 6090 + }, + { + "epoch": 1.620616365568544, + "grad_norm": 13110638.0, + "learning_rate": 9.195890896209707e-06, + "loss": 4.0895, + "step": 6100 + }, + { + "epoch": 1.6232731137088203, + "grad_norm": 153279.40625, + "learning_rate": 9.1781792419412e-06, + "loss": 3.5868, + "step": 6110 + }, + { + "epoch": 1.6259298618490967, + "grad_norm": 274644.03125, + "learning_rate": 9.160467587672688e-06, + "loss": 3.4759, + "step": 6120 + }, + { + "epoch": 1.628586609989373, + "grad_norm": 21545.19921875, + "learning_rate": 9.14275593340418e-06, + "loss": 4.0524, + "step": 6130 + }, + { + "epoch": 1.6312433581296493, + "grad_norm": 27863.1015625, + "learning_rate": 9.125044279135673e-06, + "loss": 3.4133, + "step": 6140 + }, + { + "epoch": 1.6339001062699257, + "grad_norm": 146765.640625, + "learning_rate": 9.107332624867163e-06, + "loss": 3.6765, + "step": 6150 + }, + { + "epoch": 1.6365568544102018, + "grad_norm": 60709.375, + "learning_rate": 9.089620970598656e-06, + "loss": 3.8558, + "step": 6160 + }, + { + "epoch": 1.6392136025504782, + "grad_norm": 290704.21875, + "learning_rate": 9.071909316330146e-06, + "loss": 3.3615, + "step": 6170 + }, + { + "epoch": 1.6418703506907546, + "grad_norm": 198007.828125, + "learning_rate": 9.054197662061637e-06, + "loss": 3.6759, + "step": 6180 + }, + { + "epoch": 1.6445270988310308, + "grad_norm": 30211.29296875, + "learning_rate": 9.036486007793129e-06, + "loss": 4.1618, + "step": 6190 + }, + { + "epoch": 1.6471838469713072, + "grad_norm": 697217.3125, + "learning_rate": 9.01877435352462e-06, + "loss": 3.5873, + "step": 6200 + }, + { + "epoch": 1.6498405951115833, + "grad_norm": 311260.34375, + "learning_rate": 9.001062699256112e-06, + "loss": 4.0309, + "step": 6210 + }, + { + "epoch": 1.6524973432518597, + "grad_norm": 7285945.0, + "learning_rate": 8.983351044987602e-06, + "loss": 3.7024, + "step": 6220 + }, + { + "epoch": 1.655154091392136, + "grad_norm": 238075.265625, + "learning_rate": 8.965639390719095e-06, + "loss": 3.7081, + "step": 6230 + }, + { + "epoch": 1.6578108395324125, + "grad_norm": 104777.8828125, + "learning_rate": 8.947927736450585e-06, + "loss": 3.6374, + "step": 6240 + }, + { + "epoch": 1.6604675876726886, + "grad_norm": 45899.98828125, + "learning_rate": 8.930216082182076e-06, + "loss": 3.7753, + "step": 6250 + }, + { + "epoch": 1.6631243358129648, + "grad_norm": 4903258.0, + "learning_rate": 8.912504427913568e-06, + "loss": 3.7641, + "step": 6260 + }, + { + "epoch": 1.6657810839532412, + "grad_norm": 691504.875, + "learning_rate": 8.894792773645059e-06, + "loss": 3.012, + "step": 6270 + }, + { + "epoch": 1.6684378320935176, + "grad_norm": 7211197.0, + "learning_rate": 8.877081119376551e-06, + "loss": 3.278, + "step": 6280 + }, + { + "epoch": 1.671094580233794, + "grad_norm": 55386.39453125, + "learning_rate": 8.859369465108042e-06, + "loss": 3.5972, + "step": 6290 + }, + { + "epoch": 1.6737513283740701, + "grad_norm": 4803297.5, + "learning_rate": 8.841657810839534e-06, + "loss": 3.5168, + "step": 6300 + }, + { + "epoch": 1.6764080765143463, + "grad_norm": 153394.5625, + "learning_rate": 8.823946156571024e-06, + "loss": 3.4884, + "step": 6310 + }, + { + "epoch": 1.6790648246546227, + "grad_norm": 105014.6796875, + "learning_rate": 8.806234502302515e-06, + "loss": 3.5724, + "step": 6320 + }, + { + "epoch": 1.681721572794899, + "grad_norm": 425531.6875, + "learning_rate": 8.788522848034007e-06, + "loss": 3.7171, + "step": 6330 + }, + { + "epoch": 1.6843783209351755, + "grad_norm": 881638.625, + "learning_rate": 8.770811193765498e-06, + "loss": 3.5689, + "step": 6340 + }, + { + "epoch": 1.6870350690754516, + "grad_norm": 506417.84375, + "learning_rate": 8.75309953949699e-06, + "loss": 3.3471, + "step": 6350 + }, + { + "epoch": 1.6896918172157278, + "grad_norm": 218658.8125, + "learning_rate": 8.73538788522848e-06, + "loss": 3.0762, + "step": 6360 + }, + { + "epoch": 1.6923485653560042, + "grad_norm": 3747502.5, + "learning_rate": 8.717676230959973e-06, + "loss": 3.7819, + "step": 6370 + }, + { + "epoch": 1.6950053134962806, + "grad_norm": 402977.15625, + "learning_rate": 8.699964576691463e-06, + "loss": 3.2238, + "step": 6380 + }, + { + "epoch": 1.697662061636557, + "grad_norm": 354610.0, + "learning_rate": 8.682252922422954e-06, + "loss": 3.5365, + "step": 6390 + }, + { + "epoch": 1.7003188097768331, + "grad_norm": 737137.25, + "learning_rate": 8.664541268154446e-06, + "loss": 3.7334, + "step": 6400 + }, + { + "epoch": 1.7029755579171093, + "grad_norm": 270020.3125, + "learning_rate": 8.646829613885937e-06, + "loss": 3.6183, + "step": 6410 + }, + { + "epoch": 1.7056323060573857, + "grad_norm": 740626.4375, + "learning_rate": 8.629117959617429e-06, + "loss": 3.7487, + "step": 6420 + }, + { + "epoch": 1.708289054197662, + "grad_norm": 1305229.75, + "learning_rate": 8.61140630534892e-06, + "loss": 3.7039, + "step": 6430 + }, + { + "epoch": 1.7109458023379385, + "grad_norm": 172010.875, + "learning_rate": 8.593694651080412e-06, + "loss": 2.9064, + "step": 6440 + }, + { + "epoch": 1.7136025504782146, + "grad_norm": 36386.55859375, + "learning_rate": 8.575982996811903e-06, + "loss": 3.5462, + "step": 6450 + }, + { + "epoch": 1.7162592986184908, + "grad_norm": 280424.5, + "learning_rate": 8.558271342543393e-06, + "loss": 3.7119, + "step": 6460 + }, + { + "epoch": 1.7189160467587672, + "grad_norm": 65134.73828125, + "learning_rate": 8.540559688274885e-06, + "loss": 4.058, + "step": 6470 + }, + { + "epoch": 1.7215727948990436, + "grad_norm": 66937.53125, + "learning_rate": 8.522848034006378e-06, + "loss": 3.3975, + "step": 6480 + }, + { + "epoch": 1.72422954303932, + "grad_norm": 131224.421875, + "learning_rate": 8.505136379737868e-06, + "loss": 3.4813, + "step": 6490 + }, + { + "epoch": 1.7268862911795961, + "grad_norm": 108172.1640625, + "learning_rate": 8.48742472546936e-06, + "loss": 3.1716, + "step": 6500 + }, + { + "epoch": 1.7295430393198725, + "grad_norm": 25198.029296875, + "learning_rate": 8.469713071200851e-06, + "loss": 3.6849, + "step": 6510 + }, + { + "epoch": 1.7321997874601487, + "grad_norm": 61498.03515625, + "learning_rate": 8.452001416932342e-06, + "loss": 3.4036, + "step": 6520 + }, + { + "epoch": 1.734856535600425, + "grad_norm": 442683.875, + "learning_rate": 8.434289762663834e-06, + "loss": 3.3497, + "step": 6530 + }, + { + "epoch": 1.7375132837407015, + "grad_norm": 27654.84765625, + "learning_rate": 8.416578108395324e-06, + "loss": 3.2324, + "step": 6540 + }, + { + "epoch": 1.7401700318809776, + "grad_norm": 87875.5546875, + "learning_rate": 8.398866454126817e-06, + "loss": 3.211, + "step": 6550 + }, + { + "epoch": 1.742826780021254, + "grad_norm": 443493.65625, + "learning_rate": 8.381154799858307e-06, + "loss": 3.5746, + "step": 6560 + }, + { + "epoch": 1.7454835281615302, + "grad_norm": 112091.3046875, + "learning_rate": 8.3634431455898e-06, + "loss": 3.2604, + "step": 6570 + }, + { + "epoch": 1.7481402763018066, + "grad_norm": 37516.62109375, + "learning_rate": 8.34573149132129e-06, + "loss": 3.3058, + "step": 6580 + }, + { + "epoch": 1.750797024442083, + "grad_norm": 98792.796875, + "learning_rate": 8.32801983705278e-06, + "loss": 3.4504, + "step": 6590 + }, + { + "epoch": 1.7534537725823593, + "grad_norm": 24296.8125, + "learning_rate": 8.310308182784273e-06, + "loss": 3.2476, + "step": 6600 + }, + { + "epoch": 1.7561105207226355, + "grad_norm": 27490.43359375, + "learning_rate": 8.292596528515764e-06, + "loss": 3.4551, + "step": 6610 + }, + { + "epoch": 1.7587672688629117, + "grad_norm": 163381.75, + "learning_rate": 8.274884874247256e-06, + "loss": 3.56, + "step": 6620 + }, + { + "epoch": 1.761424017003188, + "grad_norm": 5022.00244140625, + "learning_rate": 8.257173219978746e-06, + "loss": 3.2829, + "step": 6630 + }, + { + "epoch": 1.7640807651434645, + "grad_norm": 873426.5, + "learning_rate": 8.239461565710239e-06, + "loss": 3.292, + "step": 6640 + }, + { + "epoch": 1.7667375132837408, + "grad_norm": 48760.75390625, + "learning_rate": 8.22174991144173e-06, + "loss": 3.3971, + "step": 6650 + }, + { + "epoch": 1.769394261424017, + "grad_norm": 22562.328125, + "learning_rate": 8.20403825717322e-06, + "loss": 3.4901, + "step": 6660 + }, + { + "epoch": 1.7720510095642932, + "grad_norm": 110952.984375, + "learning_rate": 8.186326602904712e-06, + "loss": 3.5824, + "step": 6670 + }, + { + "epoch": 1.7747077577045696, + "grad_norm": 11664.615234375, + "learning_rate": 8.168614948636203e-06, + "loss": 3.6433, + "step": 6680 + }, + { + "epoch": 1.777364505844846, + "grad_norm": 296820.28125, + "learning_rate": 8.150903294367695e-06, + "loss": 3.4816, + "step": 6690 + }, + { + "epoch": 1.7800212539851223, + "grad_norm": 28750.556640625, + "learning_rate": 8.133191640099186e-06, + "loss": 3.4851, + "step": 6700 + }, + { + "epoch": 1.7826780021253985, + "grad_norm": 86309.7890625, + "learning_rate": 8.115479985830678e-06, + "loss": 3.3058, + "step": 6710 + }, + { + "epoch": 1.7853347502656747, + "grad_norm": 91584.7734375, + "learning_rate": 8.097768331562168e-06, + "loss": 3.7495, + "step": 6720 + }, + { + "epoch": 1.787991498405951, + "grad_norm": 132450.96875, + "learning_rate": 8.080056677293659e-06, + "loss": 3.4955, + "step": 6730 + }, + { + "epoch": 1.7906482465462275, + "grad_norm": 134387.046875, + "learning_rate": 8.062345023025151e-06, + "loss": 3.4655, + "step": 6740 + }, + { + "epoch": 1.7933049946865038, + "grad_norm": 74426.6875, + "learning_rate": 8.044633368756642e-06, + "loss": 3.7594, + "step": 6750 + }, + { + "epoch": 1.79596174282678, + "grad_norm": 58667.3984375, + "learning_rate": 8.026921714488134e-06, + "loss": 3.7655, + "step": 6760 + }, + { + "epoch": 1.7986184909670562, + "grad_norm": 130389.9140625, + "learning_rate": 8.009210060219625e-06, + "loss": 3.673, + "step": 6770 + }, + { + "epoch": 1.8012752391073326, + "grad_norm": 89147.9296875, + "learning_rate": 7.991498405951117e-06, + "loss": 3.2874, + "step": 6780 + }, + { + "epoch": 1.803931987247609, + "grad_norm": 44793.80859375, + "learning_rate": 7.973786751682607e-06, + "loss": 3.2517, + "step": 6790 + }, + { + "epoch": 1.8065887353878853, + "grad_norm": 15245.392578125, + "learning_rate": 7.956075097414098e-06, + "loss": 3.5179, + "step": 6800 + }, + { + "epoch": 1.8092454835281615, + "grad_norm": 15995.4912109375, + "learning_rate": 7.93836344314559e-06, + "loss": 3.6515, + "step": 6810 + }, + { + "epoch": 1.8119022316684377, + "grad_norm": 16524.787109375, + "learning_rate": 7.920651788877083e-06, + "loss": 3.1618, + "step": 6820 + }, + { + "epoch": 1.814558979808714, + "grad_norm": 42409.20703125, + "learning_rate": 7.902940134608573e-06, + "loss": 3.58, + "step": 6830 + }, + { + "epoch": 1.8172157279489904, + "grad_norm": 10542.6796875, + "learning_rate": 7.885228480340065e-06, + "loss": 3.508, + "step": 6840 + }, + { + "epoch": 1.8198724760892668, + "grad_norm": 25151.1484375, + "learning_rate": 7.867516826071556e-06, + "loss": 3.0635, + "step": 6850 + }, + { + "epoch": 1.822529224229543, + "grad_norm": 9499.1826171875, + "learning_rate": 7.849805171803047e-06, + "loss": 2.9901, + "step": 6860 + }, + { + "epoch": 1.8251859723698194, + "grad_norm": 54946.984375, + "learning_rate": 7.832093517534539e-06, + "loss": 2.9531, + "step": 6870 + }, + { + "epoch": 1.8278427205100956, + "grad_norm": 10790.599609375, + "learning_rate": 7.81438186326603e-06, + "loss": 3.5882, + "step": 6880 + }, + { + "epoch": 1.830499468650372, + "grad_norm": 13575.8759765625, + "learning_rate": 7.796670208997522e-06, + "loss": 3.3612, + "step": 6890 + }, + { + "epoch": 1.8331562167906483, + "grad_norm": 20945.48046875, + "learning_rate": 7.778958554729012e-06, + "loss": 3.0764, + "step": 6900 + }, + { + "epoch": 1.8358129649309245, + "grad_norm": 232869.03125, + "learning_rate": 7.761246900460504e-06, + "loss": 3.1716, + "step": 6910 + }, + { + "epoch": 1.8384697130712009, + "grad_norm": 43791.59765625, + "learning_rate": 7.743535246191995e-06, + "loss": 3.2311, + "step": 6920 + }, + { + "epoch": 1.841126461211477, + "grad_norm": 22579.091796875, + "learning_rate": 7.725823591923486e-06, + "loss": 3.4563, + "step": 6930 + }, + { + "epoch": 1.8437832093517534, + "grad_norm": 28530.806640625, + "learning_rate": 7.708111937654978e-06, + "loss": 3.452, + "step": 6940 + }, + { + "epoch": 1.8464399574920298, + "grad_norm": 12486.0390625, + "learning_rate": 7.690400283386468e-06, + "loss": 3.2791, + "step": 6950 + }, + { + "epoch": 1.8490967056323062, + "grad_norm": 17018.11328125, + "learning_rate": 7.67268862911796e-06, + "loss": 3.6792, + "step": 6960 + }, + { + "epoch": 1.8517534537725824, + "grad_norm": 16199.2470703125, + "learning_rate": 7.654976974849451e-06, + "loss": 3.2561, + "step": 6970 + }, + { + "epoch": 1.8544102019128585, + "grad_norm": 10388.2470703125, + "learning_rate": 7.637265320580944e-06, + "loss": 3.0233, + "step": 6980 + }, + { + "epoch": 1.857066950053135, + "grad_norm": 15407.7548828125, + "learning_rate": 7.619553666312433e-06, + "loss": 3.167, + "step": 6990 + }, + { + "epoch": 1.8597236981934113, + "grad_norm": 26815.095703125, + "learning_rate": 7.601842012043925e-06, + "loss": 3.2972, + "step": 7000 + }, + { + "epoch": 1.8623804463336877, + "grad_norm": 58698.21875, + "learning_rate": 7.584130357775417e-06, + "loss": 3.2334, + "step": 7010 + }, + { + "epoch": 1.8650371944739639, + "grad_norm": 27274.71875, + "learning_rate": 7.566418703506908e-06, + "loss": 3.1432, + "step": 7020 + }, + { + "epoch": 1.86769394261424, + "grad_norm": 83316.0703125, + "learning_rate": 7.5487070492384e-06, + "loss": 3.1284, + "step": 7030 + }, + { + "epoch": 1.8703506907545164, + "grad_norm": 30122.771484375, + "learning_rate": 7.530995394969891e-06, + "loss": 2.8904, + "step": 7040 + }, + { + "epoch": 1.8730074388947928, + "grad_norm": 40200.9609375, + "learning_rate": 7.513283740701383e-06, + "loss": 3.3255, + "step": 7050 + }, + { + "epoch": 1.8756641870350692, + "grad_norm": 16342.447265625, + "learning_rate": 7.495572086432873e-06, + "loss": 3.1073, + "step": 7060 + }, + { + "epoch": 1.8783209351753454, + "grad_norm": 14423.703125, + "learning_rate": 7.477860432164365e-06, + "loss": 3.4831, + "step": 7070 + }, + { + "epoch": 1.8809776833156215, + "grad_norm": 34366.14453125, + "learning_rate": 7.460148777895856e-06, + "loss": 3.2063, + "step": 7080 + }, + { + "epoch": 1.883634431455898, + "grad_norm": 70803.8359375, + "learning_rate": 7.4424371236273475e-06, + "loss": 3.5181, + "step": 7090 + }, + { + "epoch": 1.8862911795961743, + "grad_norm": 13800.69140625, + "learning_rate": 7.424725469358839e-06, + "loss": 3.4993, + "step": 7100 + }, + { + "epoch": 1.8889479277364507, + "grad_norm": 48057.68359375, + "learning_rate": 7.40701381509033e-06, + "loss": 3.1418, + "step": 7110 + }, + { + "epoch": 1.8916046758767269, + "grad_norm": 40145.4921875, + "learning_rate": 7.389302160821822e-06, + "loss": 3.3427, + "step": 7120 + }, + { + "epoch": 1.894261424017003, + "grad_norm": 13148.1484375, + "learning_rate": 7.371590506553312e-06, + "loss": 3.2584, + "step": 7130 + }, + { + "epoch": 1.8969181721572794, + "grad_norm": 10740.6826171875, + "learning_rate": 7.353878852284804e-06, + "loss": 2.8656, + "step": 7140 + }, + { + "epoch": 1.8995749202975558, + "grad_norm": 7270.3818359375, + "learning_rate": 7.336167198016295e-06, + "loss": 3.1929, + "step": 7150 + }, + { + "epoch": 1.9022316684378322, + "grad_norm": 3250.9072265625, + "learning_rate": 7.318455543747787e-06, + "loss": 3.3218, + "step": 7160 + }, + { + "epoch": 1.9048884165781084, + "grad_norm": 40904.6484375, + "learning_rate": 7.300743889479278e-06, + "loss": 3.0994, + "step": 7170 + }, + { + "epoch": 1.9075451647183845, + "grad_norm": 9426.0009765625, + "learning_rate": 7.2830322352107695e-06, + "loss": 3.2861, + "step": 7180 + }, + { + "epoch": 1.910201912858661, + "grad_norm": 10107.427734375, + "learning_rate": 7.265320580942261e-06, + "loss": 3.3694, + "step": 7190 + }, + { + "epoch": 1.9128586609989373, + "grad_norm": 25632.7734375, + "learning_rate": 7.2476089266737514e-06, + "loss": 3.1918, + "step": 7200 + }, + { + "epoch": 1.9155154091392137, + "grad_norm": 10823.2509765625, + "learning_rate": 7.229897272405243e-06, + "loss": 3.1984, + "step": 7210 + }, + { + "epoch": 1.9181721572794899, + "grad_norm": 8237.4482421875, + "learning_rate": 7.212185618136734e-06, + "loss": 2.7874, + "step": 7220 + }, + { + "epoch": 1.9208289054197663, + "grad_norm": 4823.09716796875, + "learning_rate": 7.194473963868226e-06, + "loss": 3.2625, + "step": 7230 + }, + { + "epoch": 1.9234856535600424, + "grad_norm": 6276.54150390625, + "learning_rate": 7.176762309599717e-06, + "loss": 3.1739, + "step": 7240 + }, + { + "epoch": 1.9261424017003188, + "grad_norm": 9979.935546875, + "learning_rate": 7.1590506553312085e-06, + "loss": 3.34, + "step": 7250 + }, + { + "epoch": 1.9287991498405952, + "grad_norm": 3373.656982421875, + "learning_rate": 7.141339001062701e-06, + "loss": 3.4366, + "step": 7260 + }, + { + "epoch": 1.9314558979808714, + "grad_norm": 9178.9404296875, + "learning_rate": 7.1236273467941905e-06, + "loss": 3.245, + "step": 7270 + }, + { + "epoch": 1.9341126461211477, + "grad_norm": 11173.3037109375, + "learning_rate": 7.105915692525682e-06, + "loss": 3.1306, + "step": 7280 + }, + { + "epoch": 1.936769394261424, + "grad_norm": 6969.20849609375, + "learning_rate": 7.088204038257173e-06, + "loss": 3.5482, + "step": 7290 + }, + { + "epoch": 1.9394261424017003, + "grad_norm": 22079.796875, + "learning_rate": 7.070492383988665e-06, + "loss": 3.2338, + "step": 7300 + }, + { + "epoch": 1.9420828905419767, + "grad_norm": 51803.05078125, + "learning_rate": 7.052780729720157e-06, + "loss": 3.1844, + "step": 7310 + }, + { + "epoch": 1.944739638682253, + "grad_norm": 17502.84375, + "learning_rate": 7.0350690754516485e-06, + "loss": 3.3796, + "step": 7320 + }, + { + "epoch": 1.9473963868225292, + "grad_norm": 4275.10009765625, + "learning_rate": 7.017357421183138e-06, + "loss": 3.0306, + "step": 7330 + }, + { + "epoch": 1.9500531349628054, + "grad_norm": 3620.85400390625, + "learning_rate": 6.99964576691463e-06, + "loss": 3.3635, + "step": 7340 + }, + { + "epoch": 1.9527098831030818, + "grad_norm": 32547.673828125, + "learning_rate": 6.981934112646122e-06, + "loss": 3.1764, + "step": 7350 + }, + { + "epoch": 1.9553666312433582, + "grad_norm": 5065.5751953125, + "learning_rate": 6.964222458377613e-06, + "loss": 3.1895, + "step": 7360 + }, + { + "epoch": 1.9580233793836346, + "grad_norm": 10395.2060546875, + "learning_rate": 6.946510804109105e-06, + "loss": 3.1655, + "step": 7370 + }, + { + "epoch": 1.9606801275239107, + "grad_norm": 4557.41796875, + "learning_rate": 6.928799149840596e-06, + "loss": 3.0581, + "step": 7380 + }, + { + "epoch": 1.963336875664187, + "grad_norm": 38417.4765625, + "learning_rate": 6.911087495572088e-06, + "loss": 3.0893, + "step": 7390 + }, + { + "epoch": 1.9659936238044633, + "grad_norm": 5107.16796875, + "learning_rate": 6.893375841303578e-06, + "loss": 3.4167, + "step": 7400 + }, + { + "epoch": 1.9686503719447397, + "grad_norm": 5035.6201171875, + "learning_rate": 6.87566418703507e-06, + "loss": 3.0664, + "step": 7410 + }, + { + "epoch": 1.971307120085016, + "grad_norm": 12651.587890625, + "learning_rate": 6.857952532766561e-06, + "loss": 3.1299, + "step": 7420 + }, + { + "epoch": 1.9739638682252922, + "grad_norm": 7539.5400390625, + "learning_rate": 6.840240878498052e-06, + "loss": 3.0492, + "step": 7430 + }, + { + "epoch": 1.9766206163655684, + "grad_norm": 5577.158203125, + "learning_rate": 6.822529224229544e-06, + "loss": 3.098, + "step": 7440 + }, + { + "epoch": 1.9792773645058448, + "grad_norm": 41558.4921875, + "learning_rate": 6.804817569961035e-06, + "loss": 3.2933, + "step": 7450 + }, + { + "epoch": 1.9819341126461212, + "grad_norm": 3775.939697265625, + "learning_rate": 6.787105915692527e-06, + "loss": 2.9667, + "step": 7460 + }, + { + "epoch": 1.9845908607863976, + "grad_norm": 30318.9921875, + "learning_rate": 6.769394261424017e-06, + "loss": 3.0666, + "step": 7470 + }, + { + "epoch": 1.9872476089266737, + "grad_norm": 21865.806640625, + "learning_rate": 6.751682607155509e-06, + "loss": 2.9213, + "step": 7480 + }, + { + "epoch": 1.98990435706695, + "grad_norm": 10458.220703125, + "learning_rate": 6.733970952887e-06, + "loss": 3.2567, + "step": 7490 + }, + { + "epoch": 1.9925611052072263, + "grad_norm": 14638.0439453125, + "learning_rate": 6.7162592986184915e-06, + "loss": 3.2132, + "step": 7500 + } + ], + "logging_steps": 10, + "max_steps": 11292, + "num_input_tokens_seen": 0, + "num_train_epochs": 3, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 7836212920320000.0, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +}