diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,10949 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 2.0, + "eval_steps": 500, + "global_step": 15588, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0012830382345393892, + "grad_norm": 5690.571363285278, + "learning_rate": 2.1367521367521368e-08, + "loss": 10.4318, + "step": 10 + }, + { + "epoch": 0.0025660764690787785, + "grad_norm": 6198.292818608199, + "learning_rate": 4.2735042735042736e-08, + "loss": 10.4279, + "step": 20 + }, + { + "epoch": 0.003849114703618168, + "grad_norm": 5799.239804444974, + "learning_rate": 6.410256410256409e-08, + "loss": 10.3211, + "step": 30 + }, + { + "epoch": 0.005132152938157557, + "grad_norm": 5500.573731774309, + "learning_rate": 8.547008547008547e-08, + "loss": 10.0872, + "step": 40 + }, + { + "epoch": 0.006415191172696947, + "grad_norm": 4652.31531202777, + "learning_rate": 1.0683760683760683e-07, + "loss": 9.7203, + "step": 50 + }, + { + "epoch": 0.007698229407236336, + "grad_norm": 3998.9952975347232, + "learning_rate": 1.2820512820512818e-07, + "loss": 9.1423, + "step": 60 + }, + { + "epoch": 0.008981267641775726, + "grad_norm": 7626.878245846253, + "learning_rate": 1.4957264957264955e-07, + "loss": 7.8453, + "step": 70 + }, + { + "epoch": 0.010264305876315114, + "grad_norm": 4082.258709373963, + "learning_rate": 1.7094017094017095e-07, + "loss": 6.4089, + "step": 80 + }, + { + "epoch": 0.011547344110854504, + "grad_norm": 8266.278111919211, + "learning_rate": 1.9230769230769231e-07, + "loss": 4.7115, + "step": 90 + }, + { + "epoch": 0.012830382345393894, + "grad_norm": 7992.364652556176, + "learning_rate": 2.1367521367521365e-07, + "loss": 3.8056, + "step": 100 + }, + { + "epoch": 0.014113420579933282, + "grad_norm": 1897.36380230182, + "learning_rate": 2.3504273504273502e-07, + "loss": 3.2834, + "step": 110 + }, + { + "epoch": 0.015396458814472672, + "grad_norm": 2061.382407575381, + "learning_rate": 2.5641025641025636e-07, + "loss": 2.8549, + "step": 120 + }, + { + "epoch": 0.01667949704901206, + "grad_norm": 4093.418400263578, + "learning_rate": 2.7777777777777776e-07, + "loss": 2.5494, + "step": 130 + }, + { + "epoch": 0.01796253528355145, + "grad_norm": 2185.943591771571, + "learning_rate": 2.991452991452991e-07, + "loss": 2.3261, + "step": 140 + }, + { + "epoch": 0.01924557351809084, + "grad_norm": 1180.495295953422, + "learning_rate": 3.2051282051282055e-07, + "loss": 2.1864, + "step": 150 + }, + { + "epoch": 0.020528611752630228, + "grad_norm": 533.0032421040232, + "learning_rate": 3.418803418803419e-07, + "loss": 2.0556, + "step": 160 + }, + { + "epoch": 0.02181164998716962, + "grad_norm": 1110.4001358395305, + "learning_rate": 3.6324786324786323e-07, + "loss": 1.932, + "step": 170 + }, + { + "epoch": 0.023094688221709007, + "grad_norm": 972.5137840499744, + "learning_rate": 3.8461538461538463e-07, + "loss": 1.8333, + "step": 180 + }, + { + "epoch": 0.024377726456248396, + "grad_norm": 704.8117278736595, + "learning_rate": 4.0598290598290597e-07, + "loss": 1.7051, + "step": 190 + }, + { + "epoch": 0.025660764690787787, + "grad_norm": 1067.1788406954295, + "learning_rate": 4.273504273504273e-07, + "loss": 1.6676, + "step": 200 + }, + { + "epoch": 0.026943802925327175, + "grad_norm": 522.5301191739918, + "learning_rate": 4.487179487179487e-07, + "loss": 1.5553, + "step": 210 + }, + { + "epoch": 0.028226841159866563, + "grad_norm": 971.5405928065815, + "learning_rate": 4.7008547008547005e-07, + "loss": 1.535, + "step": 220 + }, + { + "epoch": 0.02950987939440595, + "grad_norm": 1343.5662628298103, + "learning_rate": 4.914529914529914e-07, + "loss": 1.4258, + "step": 230 + }, + { + "epoch": 0.030792917628945343, + "grad_norm": 649.6956374267381, + "learning_rate": 5.128205128205127e-07, + "loss": 1.4076, + "step": 240 + }, + { + "epoch": 0.03207595586348473, + "grad_norm": 705.6522993473628, + "learning_rate": 5.341880341880341e-07, + "loss": 1.355, + "step": 250 + }, + { + "epoch": 0.03335899409802412, + "grad_norm": 1232.3346051163687, + "learning_rate": 5.555555555555555e-07, + "loss": 1.3473, + "step": 260 + }, + { + "epoch": 0.03464203233256351, + "grad_norm": 1418.391953817704, + "learning_rate": 5.769230769230768e-07, + "loss": 1.3055, + "step": 270 + }, + { + "epoch": 0.0359250705671029, + "grad_norm": 1358.4576839895308, + "learning_rate": 5.982905982905982e-07, + "loss": 1.215, + "step": 280 + }, + { + "epoch": 0.03720810880164229, + "grad_norm": 826.6192399395283, + "learning_rate": 6.196581196581196e-07, + "loss": 1.1923, + "step": 290 + }, + { + "epoch": 0.03849114703618168, + "grad_norm": 362.23869468876075, + "learning_rate": 6.410256410256411e-07, + "loss": 1.1698, + "step": 300 + }, + { + "epoch": 0.03977418527072107, + "grad_norm": 1120.3349027093425, + "learning_rate": 6.623931623931624e-07, + "loss": 1.1683, + "step": 310 + }, + { + "epoch": 0.041057223505260455, + "grad_norm": 843.8492825971533, + "learning_rate": 6.837606837606838e-07, + "loss": 1.0954, + "step": 320 + }, + { + "epoch": 0.042340261739799843, + "grad_norm": 594.7494411203583, + "learning_rate": 7.051282051282052e-07, + "loss": 1.0726, + "step": 330 + }, + { + "epoch": 0.04362329997433924, + "grad_norm": 591.336344162626, + "learning_rate": 7.264957264957265e-07, + "loss": 1.0748, + "step": 340 + }, + { + "epoch": 0.04490633820887863, + "grad_norm": 1666.8083404456775, + "learning_rate": 7.478632478632479e-07, + "loss": 1.0415, + "step": 350 + }, + { + "epoch": 0.046189376443418015, + "grad_norm": 676.6898988182528, + "learning_rate": 7.692307692307693e-07, + "loss": 1.039, + "step": 360 + }, + { + "epoch": 0.0474724146779574, + "grad_norm": 1416.82085043249, + "learning_rate": 7.905982905982905e-07, + "loss": 1.0297, + "step": 370 + }, + { + "epoch": 0.04875545291249679, + "grad_norm": 473.7692885159098, + "learning_rate": 8.119658119658119e-07, + "loss": 1.022, + "step": 380 + }, + { + "epoch": 0.05003849114703618, + "grad_norm": 864.7123644135875, + "learning_rate": 8.333333333333333e-07, + "loss": 0.9667, + "step": 390 + }, + { + "epoch": 0.051321529381575574, + "grad_norm": 670.1729781689538, + "learning_rate": 8.547008547008546e-07, + "loss": 0.982, + "step": 400 + }, + { + "epoch": 0.05260456761611496, + "grad_norm": 412.4049720210751, + "learning_rate": 8.76068376068376e-07, + "loss": 0.8914, + "step": 410 + }, + { + "epoch": 0.05388760585065435, + "grad_norm": 560.4280280696364, + "learning_rate": 8.974358974358974e-07, + "loss": 0.9452, + "step": 420 + }, + { + "epoch": 0.05517064408519374, + "grad_norm": 620.6491888060818, + "learning_rate": 9.188034188034187e-07, + "loss": 0.9638, + "step": 430 + }, + { + "epoch": 0.05645368231973313, + "grad_norm": 778.3104494835807, + "learning_rate": 9.401709401709401e-07, + "loss": 0.9088, + "step": 440 + }, + { + "epoch": 0.057736720554272515, + "grad_norm": 506.7476605214667, + "learning_rate": 9.615384615384615e-07, + "loss": 0.9226, + "step": 450 + }, + { + "epoch": 0.0590197587888119, + "grad_norm": 481.7922080095616, + "learning_rate": 9.829059829059829e-07, + "loss": 0.8994, + "step": 460 + }, + { + "epoch": 0.0603027970233513, + "grad_norm": 679.6251019040599, + "learning_rate": 9.999999568285975e-07, + "loss": 0.8899, + "step": 470 + }, + { + "epoch": 0.061585835257890686, + "grad_norm": 622.2746926008768, + "learning_rate": 9.99998445830296e-07, + "loss": 0.8604, + "step": 480 + }, + { + "epoch": 0.06286887349243007, + "grad_norm": 961.2353778924473, + "learning_rate": 9.99994776269329e-07, + "loss": 0.9078, + "step": 490 + }, + { + "epoch": 0.06415191172696946, + "grad_norm": 509.45563233279427, + "learning_rate": 9.999889481615386e-07, + "loss": 0.79, + "step": 500 + }, + { + "epoch": 0.06543494996150885, + "grad_norm": 573.9378711140391, + "learning_rate": 9.999809615320856e-07, + "loss": 0.808, + "step": 510 + }, + { + "epoch": 0.06671798819604824, + "grad_norm": 400.9208571246829, + "learning_rate": 9.999708164154493e-07, + "loss": 0.8565, + "step": 520 + }, + { + "epoch": 0.06800102643058763, + "grad_norm": 315.16602593085355, + "learning_rate": 9.999585128554275e-07, + "loss": 0.8095, + "step": 530 + }, + { + "epoch": 0.06928406466512702, + "grad_norm": 452.82425616854397, + "learning_rate": 9.999440509051367e-07, + "loss": 0.8435, + "step": 540 + }, + { + "epoch": 0.0705671028996664, + "grad_norm": 728.6644333678116, + "learning_rate": 9.999274306270108e-07, + "loss": 0.7866, + "step": 550 + }, + { + "epoch": 0.0718501411342058, + "grad_norm": 516.902877288436, + "learning_rate": 9.99908652092802e-07, + "loss": 0.7621, + "step": 560 + }, + { + "epoch": 0.0731331793687452, + "grad_norm": 356.28779383259206, + "learning_rate": 9.998877153835797e-07, + "loss": 0.8001, + "step": 570 + }, + { + "epoch": 0.07441621760328458, + "grad_norm": 327.2189079763857, + "learning_rate": 9.998646205897307e-07, + "loss": 0.8109, + "step": 580 + }, + { + "epoch": 0.07569925583782397, + "grad_norm": 507.661184728687, + "learning_rate": 9.998393678109586e-07, + "loss": 0.8227, + "step": 590 + }, + { + "epoch": 0.07698229407236336, + "grad_norm": 507.8603168808506, + "learning_rate": 9.998119571562829e-07, + "loss": 0.734, + "step": 600 + }, + { + "epoch": 0.07826533230690275, + "grad_norm": 291.5013707960829, + "learning_rate": 9.997823887440392e-07, + "loss": 0.7819, + "step": 610 + }, + { + "epoch": 0.07954837054144213, + "grad_norm": 317.10716461745744, + "learning_rate": 9.997506627018785e-07, + "loss": 0.7876, + "step": 620 + }, + { + "epoch": 0.08083140877598152, + "grad_norm": 477.1067374784116, + "learning_rate": 9.997167791667667e-07, + "loss": 0.7551, + "step": 630 + }, + { + "epoch": 0.08211444701052091, + "grad_norm": 653.2442572553759, + "learning_rate": 9.996807382849833e-07, + "loss": 0.7663, + "step": 640 + }, + { + "epoch": 0.0833974852450603, + "grad_norm": 228.06957512543602, + "learning_rate": 9.996425402121223e-07, + "loss": 0.6993, + "step": 650 + }, + { + "epoch": 0.08468052347959969, + "grad_norm": 223.14447368264166, + "learning_rate": 9.996021851130896e-07, + "loss": 0.6677, + "step": 660 + }, + { + "epoch": 0.08596356171413908, + "grad_norm": 342.9681928956178, + "learning_rate": 9.995596731621042e-07, + "loss": 0.8205, + "step": 670 + }, + { + "epoch": 0.08724659994867848, + "grad_norm": 246.02734714690487, + "learning_rate": 9.995150045426956e-07, + "loss": 0.7161, + "step": 680 + }, + { + "epoch": 0.08852963818321787, + "grad_norm": 234.15083343542562, + "learning_rate": 9.99468179447705e-07, + "loss": 0.7175, + "step": 690 + }, + { + "epoch": 0.08981267641775725, + "grad_norm": 438.81524006588677, + "learning_rate": 9.994191980792822e-07, + "loss": 0.7559, + "step": 700 + }, + { + "epoch": 0.09109571465229664, + "grad_norm": 279.48617587710146, + "learning_rate": 9.99368060648887e-07, + "loss": 0.7041, + "step": 710 + }, + { + "epoch": 0.09237875288683603, + "grad_norm": 316.5615517849292, + "learning_rate": 9.99314767377287e-07, + "loss": 0.737, + "step": 720 + }, + { + "epoch": 0.09366179112137542, + "grad_norm": 454.2678386589637, + "learning_rate": 9.99259318494556e-07, + "loss": 0.7204, + "step": 730 + }, + { + "epoch": 0.0949448293559148, + "grad_norm": 230.2338953911978, + "learning_rate": 9.99201714240075e-07, + "loss": 0.6651, + "step": 740 + }, + { + "epoch": 0.0962278675904542, + "grad_norm": 226.87115816479087, + "learning_rate": 9.991419548625292e-07, + "loss": 0.684, + "step": 750 + }, + { + "epoch": 0.09751090582499358, + "grad_norm": 369.94087067911994, + "learning_rate": 9.990800406199086e-07, + "loss": 0.7172, + "step": 760 + }, + { + "epoch": 0.09879394405953297, + "grad_norm": 356.8589884789159, + "learning_rate": 9.990159717795053e-07, + "loss": 0.719, + "step": 770 + }, + { + "epoch": 0.10007698229407236, + "grad_norm": 310.68627663653587, + "learning_rate": 9.989497486179132e-07, + "loss": 0.7184, + "step": 780 + }, + { + "epoch": 0.10136002052861175, + "grad_norm": 337.0595453131686, + "learning_rate": 9.988813714210272e-07, + "loss": 0.6973, + "step": 790 + }, + { + "epoch": 0.10264305876315115, + "grad_norm": 285.40926548192584, + "learning_rate": 9.988108404840408e-07, + "loss": 0.6484, + "step": 800 + }, + { + "epoch": 0.10392609699769054, + "grad_norm": 265.03398808439056, + "learning_rate": 9.987381561114462e-07, + "loss": 0.7136, + "step": 810 + }, + { + "epoch": 0.10520913523222993, + "grad_norm": 252.80292081179402, + "learning_rate": 9.986633186170317e-07, + "loss": 0.7195, + "step": 820 + }, + { + "epoch": 0.10649217346676931, + "grad_norm": 403.67151516536325, + "learning_rate": 9.985863283238813e-07, + "loss": 0.6701, + "step": 830 + }, + { + "epoch": 0.1077752117013087, + "grad_norm": 232.20153366017854, + "learning_rate": 9.985071855643727e-07, + "loss": 0.6209, + "step": 840 + }, + { + "epoch": 0.10905824993584809, + "grad_norm": 305.16884609933203, + "learning_rate": 9.98425890680176e-07, + "loss": 0.6898, + "step": 850 + }, + { + "epoch": 0.11034128817038748, + "grad_norm": 282.9491430539344, + "learning_rate": 9.983424440222529e-07, + "loss": 0.6612, + "step": 860 + }, + { + "epoch": 0.11162432640492687, + "grad_norm": 241.97167461227284, + "learning_rate": 9.98256845950854e-07, + "loss": 0.6532, + "step": 870 + }, + { + "epoch": 0.11290736463946625, + "grad_norm": 230.90958680331048, + "learning_rate": 9.98169096835518e-07, + "loss": 0.6538, + "step": 880 + }, + { + "epoch": 0.11419040287400564, + "grad_norm": 232.06203971090605, + "learning_rate": 9.980791970550697e-07, + "loss": 0.6653, + "step": 890 + }, + { + "epoch": 0.11547344110854503, + "grad_norm": 364.33087320101055, + "learning_rate": 9.979871469976195e-07, + "loss": 0.6594, + "step": 900 + }, + { + "epoch": 0.11675647934308442, + "grad_norm": 103.65289009950727, + "learning_rate": 9.978929470605598e-07, + "loss": 0.7068, + "step": 910 + }, + { + "epoch": 0.1180395175776238, + "grad_norm": 195.1109220427102, + "learning_rate": 9.977965976505654e-07, + "loss": 0.649, + "step": 920 + }, + { + "epoch": 0.11932255581216321, + "grad_norm": 233.23314658006575, + "learning_rate": 9.976980991835893e-07, + "loss": 0.6652, + "step": 930 + }, + { + "epoch": 0.1206055940467026, + "grad_norm": 223.3146259779246, + "learning_rate": 9.975974520848637e-07, + "loss": 0.6818, + "step": 940 + }, + { + "epoch": 0.12188863228124198, + "grad_norm": 258.394496936126, + "learning_rate": 9.974946567888956e-07, + "loss": 0.6378, + "step": 950 + }, + { + "epoch": 0.12317167051578137, + "grad_norm": 295.0456073783979, + "learning_rate": 9.97389713739467e-07, + "loss": 0.629, + "step": 960 + }, + { + "epoch": 0.12445470875032076, + "grad_norm": 126.19841498495917, + "learning_rate": 9.972826233896313e-07, + "loss": 0.6546, + "step": 970 + }, + { + "epoch": 0.12573774698486015, + "grad_norm": 237.25165124958343, + "learning_rate": 9.971733862017124e-07, + "loss": 0.6544, + "step": 980 + }, + { + "epoch": 0.12702078521939955, + "grad_norm": 301.4413520225032, + "learning_rate": 9.970620026473025e-07, + "loss": 0.6871, + "step": 990 + }, + { + "epoch": 0.12830382345393893, + "grad_norm": 209.16203458834582, + "learning_rate": 9.9694847320726e-07, + "loss": 0.7056, + "step": 1000 + }, + { + "epoch": 0.12958686168847833, + "grad_norm": 156.63440120768837, + "learning_rate": 9.96832798371707e-07, + "loss": 0.6592, + "step": 1010 + }, + { + "epoch": 0.1308698999230177, + "grad_norm": 258.5603789840098, + "learning_rate": 9.967149786400277e-07, + "loss": 0.7091, + "step": 1020 + }, + { + "epoch": 0.1321529381575571, + "grad_norm": 204.47053430978377, + "learning_rate": 9.965950145208666e-07, + "loss": 0.6977, + "step": 1030 + }, + { + "epoch": 0.13343597639209648, + "grad_norm": 133.16898643414888, + "learning_rate": 9.964729065321252e-07, + "loss": 0.6963, + "step": 1040 + }, + { + "epoch": 0.13471901462663588, + "grad_norm": 241.45149675600217, + "learning_rate": 9.96348655200961e-07, + "loss": 0.6206, + "step": 1050 + }, + { + "epoch": 0.13600205286117525, + "grad_norm": 250.08099126164794, + "learning_rate": 9.962222610637835e-07, + "loss": 0.61, + "step": 1060 + }, + { + "epoch": 0.13728509109571466, + "grad_norm": 65.2587567926103, + "learning_rate": 9.960937246662545e-07, + "loss": 0.6489, + "step": 1070 + }, + { + "epoch": 0.13856812933025403, + "grad_norm": 744.4673580347398, + "learning_rate": 9.959630465632831e-07, + "loss": 0.6169, + "step": 1080 + }, + { + "epoch": 0.13985116756479343, + "grad_norm": 141.4598968828845, + "learning_rate": 9.958302273190247e-07, + "loss": 0.6523, + "step": 1090 + }, + { + "epoch": 0.1411342057993328, + "grad_norm": 150.4269460426325, + "learning_rate": 9.956952675068786e-07, + "loss": 0.6689, + "step": 1100 + }, + { + "epoch": 0.1424172440338722, + "grad_norm": 267.7821596732494, + "learning_rate": 9.95558167709485e-07, + "loss": 0.6833, + "step": 1110 + }, + { + "epoch": 0.1437002822684116, + "grad_norm": 125.41361172218653, + "learning_rate": 9.954189285187226e-07, + "loss": 0.6424, + "step": 1120 + }, + { + "epoch": 0.14498332050295099, + "grad_norm": 196.11527084156623, + "learning_rate": 9.952775505357065e-07, + "loss": 0.5982, + "step": 1130 + }, + { + "epoch": 0.1462663587374904, + "grad_norm": 185.6091907451524, + "learning_rate": 9.95134034370785e-07, + "loss": 0.6308, + "step": 1140 + }, + { + "epoch": 0.14754939697202976, + "grad_norm": 285.25599028212946, + "learning_rate": 9.949883806435375e-07, + "loss": 0.6955, + "step": 1150 + }, + { + "epoch": 0.14883243520656916, + "grad_norm": 404.9475392260525, + "learning_rate": 9.948405899827708e-07, + "loss": 0.5763, + "step": 1160 + }, + { + "epoch": 0.15011547344110854, + "grad_norm": 345.3807078818553, + "learning_rate": 9.946906630265184e-07, + "loss": 0.6252, + "step": 1170 + }, + { + "epoch": 0.15139851167564794, + "grad_norm": 351.78154185039, + "learning_rate": 9.94538600422035e-07, + "loss": 0.6159, + "step": 1180 + }, + { + "epoch": 0.15268154991018731, + "grad_norm": 313.7318301380712, + "learning_rate": 9.943844028257967e-07, + "loss": 0.6403, + "step": 1190 + }, + { + "epoch": 0.15396458814472672, + "grad_norm": 211.11391452081756, + "learning_rate": 9.942280709034954e-07, + "loss": 0.6475, + "step": 1200 + }, + { + "epoch": 0.1552476263792661, + "grad_norm": 249.20146795393595, + "learning_rate": 9.940696053300378e-07, + "loss": 0.6219, + "step": 1210 + }, + { + "epoch": 0.1565306646138055, + "grad_norm": 226.24172694061625, + "learning_rate": 9.93909006789542e-07, + "loss": 0.6704, + "step": 1220 + }, + { + "epoch": 0.1578137028483449, + "grad_norm": 312.29256022912784, + "learning_rate": 9.93746275975334e-07, + "loss": 0.6352, + "step": 1230 + }, + { + "epoch": 0.15909674108288427, + "grad_norm": 222.63710577868898, + "learning_rate": 9.935814135899454e-07, + "loss": 0.6352, + "step": 1240 + }, + { + "epoch": 0.16037977931742367, + "grad_norm": 273.42902965842177, + "learning_rate": 9.934144203451101e-07, + "loss": 0.6039, + "step": 1250 + }, + { + "epoch": 0.16166281755196305, + "grad_norm": 155.07946280069893, + "learning_rate": 9.932452969617607e-07, + "loss": 0.6322, + "step": 1260 + }, + { + "epoch": 0.16294585578650245, + "grad_norm": 101.60220691007102, + "learning_rate": 9.930740441700266e-07, + "loss": 0.7185, + "step": 1270 + }, + { + "epoch": 0.16422889402104182, + "grad_norm": 185.61083170261958, + "learning_rate": 9.929006627092297e-07, + "loss": 0.6227, + "step": 1280 + }, + { + "epoch": 0.16551193225558122, + "grad_norm": 110.9066944129287, + "learning_rate": 9.927251533278821e-07, + "loss": 0.6976, + "step": 1290 + }, + { + "epoch": 0.1667949704901206, + "grad_norm": 174.89332362163964, + "learning_rate": 9.925475167836819e-07, + "loss": 0.6407, + "step": 1300 + }, + { + "epoch": 0.16807800872466, + "grad_norm": 204.6341137211078, + "learning_rate": 9.923677538435108e-07, + "loss": 0.6747, + "step": 1310 + }, + { + "epoch": 0.16936104695919937, + "grad_norm": 393.60882051589294, + "learning_rate": 9.921858652834304e-07, + "loss": 0.586, + "step": 1320 + }, + { + "epoch": 0.17064408519373878, + "grad_norm": 54.92861317673329, + "learning_rate": 9.920018518886788e-07, + "loss": 0.6294, + "step": 1330 + }, + { + "epoch": 0.17192712342827815, + "grad_norm": 268.75192804307983, + "learning_rate": 9.918157144536675e-07, + "loss": 0.5801, + "step": 1340 + }, + { + "epoch": 0.17321016166281755, + "grad_norm": 153.14786662378606, + "learning_rate": 9.916274537819773e-07, + "loss": 0.6326, + "step": 1350 + }, + { + "epoch": 0.17449319989735695, + "grad_norm": 127.74567065481183, + "learning_rate": 9.914370706863558e-07, + "loss": 0.6002, + "step": 1360 + }, + { + "epoch": 0.17577623813189633, + "grad_norm": 98.29606961586033, + "learning_rate": 9.912445659887135e-07, + "loss": 0.5871, + "step": 1370 + }, + { + "epoch": 0.17705927636643573, + "grad_norm": 181.93775603253522, + "learning_rate": 9.910499405201193e-07, + "loss": 0.574, + "step": 1380 + }, + { + "epoch": 0.1783423146009751, + "grad_norm": 150.4272850120788, + "learning_rate": 9.90853195120799e-07, + "loss": 0.6312, + "step": 1390 + }, + { + "epoch": 0.1796253528355145, + "grad_norm": 47.054521439082414, + "learning_rate": 9.906543306401293e-07, + "loss": 0.6385, + "step": 1400 + }, + { + "epoch": 0.18090839107005388, + "grad_norm": 116.23551412983754, + "learning_rate": 9.904533479366363e-07, + "loss": 0.5982, + "step": 1410 + }, + { + "epoch": 0.18219142930459328, + "grad_norm": 206.72052851250405, + "learning_rate": 9.902502478779896e-07, + "loss": 0.5772, + "step": 1420 + }, + { + "epoch": 0.18347446753913266, + "grad_norm": 160.18307601981758, + "learning_rate": 9.900450313410007e-07, + "loss": 0.6261, + "step": 1430 + }, + { + "epoch": 0.18475750577367206, + "grad_norm": 149.32715135865536, + "learning_rate": 9.898376992116177e-07, + "loss": 0.6282, + "step": 1440 + }, + { + "epoch": 0.18604054400821143, + "grad_norm": 219.2275247767045, + "learning_rate": 9.896282523849223e-07, + "loss": 0.5697, + "step": 1450 + }, + { + "epoch": 0.18732358224275084, + "grad_norm": 283.63854669571714, + "learning_rate": 9.894166917651254e-07, + "loss": 0.5917, + "step": 1460 + }, + { + "epoch": 0.1886066204772902, + "grad_norm": 277.5329220598513, + "learning_rate": 9.892030182655638e-07, + "loss": 0.5188, + "step": 1470 + }, + { + "epoch": 0.1898896587118296, + "grad_norm": 132.85493246872863, + "learning_rate": 9.889872328086952e-07, + "loss": 0.6055, + "step": 1480 + }, + { + "epoch": 0.19117269694636901, + "grad_norm": 146.11831965983288, + "learning_rate": 9.887693363260957e-07, + "loss": 0.5484, + "step": 1490 + }, + { + "epoch": 0.1924557351809084, + "grad_norm": 182.57054457540633, + "learning_rate": 9.885493297584545e-07, + "loss": 0.576, + "step": 1500 + }, + { + "epoch": 0.1937387734154478, + "grad_norm": 189.38138450066361, + "learning_rate": 9.883272140555708e-07, + "loss": 0.6384, + "step": 1510 + }, + { + "epoch": 0.19502181164998716, + "grad_norm": 170.33597752529735, + "learning_rate": 9.881029901763484e-07, + "loss": 0.6115, + "step": 1520 + }, + { + "epoch": 0.19630484988452657, + "grad_norm": 81.06797112294171, + "learning_rate": 9.878766590887932e-07, + "loss": 0.5857, + "step": 1530 + }, + { + "epoch": 0.19758788811906594, + "grad_norm": 219.84688036598953, + "learning_rate": 9.876482217700078e-07, + "loss": 0.5488, + "step": 1540 + }, + { + "epoch": 0.19887092635360534, + "grad_norm": 216.39954202652584, + "learning_rate": 9.874176792061879e-07, + "loss": 0.5885, + "step": 1550 + }, + { + "epoch": 0.20015396458814472, + "grad_norm": 109.09383116199874, + "learning_rate": 9.871850323926177e-07, + "loss": 0.6076, + "step": 1560 + }, + { + "epoch": 0.20143700282268412, + "grad_norm": 229.12929072548326, + "learning_rate": 9.869502823336654e-07, + "loss": 0.6152, + "step": 1570 + }, + { + "epoch": 0.2027200410572235, + "grad_norm": 152.8058942167868, + "learning_rate": 9.867134300427805e-07, + "loss": 0.6034, + "step": 1580 + }, + { + "epoch": 0.2040030792917629, + "grad_norm": 280.1936257726876, + "learning_rate": 9.864744765424864e-07, + "loss": 0.6485, + "step": 1590 + }, + { + "epoch": 0.2052861175263023, + "grad_norm": 199.2244661839233, + "learning_rate": 9.862334228643788e-07, + "loss": 0.5796, + "step": 1600 + }, + { + "epoch": 0.20656915576084167, + "grad_norm": 235.49608912619834, + "learning_rate": 9.859902700491197e-07, + "loss": 0.622, + "step": 1610 + }, + { + "epoch": 0.20785219399538107, + "grad_norm": 204.31865483159947, + "learning_rate": 9.857450191464337e-07, + "loss": 0.5837, + "step": 1620 + }, + { + "epoch": 0.20913523222992045, + "grad_norm": 143.40245282299608, + "learning_rate": 9.85497671215103e-07, + "loss": 0.6127, + "step": 1630 + }, + { + "epoch": 0.21041827046445985, + "grad_norm": 157.77826827829796, + "learning_rate": 9.852482273229627e-07, + "loss": 0.5605, + "step": 1640 + }, + { + "epoch": 0.21170130869899922, + "grad_norm": 96.15035213737738, + "learning_rate": 9.849966885468971e-07, + "loss": 0.5956, + "step": 1650 + }, + { + "epoch": 0.21298434693353863, + "grad_norm": 66.33699904485792, + "learning_rate": 9.847430559728337e-07, + "loss": 0.5359, + "step": 1660 + }, + { + "epoch": 0.214267385168078, + "grad_norm": 139.10605552465088, + "learning_rate": 9.844873306957398e-07, + "loss": 0.5855, + "step": 1670 + }, + { + "epoch": 0.2155504234026174, + "grad_norm": 216.80204788879325, + "learning_rate": 9.842295138196164e-07, + "loss": 0.5589, + "step": 1680 + }, + { + "epoch": 0.21683346163715678, + "grad_norm": 43.51628407628625, + "learning_rate": 9.83969606457495e-07, + "loss": 0.5728, + "step": 1690 + }, + { + "epoch": 0.21811649987169618, + "grad_norm": 95.10318097720744, + "learning_rate": 9.837076097314318e-07, + "loss": 0.6076, + "step": 1700 + }, + { + "epoch": 0.21939953810623555, + "grad_norm": 82.86768680504082, + "learning_rate": 9.83443524772503e-07, + "loss": 0.6278, + "step": 1710 + }, + { + "epoch": 0.22068257634077496, + "grad_norm": 157.9159497917049, + "learning_rate": 9.831773527208002e-07, + "loss": 0.5671, + "step": 1720 + }, + { + "epoch": 0.22196561457531436, + "grad_norm": 97.12336610237939, + "learning_rate": 9.829090947254247e-07, + "loss": 0.5836, + "step": 1730 + }, + { + "epoch": 0.22324865280985373, + "grad_norm": 108.15125915732064, + "learning_rate": 9.826387519444836e-07, + "loss": 0.5938, + "step": 1740 + }, + { + "epoch": 0.22453169104439313, + "grad_norm": 85.39212598792813, + "learning_rate": 9.823663255450844e-07, + "loss": 0.5264, + "step": 1750 + }, + { + "epoch": 0.2258147292789325, + "grad_norm": 68.97258018960441, + "learning_rate": 9.820918167033294e-07, + "loss": 0.5481, + "step": 1760 + }, + { + "epoch": 0.2270977675134719, + "grad_norm": 131.26378392117633, + "learning_rate": 9.818152266043115e-07, + "loss": 0.6233, + "step": 1770 + }, + { + "epoch": 0.22838080574801128, + "grad_norm": 170.7541622515897, + "learning_rate": 9.815365564421085e-07, + "loss": 0.6136, + "step": 1780 + }, + { + "epoch": 0.22966384398255069, + "grad_norm": 160.71265772299893, + "learning_rate": 9.81255807419778e-07, + "loss": 0.5565, + "step": 1790 + }, + { + "epoch": 0.23094688221709006, + "grad_norm": 101.21727674235666, + "learning_rate": 9.80972980749353e-07, + "loss": 0.574, + "step": 1800 + }, + { + "epoch": 0.23222992045162946, + "grad_norm": 144.74537535470733, + "learning_rate": 9.806880776518349e-07, + "loss": 0.5317, + "step": 1810 + }, + { + "epoch": 0.23351295868616884, + "grad_norm": 146.97094513359616, + "learning_rate": 9.8040109935719e-07, + "loss": 0.5946, + "step": 1820 + }, + { + "epoch": 0.23479599692070824, + "grad_norm": 125.66227537593764, + "learning_rate": 9.801120471043438e-07, + "loss": 0.6007, + "step": 1830 + }, + { + "epoch": 0.2360790351552476, + "grad_norm": 182.89139311257597, + "learning_rate": 9.798209221411746e-07, + "loss": 0.5801, + "step": 1840 + }, + { + "epoch": 0.23736207338978701, + "grad_norm": 201.12682062033193, + "learning_rate": 9.795277257245094e-07, + "loss": 0.5778, + "step": 1850 + }, + { + "epoch": 0.23864511162432642, + "grad_norm": 132.82867011061398, + "learning_rate": 9.792324591201177e-07, + "loss": 0.5839, + "step": 1860 + }, + { + "epoch": 0.2399281498588658, + "grad_norm": 67.79255815839814, + "learning_rate": 9.789351236027066e-07, + "loss": 0.5892, + "step": 1870 + }, + { + "epoch": 0.2412111880934052, + "grad_norm": 175.6249057361181, + "learning_rate": 9.786357204559149e-07, + "loss": 0.6003, + "step": 1880 + }, + { + "epoch": 0.24249422632794457, + "grad_norm": 109.66816115052191, + "learning_rate": 9.78334250972307e-07, + "loss": 0.5703, + "step": 1890 + }, + { + "epoch": 0.24377726456248397, + "grad_norm": 140.33994022687585, + "learning_rate": 9.780307164533687e-07, + "loss": 0.5721, + "step": 1900 + }, + { + "epoch": 0.24506030279702334, + "grad_norm": 91.15867876884485, + "learning_rate": 9.77725118209501e-07, + "loss": 0.581, + "step": 1910 + }, + { + "epoch": 0.24634334103156275, + "grad_norm": 104.94771349878727, + "learning_rate": 9.774174575600136e-07, + "loss": 0.5448, + "step": 1920 + }, + { + "epoch": 0.24762637926610212, + "grad_norm": 101.12966460927039, + "learning_rate": 9.771077358331201e-07, + "loss": 0.5859, + "step": 1930 + }, + { + "epoch": 0.24890941750064152, + "grad_norm": 106.44729021928742, + "learning_rate": 9.767959543659325e-07, + "loss": 0.6228, + "step": 1940 + }, + { + "epoch": 0.2501924557351809, + "grad_norm": 98.18438143073902, + "learning_rate": 9.764821145044543e-07, + "loss": 0.5629, + "step": 1950 + }, + { + "epoch": 0.2514754939697203, + "grad_norm": 207.50210063130612, + "learning_rate": 9.761662176035762e-07, + "loss": 0.5667, + "step": 1960 + }, + { + "epoch": 0.2527585322042597, + "grad_norm": 103.33501255218327, + "learning_rate": 9.758482650270685e-07, + "loss": 0.5444, + "step": 1970 + }, + { + "epoch": 0.2540415704387991, + "grad_norm": 89.40546395345238, + "learning_rate": 9.755282581475767e-07, + "loss": 0.5563, + "step": 1980 + }, + { + "epoch": 0.25532460867333845, + "grad_norm": 157.2807539819587, + "learning_rate": 9.75206198346615e-07, + "loss": 0.557, + "step": 1990 + }, + { + "epoch": 0.25660764690787785, + "grad_norm": 193.08650407857195, + "learning_rate": 9.748820870145602e-07, + "loss": 0.532, + "step": 2000 + }, + { + "epoch": 0.25789068514241725, + "grad_norm": 182.83253768958875, + "learning_rate": 9.74555925550646e-07, + "loss": 0.6271, + "step": 2010 + }, + { + "epoch": 0.25917372337695666, + "grad_norm": 95.47772962832124, + "learning_rate": 9.742277153629563e-07, + "loss": 0.5432, + "step": 2020 + }, + { + "epoch": 0.260456761611496, + "grad_norm": 152.57051795847727, + "learning_rate": 9.738974578684206e-07, + "loss": 0.5639, + "step": 2030 + }, + { + "epoch": 0.2617397998460354, + "grad_norm": 142.19066762704168, + "learning_rate": 9.735651544928058e-07, + "loss": 0.5361, + "step": 2040 + }, + { + "epoch": 0.2630228380805748, + "grad_norm": 74.44691804759147, + "learning_rate": 9.73230806670712e-07, + "loss": 0.5312, + "step": 2050 + }, + { + "epoch": 0.2643058763151142, + "grad_norm": 62.817015904843984, + "learning_rate": 9.728944158455653e-07, + "loss": 0.5895, + "step": 2060 + }, + { + "epoch": 0.26558891454965355, + "grad_norm": 95.24820933227181, + "learning_rate": 9.725559834696109e-07, + "loss": 0.5688, + "step": 2070 + }, + { + "epoch": 0.26687195278419296, + "grad_norm": 81.31135945557507, + "learning_rate": 9.722155110039089e-07, + "loss": 0.5045, + "step": 2080 + }, + { + "epoch": 0.26815499101873236, + "grad_norm": 132.00497151945328, + "learning_rate": 9.71872999918326e-07, + "loss": 0.6027, + "step": 2090 + }, + { + "epoch": 0.26943802925327176, + "grad_norm": 88.8462717644516, + "learning_rate": 9.7152845169153e-07, + "loss": 0.5944, + "step": 2100 + }, + { + "epoch": 0.27072106748781116, + "grad_norm": 138.53297477669003, + "learning_rate": 9.711818678109837e-07, + "loss": 0.5995, + "step": 2110 + }, + { + "epoch": 0.2720041057223505, + "grad_norm": 227.50860767258524, + "learning_rate": 9.708332497729376e-07, + "loss": 0.5739, + "step": 2120 + }, + { + "epoch": 0.2732871439568899, + "grad_norm": 78.8047137515368, + "learning_rate": 9.70482599082424e-07, + "loss": 0.5506, + "step": 2130 + }, + { + "epoch": 0.2745701821914293, + "grad_norm": 125.51465337780404, + "learning_rate": 9.701299172532508e-07, + "loss": 0.4835, + "step": 2140 + }, + { + "epoch": 0.2758532204259687, + "grad_norm": 59.35917935492479, + "learning_rate": 9.697752058079942e-07, + "loss": 0.5569, + "step": 2150 + }, + { + "epoch": 0.27713625866050806, + "grad_norm": 118.55659208359444, + "learning_rate": 9.694184662779929e-07, + "loss": 0.5302, + "step": 2160 + }, + { + "epoch": 0.27841929689504746, + "grad_norm": 136.11589338428104, + "learning_rate": 9.69059700203341e-07, + "loss": 0.5203, + "step": 2170 + }, + { + "epoch": 0.27970233512958687, + "grad_norm": 74.46076393132971, + "learning_rate": 9.686989091328812e-07, + "loss": 0.509, + "step": 2180 + }, + { + "epoch": 0.28098537336412627, + "grad_norm": 94.3500124559077, + "learning_rate": 9.683360946241987e-07, + "loss": 0.5293, + "step": 2190 + }, + { + "epoch": 0.2822684115986656, + "grad_norm": 74.55788976206907, + "learning_rate": 9.67971258243614e-07, + "loss": 0.5488, + "step": 2200 + }, + { + "epoch": 0.283551449833205, + "grad_norm": 134.5951365670617, + "learning_rate": 9.676044015661768e-07, + "loss": 0.5981, + "step": 2210 + }, + { + "epoch": 0.2848344880677444, + "grad_norm": 92.03908896696493, + "learning_rate": 9.672355261756576e-07, + "loss": 0.5348, + "step": 2220 + }, + { + "epoch": 0.2861175263022838, + "grad_norm": 78.78226915642787, + "learning_rate": 9.668646336645432e-07, + "loss": 0.5076, + "step": 2230 + }, + { + "epoch": 0.2874005645368232, + "grad_norm": 196.20657017497456, + "learning_rate": 9.664917256340278e-07, + "loss": 0.5311, + "step": 2240 + }, + { + "epoch": 0.28868360277136257, + "grad_norm": 163.40688357061603, + "learning_rate": 9.661168036940071e-07, + "loss": 0.5536, + "step": 2250 + }, + { + "epoch": 0.28996664100590197, + "grad_norm": 209.06372183941585, + "learning_rate": 9.657398694630712e-07, + "loss": 0.5249, + "step": 2260 + }, + { + "epoch": 0.2912496792404414, + "grad_norm": 144.40246568412755, + "learning_rate": 9.653609245684972e-07, + "loss": 0.5488, + "step": 2270 + }, + { + "epoch": 0.2925327174749808, + "grad_norm": 143.32041228685134, + "learning_rate": 9.649799706462434e-07, + "loss": 0.5411, + "step": 2280 + }, + { + "epoch": 0.2938157557095201, + "grad_norm": 184.47820197170782, + "learning_rate": 9.645970093409402e-07, + "loss": 0.5124, + "step": 2290 + }, + { + "epoch": 0.2950987939440595, + "grad_norm": 96.52836275154928, + "learning_rate": 9.642120423058849e-07, + "loss": 0.497, + "step": 2300 + }, + { + "epoch": 0.2963818321785989, + "grad_norm": 121.24223812508137, + "learning_rate": 9.638250712030334e-07, + "loss": 0.5384, + "step": 2310 + }, + { + "epoch": 0.2976648704131383, + "grad_norm": 179.31385123040465, + "learning_rate": 9.634360977029939e-07, + "loss": 0.5549, + "step": 2320 + }, + { + "epoch": 0.2989479086476777, + "grad_norm": 65.11155498939304, + "learning_rate": 9.63045123485019e-07, + "loss": 0.5405, + "step": 2330 + }, + { + "epoch": 0.3002309468822171, + "grad_norm": 44.77811128594072, + "learning_rate": 9.626521502369983e-07, + "loss": 0.5299, + "step": 2340 + }, + { + "epoch": 0.3015139851167565, + "grad_norm": 192.7822050634201, + "learning_rate": 9.622571796554522e-07, + "loss": 0.5289, + "step": 2350 + }, + { + "epoch": 0.3027970233512959, + "grad_norm": 100.99286642534246, + "learning_rate": 9.618602134455233e-07, + "loss": 0.5456, + "step": 2360 + }, + { + "epoch": 0.3040800615858353, + "grad_norm": 97.46475092913997, + "learning_rate": 9.6146125332097e-07, + "loss": 0.5342, + "step": 2370 + }, + { + "epoch": 0.30536309982037463, + "grad_norm": 85.79370933844478, + "learning_rate": 9.610603010041582e-07, + "loss": 0.5392, + "step": 2380 + }, + { + "epoch": 0.30664613805491403, + "grad_norm": 69.04229553053277, + "learning_rate": 9.60657358226055e-07, + "loss": 0.4992, + "step": 2390 + }, + { + "epoch": 0.30792917628945343, + "grad_norm": 124.92462876518213, + "learning_rate": 9.602524267262202e-07, + "loss": 0.5241, + "step": 2400 + }, + { + "epoch": 0.30921221452399283, + "grad_norm": 98.10179688147778, + "learning_rate": 9.598455082527991e-07, + "loss": 0.5287, + "step": 2410 + }, + { + "epoch": 0.3104952527585322, + "grad_norm": 108.61518323292323, + "learning_rate": 9.594366045625153e-07, + "loss": 0.5378, + "step": 2420 + }, + { + "epoch": 0.3117782909930716, + "grad_norm": 78.8604580425309, + "learning_rate": 9.590257174206629e-07, + "loss": 0.5291, + "step": 2430 + }, + { + "epoch": 0.313061329227611, + "grad_norm": 90.02785655021445, + "learning_rate": 9.586128486010985e-07, + "loss": 0.542, + "step": 2440 + }, + { + "epoch": 0.3143443674621504, + "grad_norm": 120.60051139676054, + "learning_rate": 9.58197999886234e-07, + "loss": 0.547, + "step": 2450 + }, + { + "epoch": 0.3156274056966898, + "grad_norm": 113.3678097491849, + "learning_rate": 9.577811730670295e-07, + "loss": 0.5212, + "step": 2460 + }, + { + "epoch": 0.31691044393122914, + "grad_norm": 152.82224829070685, + "learning_rate": 9.573623699429837e-07, + "loss": 0.5224, + "step": 2470 + }, + { + "epoch": 0.31819348216576854, + "grad_norm": 76.17655003840161, + "learning_rate": 9.569415923221275e-07, + "loss": 0.5186, + "step": 2480 + }, + { + "epoch": 0.31947652040030794, + "grad_norm": 83.8072497505936, + "learning_rate": 9.565188420210168e-07, + "loss": 0.5173, + "step": 2490 + }, + { + "epoch": 0.32075955863484734, + "grad_norm": 96.69870222358485, + "learning_rate": 9.560941208647231e-07, + "loss": 0.4814, + "step": 2500 + }, + { + "epoch": 0.3220425968693867, + "grad_norm": 85.59984589074426, + "learning_rate": 9.556674306868264e-07, + "loss": 0.526, + "step": 2510 + }, + { + "epoch": 0.3233256351039261, + "grad_norm": 115.69951917108001, + "learning_rate": 9.552387733294078e-07, + "loss": 0.5414, + "step": 2520 + }, + { + "epoch": 0.3246086733384655, + "grad_norm": 145.5625207118623, + "learning_rate": 9.548081506430406e-07, + "loss": 0.5623, + "step": 2530 + }, + { + "epoch": 0.3258917115730049, + "grad_norm": 99.50445608172687, + "learning_rate": 9.543755644867822e-07, + "loss": 0.5832, + "step": 2540 + }, + { + "epoch": 0.32717474980754424, + "grad_norm": 91.68401105242992, + "learning_rate": 9.539410167281671e-07, + "loss": 0.5401, + "step": 2550 + }, + { + "epoch": 0.32845778804208364, + "grad_norm": 67.09981635540203, + "learning_rate": 9.535045092431988e-07, + "loss": 0.5208, + "step": 2560 + }, + { + "epoch": 0.32974082627662304, + "grad_norm": 104.48143010449827, + "learning_rate": 9.530660439163402e-07, + "loss": 0.552, + "step": 2570 + }, + { + "epoch": 0.33102386451116245, + "grad_norm": 84.42394895345447, + "learning_rate": 9.526256226405073e-07, + "loss": 0.5361, + "step": 2580 + }, + { + "epoch": 0.33230690274570185, + "grad_norm": 59.71217847206384, + "learning_rate": 9.521832473170596e-07, + "loss": 0.4921, + "step": 2590 + }, + { + "epoch": 0.3335899409802412, + "grad_norm": 78.64934994859199, + "learning_rate": 9.517389198557928e-07, + "loss": 0.5508, + "step": 2600 + }, + { + "epoch": 0.3348729792147806, + "grad_norm": 193.0353134817172, + "learning_rate": 9.512926421749303e-07, + "loss": 0.5125, + "step": 2610 + }, + { + "epoch": 0.33615601744932, + "grad_norm": 55.035054500657644, + "learning_rate": 9.508444162011147e-07, + "loss": 0.5336, + "step": 2620 + }, + { + "epoch": 0.3374390556838594, + "grad_norm": 89.41620561216038, + "learning_rate": 9.503942438693995e-07, + "loss": 0.5417, + "step": 2630 + }, + { + "epoch": 0.33872209391839875, + "grad_norm": 88.81010586231267, + "learning_rate": 9.499421271232415e-07, + "loss": 0.5204, + "step": 2640 + }, + { + "epoch": 0.34000513215293815, + "grad_norm": 129.76504794050945, + "learning_rate": 9.494880679144912e-07, + "loss": 0.5296, + "step": 2650 + }, + { + "epoch": 0.34128817038747755, + "grad_norm": 75.95833135628054, + "learning_rate": 9.490320682033853e-07, + "loss": 0.5403, + "step": 2660 + }, + { + "epoch": 0.34257120862201695, + "grad_norm": 108.63935916838176, + "learning_rate": 9.485741299585379e-07, + "loss": 0.5051, + "step": 2670 + }, + { + "epoch": 0.3438542468565563, + "grad_norm": 63.81690759659535, + "learning_rate": 9.481142551569317e-07, + "loss": 0.5135, + "step": 2680 + }, + { + "epoch": 0.3451372850910957, + "grad_norm": 40.65266595888347, + "learning_rate": 9.476524457839102e-07, + "loss": 0.5279, + "step": 2690 + }, + { + "epoch": 0.3464203233256351, + "grad_norm": 62.00451181203029, + "learning_rate": 9.471887038331684e-07, + "loss": 0.51, + "step": 2700 + }, + { + "epoch": 0.3477033615601745, + "grad_norm": 170.26761539331866, + "learning_rate": 9.467230313067447e-07, + "loss": 0.5384, + "step": 2710 + }, + { + "epoch": 0.3489863997947139, + "grad_norm": 120.94533652558341, + "learning_rate": 9.462554302150122e-07, + "loss": 0.5849, + "step": 2720 + }, + { + "epoch": 0.35026943802925325, + "grad_norm": 120.43096664810126, + "learning_rate": 9.457859025766695e-07, + "loss": 0.5594, + "step": 2730 + }, + { + "epoch": 0.35155247626379266, + "grad_norm": 57.833782596045616, + "learning_rate": 9.453144504187326e-07, + "loss": 0.4973, + "step": 2740 + }, + { + "epoch": 0.35283551449833206, + "grad_norm": 99.0938235889427, + "learning_rate": 9.448410757765258e-07, + "loss": 0.4999, + "step": 2750 + }, + { + "epoch": 0.35411855273287146, + "grad_norm": 69.10735137973282, + "learning_rate": 9.443657806936734e-07, + "loss": 0.4896, + "step": 2760 + }, + { + "epoch": 0.3554015909674108, + "grad_norm": 59.8246422242593, + "learning_rate": 9.438885672220897e-07, + "loss": 0.5276, + "step": 2770 + }, + { + "epoch": 0.3566846292019502, + "grad_norm": 149.736859286465, + "learning_rate": 9.434094374219721e-07, + "loss": 0.5285, + "step": 2780 + }, + { + "epoch": 0.3579676674364896, + "grad_norm": 45.83522677239938, + "learning_rate": 9.429283933617899e-07, + "loss": 0.5067, + "step": 2790 + }, + { + "epoch": 0.359250705671029, + "grad_norm": 49.13309130489495, + "learning_rate": 9.424454371182773e-07, + "loss": 0.5201, + "step": 2800 + }, + { + "epoch": 0.36053374390556836, + "grad_norm": 56.41038847457869, + "learning_rate": 9.419605707764233e-07, + "loss": 0.5391, + "step": 2810 + }, + { + "epoch": 0.36181678214010776, + "grad_norm": 137.21661708712807, + "learning_rate": 9.414737964294634e-07, + "loss": 0.5146, + "step": 2820 + }, + { + "epoch": 0.36309982037464716, + "grad_norm": 97.26634440538126, + "learning_rate": 9.409851161788697e-07, + "loss": 0.5146, + "step": 2830 + }, + { + "epoch": 0.36438285860918657, + "grad_norm": 94.66064502466513, + "learning_rate": 9.404945321343429e-07, + "loss": 0.5435, + "step": 2840 + }, + { + "epoch": 0.36566589684372597, + "grad_norm": 51.74410392536065, + "learning_rate": 9.400020464138023e-07, + "loss": 0.5688, + "step": 2850 + }, + { + "epoch": 0.3669489350782653, + "grad_norm": 60.11514043587916, + "learning_rate": 9.39507661143377e-07, + "loss": 0.4619, + "step": 2860 + }, + { + "epoch": 0.3682319733128047, + "grad_norm": 83.92644875994564, + "learning_rate": 9.39011378457397e-07, + "loss": 0.5362, + "step": 2870 + }, + { + "epoch": 0.3695150115473441, + "grad_norm": 39.54064056485653, + "learning_rate": 9.385132004983832e-07, + "loss": 0.4914, + "step": 2880 + }, + { + "epoch": 0.3707980497818835, + "grad_norm": 50.241723218110366, + "learning_rate": 9.380131294170393e-07, + "loss": 0.5813, + "step": 2890 + }, + { + "epoch": 0.37208108801642287, + "grad_norm": 73.55402078436745, + "learning_rate": 9.375111673722413e-07, + "loss": 0.4933, + "step": 2900 + }, + { + "epoch": 0.37336412625096227, + "grad_norm": 185.01116480980332, + "learning_rate": 9.370073165310292e-07, + "loss": 0.5317, + "step": 2910 + }, + { + "epoch": 0.37464716448550167, + "grad_norm": 112.6902578948012, + "learning_rate": 9.365015790685968e-07, + "loss": 0.5025, + "step": 2920 + }, + { + "epoch": 0.3759302027200411, + "grad_norm": 65.45440886042813, + "learning_rate": 9.35993957168283e-07, + "loss": 0.5172, + "step": 2930 + }, + { + "epoch": 0.3772132409545804, + "grad_norm": 91.6031352167508, + "learning_rate": 9.35484453021562e-07, + "loss": 0.5233, + "step": 2940 + }, + { + "epoch": 0.3784962791891198, + "grad_norm": 68.759744403661, + "learning_rate": 9.349730688280339e-07, + "loss": 0.5709, + "step": 2950 + }, + { + "epoch": 0.3797793174236592, + "grad_norm": 66.78943064147592, + "learning_rate": 9.344598067954151e-07, + "loss": 0.5002, + "step": 2960 + }, + { + "epoch": 0.3810623556581986, + "grad_norm": 115.45928338504864, + "learning_rate": 9.33944669139529e-07, + "loss": 0.5019, + "step": 2970 + }, + { + "epoch": 0.38234539389273803, + "grad_norm": 48.46911526563253, + "learning_rate": 9.334276580842966e-07, + "loss": 0.4973, + "step": 2980 + }, + { + "epoch": 0.3836284321272774, + "grad_norm": 59.680461304688116, + "learning_rate": 9.32908775861726e-07, + "loss": 0.4952, + "step": 2990 + }, + { + "epoch": 0.3849114703618168, + "grad_norm": 137.82183966119425, + "learning_rate": 9.32388024711904e-07, + "loss": 0.5009, + "step": 3000 + }, + { + "epoch": 0.3861945085963562, + "grad_norm": 72.88101260567575, + "learning_rate": 9.318654068829857e-07, + "loss": 0.5511, + "step": 3010 + }, + { + "epoch": 0.3874775468308956, + "grad_norm": 119.75893124268156, + "learning_rate": 9.313409246311843e-07, + "loss": 0.5023, + "step": 3020 + }, + { + "epoch": 0.3887605850654349, + "grad_norm": 73.8150724223774, + "learning_rate": 9.308145802207628e-07, + "loss": 0.5401, + "step": 3030 + }, + { + "epoch": 0.39004362329997433, + "grad_norm": 94.93395449552862, + "learning_rate": 9.302863759240231e-07, + "loss": 0.4967, + "step": 3040 + }, + { + "epoch": 0.39132666153451373, + "grad_norm": 92.76926948842548, + "learning_rate": 9.297563140212962e-07, + "loss": 0.5494, + "step": 3050 + }, + { + "epoch": 0.39260969976905313, + "grad_norm": 54.51188831354633, + "learning_rate": 9.29224396800933e-07, + "loss": 0.5543, + "step": 3060 + }, + { + "epoch": 0.3938927380035925, + "grad_norm": 61.97473543176993, + "learning_rate": 9.286906265592939e-07, + "loss": 0.4516, + "step": 3070 + }, + { + "epoch": 0.3951757762381319, + "grad_norm": 45.245587178773974, + "learning_rate": 9.281550056007394e-07, + "loss": 0.553, + "step": 3080 + }, + { + "epoch": 0.3964588144726713, + "grad_norm": 115.79739046578939, + "learning_rate": 9.27617536237619e-07, + "loss": 0.4947, + "step": 3090 + }, + { + "epoch": 0.3977418527072107, + "grad_norm": 85.9278518555287, + "learning_rate": 9.270782207902627e-07, + "loss": 0.4922, + "step": 3100 + }, + { + "epoch": 0.3990248909417501, + "grad_norm": 67.76999989757189, + "learning_rate": 9.265370615869703e-07, + "loss": 0.4935, + "step": 3110 + }, + { + "epoch": 0.40030792917628943, + "grad_norm": 54.86641762256036, + "learning_rate": 9.259940609640011e-07, + "loss": 0.491, + "step": 3120 + }, + { + "epoch": 0.40159096741082884, + "grad_norm": 49.738106812225254, + "learning_rate": 9.254492212655638e-07, + "loss": 0.5513, + "step": 3130 + }, + { + "epoch": 0.40287400564536824, + "grad_norm": 140.3576174385971, + "learning_rate": 9.249025448438075e-07, + "loss": 0.503, + "step": 3140 + }, + { + "epoch": 0.40415704387990764, + "grad_norm": 72.83607328411647, + "learning_rate": 9.243540340588096e-07, + "loss": 0.4628, + "step": 3150 + }, + { + "epoch": 0.405440082114447, + "grad_norm": 96.92348607041515, + "learning_rate": 9.238036912785679e-07, + "loss": 0.5191, + "step": 3160 + }, + { + "epoch": 0.4067231203489864, + "grad_norm": 72.80884144343837, + "learning_rate": 9.23251518878988e-07, + "loss": 0.4863, + "step": 3170 + }, + { + "epoch": 0.4080061585835258, + "grad_norm": 92.52108398865427, + "learning_rate": 9.22697519243875e-07, + "loss": 0.5476, + "step": 3180 + }, + { + "epoch": 0.4092891968180652, + "grad_norm": 62.37392442082554, + "learning_rate": 9.221416947649222e-07, + "loss": 0.5191, + "step": 3190 + }, + { + "epoch": 0.4105722350526046, + "grad_norm": 77.64070752149898, + "learning_rate": 9.215840478417009e-07, + "loss": 0.5246, + "step": 3200 + }, + { + "epoch": 0.41185527328714394, + "grad_norm": 85.75305982046062, + "learning_rate": 9.210245808816504e-07, + "loss": 0.5466, + "step": 3210 + }, + { + "epoch": 0.41313831152168334, + "grad_norm": 70.68374980619299, + "learning_rate": 9.20463296300067e-07, + "loss": 0.5213, + "step": 3220 + }, + { + "epoch": 0.41442134975622275, + "grad_norm": 105.17299703081034, + "learning_rate": 9.199001965200944e-07, + "loss": 0.4936, + "step": 3230 + }, + { + "epoch": 0.41570438799076215, + "grad_norm": 93.38321311405969, + "learning_rate": 9.19335283972712e-07, + "loss": 0.5184, + "step": 3240 + }, + { + "epoch": 0.4169874262253015, + "grad_norm": 81.80960050535629, + "learning_rate": 9.187685610967261e-07, + "loss": 0.4666, + "step": 3250 + }, + { + "epoch": 0.4182704644598409, + "grad_norm": 82.48523582162457, + "learning_rate": 9.182000303387577e-07, + "loss": 0.5286, + "step": 3260 + }, + { + "epoch": 0.4195535026943803, + "grad_norm": 86.58056082667797, + "learning_rate": 9.176296941532331e-07, + "loss": 0.5419, + "step": 3270 + }, + { + "epoch": 0.4208365409289197, + "grad_norm": 110.83314509596525, + "learning_rate": 9.170575550023729e-07, + "loss": 0.4989, + "step": 3280 + }, + { + "epoch": 0.42211957916345905, + "grad_norm": 100.82182214719839, + "learning_rate": 9.164836153561811e-07, + "loss": 0.5416, + "step": 3290 + }, + { + "epoch": 0.42340261739799845, + "grad_norm": 66.0891640337973, + "learning_rate": 9.159078776924345e-07, + "loss": 0.5108, + "step": 3300 + }, + { + "epoch": 0.42468565563253785, + "grad_norm": 58.500959259089456, + "learning_rate": 9.153303444966727e-07, + "loss": 0.5495, + "step": 3310 + }, + { + "epoch": 0.42596869386707725, + "grad_norm": 70.27935008733114, + "learning_rate": 9.147510182621867e-07, + "loss": 0.5171, + "step": 3320 + }, + { + "epoch": 0.42725173210161665, + "grad_norm": 84.80662687789946, + "learning_rate": 9.141699014900082e-07, + "loss": 0.4701, + "step": 3330 + }, + { + "epoch": 0.428534770336156, + "grad_norm": 47.54249334268104, + "learning_rate": 9.13586996688899e-07, + "loss": 0.502, + "step": 3340 + }, + { + "epoch": 0.4298178085706954, + "grad_norm": 139.57086751459553, + "learning_rate": 9.130023063753398e-07, + "loss": 0.5595, + "step": 3350 + }, + { + "epoch": 0.4311008468052348, + "grad_norm": 135.55131093569233, + "learning_rate": 9.124158330735199e-07, + "loss": 0.526, + "step": 3360 + }, + { + "epoch": 0.4323838850397742, + "grad_norm": 66.76542233715135, + "learning_rate": 9.118275793153259e-07, + "loss": 0.5215, + "step": 3370 + }, + { + "epoch": 0.43366692327431355, + "grad_norm": 64.97478188769473, + "learning_rate": 9.112375476403311e-07, + "loss": 0.4751, + "step": 3380 + }, + { + "epoch": 0.43494996150885296, + "grad_norm": 49.726519858618715, + "learning_rate": 9.106457405957839e-07, + "loss": 0.4777, + "step": 3390 + }, + { + "epoch": 0.43623299974339236, + "grad_norm": 76.57492181577163, + "learning_rate": 9.100521607365974e-07, + "loss": 0.5654, + "step": 3400 + }, + { + "epoch": 0.43751603797793176, + "grad_norm": 72.1879683017641, + "learning_rate": 9.094568106253383e-07, + "loss": 0.4831, + "step": 3410 + }, + { + "epoch": 0.4387990762124711, + "grad_norm": 74.55530226384423, + "learning_rate": 9.088596928322157e-07, + "loss": 0.543, + "step": 3420 + }, + { + "epoch": 0.4400821144470105, + "grad_norm": 58.10894345930496, + "learning_rate": 9.082608099350697e-07, + "loss": 0.5089, + "step": 3430 + }, + { + "epoch": 0.4413651526815499, + "grad_norm": 74.02933255751522, + "learning_rate": 9.076601645193611e-07, + "loss": 0.4931, + "step": 3440 + }, + { + "epoch": 0.4426481909160893, + "grad_norm": 74.73992121442521, + "learning_rate": 9.070577591781597e-07, + "loss": 0.4764, + "step": 3450 + }, + { + "epoch": 0.4439312291506287, + "grad_norm": 57.05783720743899, + "learning_rate": 9.064535965121323e-07, + "loss": 0.4939, + "step": 3460 + }, + { + "epoch": 0.44521426738516806, + "grad_norm": 77.35389213409275, + "learning_rate": 9.058476791295335e-07, + "loss": 0.5052, + "step": 3470 + }, + { + "epoch": 0.44649730561970746, + "grad_norm": 57.1572549522783, + "learning_rate": 9.052400096461927e-07, + "loss": 0.5119, + "step": 3480 + }, + { + "epoch": 0.44778034385424687, + "grad_norm": 177.94155502457323, + "learning_rate": 9.046305906855029e-07, + "loss": 0.5008, + "step": 3490 + }, + { + "epoch": 0.44906338208878627, + "grad_norm": 71.70066837286801, + "learning_rate": 9.040194248784109e-07, + "loss": 0.5068, + "step": 3500 + }, + { + "epoch": 0.4503464203233256, + "grad_norm": 35.64818403585496, + "learning_rate": 9.034065148634039e-07, + "loss": 0.507, + "step": 3510 + }, + { + "epoch": 0.451629458557865, + "grad_norm": 79.19186440306412, + "learning_rate": 9.027918632864997e-07, + "loss": 0.5153, + "step": 3520 + }, + { + "epoch": 0.4529124967924044, + "grad_norm": 94.4713265654709, + "learning_rate": 9.021754728012343e-07, + "loss": 0.4908, + "step": 3530 + }, + { + "epoch": 0.4541955350269438, + "grad_norm": 84.5267549027121, + "learning_rate": 9.015573460686509e-07, + "loss": 0.4858, + "step": 3540 + }, + { + "epoch": 0.45547857326148317, + "grad_norm": 43.69958009471101, + "learning_rate": 9.009374857572885e-07, + "loss": 0.4902, + "step": 3550 + }, + { + "epoch": 0.45676161149602257, + "grad_norm": 29.572546623039514, + "learning_rate": 9.003158945431699e-07, + "loss": 0.4843, + "step": 3560 + }, + { + "epoch": 0.45804464973056197, + "grad_norm": 44.021687992196256, + "learning_rate": 8.99692575109791e-07, + "loss": 0.4783, + "step": 3570 + }, + { + "epoch": 0.45932768796510137, + "grad_norm": 79.2049721504944, + "learning_rate": 8.990675301481079e-07, + "loss": 0.5005, + "step": 3580 + }, + { + "epoch": 0.4606107261996408, + "grad_norm": 58.916580942334406, + "learning_rate": 8.984407623565265e-07, + "loss": 0.4996, + "step": 3590 + }, + { + "epoch": 0.4618937644341801, + "grad_norm": 44.36010581119709, + "learning_rate": 8.978122744408905e-07, + "loss": 0.5105, + "step": 3600 + }, + { + "epoch": 0.4631768026687195, + "grad_norm": 113.33162497602233, + "learning_rate": 8.971820691144691e-07, + "loss": 0.4795, + "step": 3610 + }, + { + "epoch": 0.4644598409032589, + "grad_norm": 99.31900319179132, + "learning_rate": 8.965501490979466e-07, + "loss": 0.5048, + "step": 3620 + }, + { + "epoch": 0.4657428791377983, + "grad_norm": 114.24775151092814, + "learning_rate": 8.959165171194089e-07, + "loss": 0.5279, + "step": 3630 + }, + { + "epoch": 0.4670259173723377, + "grad_norm": 90.99541019458889, + "learning_rate": 8.952811759143335e-07, + "loss": 0.5059, + "step": 3640 + }, + { + "epoch": 0.4683089556068771, + "grad_norm": 74.00664176473386, + "learning_rate": 8.946441282255765e-07, + "loss": 0.4877, + "step": 3650 + }, + { + "epoch": 0.4695919938414165, + "grad_norm": 101.35951552675873, + "learning_rate": 8.940053768033608e-07, + "loss": 0.5331, + "step": 3660 + }, + { + "epoch": 0.4708750320759559, + "grad_norm": 190.19732708758383, + "learning_rate": 8.933649244052656e-07, + "loss": 0.4657, + "step": 3670 + }, + { + "epoch": 0.4721580703104952, + "grad_norm": 128.72068852413693, + "learning_rate": 8.927227737962122e-07, + "loss": 0.4944, + "step": 3680 + }, + { + "epoch": 0.47344110854503463, + "grad_norm": 29.874733630022995, + "learning_rate": 8.92078927748454e-07, + "loss": 0.4871, + "step": 3690 + }, + { + "epoch": 0.47472414677957403, + "grad_norm": 35.96475401967171, + "learning_rate": 8.914333890415638e-07, + "loss": 0.487, + "step": 3700 + }, + { + "epoch": 0.47600718501411343, + "grad_norm": 58.17312922845809, + "learning_rate": 8.907861604624219e-07, + "loss": 0.5131, + "step": 3710 + }, + { + "epoch": 0.47729022324865283, + "grad_norm": 65.42942679054853, + "learning_rate": 8.901372448052035e-07, + "loss": 0.4738, + "step": 3720 + }, + { + "epoch": 0.4785732614831922, + "grad_norm": 89.03628125547272, + "learning_rate": 8.894866448713678e-07, + "loss": 0.4821, + "step": 3730 + }, + { + "epoch": 0.4798562997177316, + "grad_norm": 87.93129795299639, + "learning_rate": 8.888343634696449e-07, + "loss": 0.4867, + "step": 3740 + }, + { + "epoch": 0.481139337952271, + "grad_norm": 71.33287547091281, + "learning_rate": 8.881804034160243e-07, + "loss": 0.5329, + "step": 3750 + }, + { + "epoch": 0.4824223761868104, + "grad_norm": 73.12237614138162, + "learning_rate": 8.875247675337421e-07, + "loss": 0.5359, + "step": 3760 + }, + { + "epoch": 0.48370541442134973, + "grad_norm": 71.1499524735341, + "learning_rate": 8.868674586532693e-07, + "loss": 0.5179, + "step": 3770 + }, + { + "epoch": 0.48498845265588914, + "grad_norm": 125.2373609736472, + "learning_rate": 8.862084796122997e-07, + "loss": 0.5019, + "step": 3780 + }, + { + "epoch": 0.48627149089042854, + "grad_norm": 29.260109352075443, + "learning_rate": 8.855478332557373e-07, + "loss": 0.5255, + "step": 3790 + }, + { + "epoch": 0.48755452912496794, + "grad_norm": 67.10167376356975, + "learning_rate": 8.848855224356838e-07, + "loss": 0.4839, + "step": 3800 + }, + { + "epoch": 0.4888375673595073, + "grad_norm": 72.91744902825012, + "learning_rate": 8.842215500114273e-07, + "loss": 0.4979, + "step": 3810 + }, + { + "epoch": 0.4901206055940467, + "grad_norm": 55.66475628255291, + "learning_rate": 8.835559188494286e-07, + "loss": 0.5102, + "step": 3820 + }, + { + "epoch": 0.4914036438285861, + "grad_norm": 39.09817622878358, + "learning_rate": 8.828886318233098e-07, + "loss": 0.5256, + "step": 3830 + }, + { + "epoch": 0.4926866820631255, + "grad_norm": 62.49197135282087, + "learning_rate": 8.822196918138416e-07, + "loss": 0.5149, + "step": 3840 + }, + { + "epoch": 0.4939697202976649, + "grad_norm": 55.56459972321514, + "learning_rate": 8.815491017089311e-07, + "loss": 0.4938, + "step": 3850 + }, + { + "epoch": 0.49525275853220424, + "grad_norm": 101.17823217681804, + "learning_rate": 8.808768644036084e-07, + "loss": 0.4833, + "step": 3860 + }, + { + "epoch": 0.49653579676674364, + "grad_norm": 67.77397826111684, + "learning_rate": 8.802029828000155e-07, + "loss": 0.5007, + "step": 3870 + }, + { + "epoch": 0.49781883500128304, + "grad_norm": 72.2115524116798, + "learning_rate": 8.795274598073926e-07, + "loss": 0.463, + "step": 3880 + }, + { + "epoch": 0.49910187323582245, + "grad_norm": 100.39912701768698, + "learning_rate": 8.788502983420664e-07, + "loss": 0.4944, + "step": 3890 + }, + { + "epoch": 0.5003849114703618, + "grad_norm": 65.62350943727532, + "learning_rate": 8.781715013274367e-07, + "loss": 0.5053, + "step": 3900 + }, + { + "epoch": 0.5016679497049013, + "grad_norm": 62.37641694865736, + "learning_rate": 8.774910716939645e-07, + "loss": 0.484, + "step": 3910 + }, + { + "epoch": 0.5029509879394406, + "grad_norm": 45.287558804241634, + "learning_rate": 8.76809012379159e-07, + "loss": 0.5023, + "step": 3920 + }, + { + "epoch": 0.5042340261739799, + "grad_norm": 57.527910860924, + "learning_rate": 8.761253263275649e-07, + "loss": 0.5098, + "step": 3930 + }, + { + "epoch": 0.5055170644085194, + "grad_norm": 95.31526371427563, + "learning_rate": 8.754400164907496e-07, + "loss": 0.506, + "step": 3940 + }, + { + "epoch": 0.5068001026430587, + "grad_norm": 35.09736890299341, + "learning_rate": 8.747530858272907e-07, + "loss": 0.441, + "step": 3950 + }, + { + "epoch": 0.5080831408775982, + "grad_norm": 122.97873484388035, + "learning_rate": 8.740645373027634e-07, + "loss": 0.4978, + "step": 3960 + }, + { + "epoch": 0.5093661791121376, + "grad_norm": 122.01225666320966, + "learning_rate": 8.733743738897272e-07, + "loss": 0.4725, + "step": 3970 + }, + { + "epoch": 0.5106492173466769, + "grad_norm": 86.88136272530403, + "learning_rate": 8.726825985677131e-07, + "loss": 0.4531, + "step": 3980 + }, + { + "epoch": 0.5119322555812164, + "grad_norm": 86.56496537187459, + "learning_rate": 8.719892143232115e-07, + "loss": 0.4692, + "step": 3990 + }, + { + "epoch": 0.5132152938157557, + "grad_norm": 94.08970135759908, + "learning_rate": 8.71294224149658e-07, + "loss": 0.4909, + "step": 4000 + }, + { + "epoch": 0.514498332050295, + "grad_norm": 47.505307134176164, + "learning_rate": 8.705976310474218e-07, + "loss": 0.4302, + "step": 4010 + }, + { + "epoch": 0.5157813702848345, + "grad_norm": 42.16239657754352, + "learning_rate": 8.69899438023792e-07, + "loss": 0.5116, + "step": 4020 + }, + { + "epoch": 0.5170644085193739, + "grad_norm": 76.44892330842734, + "learning_rate": 8.691996480929647e-07, + "loss": 0.5364, + "step": 4030 + }, + { + "epoch": 0.5183474467539133, + "grad_norm": 92.51786770061078, + "learning_rate": 8.684982642760301e-07, + "loss": 0.519, + "step": 4040 + }, + { + "epoch": 0.5196304849884527, + "grad_norm": 96.06152840002075, + "learning_rate": 8.677952896009596e-07, + "loss": 0.4866, + "step": 4050 + }, + { + "epoch": 0.520913523222992, + "grad_norm": 80.90817857169483, + "learning_rate": 8.670907271025923e-07, + "loss": 0.4883, + "step": 4060 + }, + { + "epoch": 0.5221965614575315, + "grad_norm": 110.56660906214886, + "learning_rate": 8.663845798226221e-07, + "loss": 0.4848, + "step": 4070 + }, + { + "epoch": 0.5234795996920708, + "grad_norm": 38.50504593138599, + "learning_rate": 8.656768508095852e-07, + "loss": 0.4943, + "step": 4080 + }, + { + "epoch": 0.5247626379266103, + "grad_norm": 79.09454912651827, + "learning_rate": 8.649675431188457e-07, + "loss": 0.4861, + "step": 4090 + }, + { + "epoch": 0.5260456761611496, + "grad_norm": 38.52528743231916, + "learning_rate": 8.642566598125831e-07, + "loss": 0.504, + "step": 4100 + }, + { + "epoch": 0.527328714395689, + "grad_norm": 68.16993109931833, + "learning_rate": 8.635442039597797e-07, + "loss": 0.4767, + "step": 4110 + }, + { + "epoch": 0.5286117526302284, + "grad_norm": 52.69967857278605, + "learning_rate": 8.62830178636206e-07, + "loss": 0.4588, + "step": 4120 + }, + { + "epoch": 0.5298947908647678, + "grad_norm": 44.94949990184425, + "learning_rate": 8.621145869244084e-07, + "loss": 0.4948, + "step": 4130 + }, + { + "epoch": 0.5311778290993071, + "grad_norm": 77.95582790851786, + "learning_rate": 8.613974319136957e-07, + "loss": 0.4597, + "step": 4140 + }, + { + "epoch": 0.5324608673338466, + "grad_norm": 51.88049674865014, + "learning_rate": 8.606787167001256e-07, + "loss": 0.4517, + "step": 4150 + }, + { + "epoch": 0.5337439055683859, + "grad_norm": 39.78117225084205, + "learning_rate": 8.599584443864913e-07, + "loss": 0.5096, + "step": 4160 + }, + { + "epoch": 0.5350269438029254, + "grad_norm": 37.041413648506044, + "learning_rate": 8.592366180823083e-07, + "loss": 0.522, + "step": 4170 + }, + { + "epoch": 0.5363099820374647, + "grad_norm": 36.35148535779268, + "learning_rate": 8.585132409038012e-07, + "loss": 0.498, + "step": 4180 + }, + { + "epoch": 0.5375930202720041, + "grad_norm": 57.57830353524684, + "learning_rate": 8.577883159738892e-07, + "loss": 0.4834, + "step": 4190 + }, + { + "epoch": 0.5388760585065435, + "grad_norm": 40.245789814311586, + "learning_rate": 8.57061846422174e-07, + "loss": 0.5064, + "step": 4200 + }, + { + "epoch": 0.5401590967410829, + "grad_norm": 66.77317337604748, + "learning_rate": 8.563338353849256e-07, + "loss": 0.4729, + "step": 4210 + }, + { + "epoch": 0.5414421349756223, + "grad_norm": 65.43310607970143, + "learning_rate": 8.556042860050685e-07, + "loss": 0.5083, + "step": 4220 + }, + { + "epoch": 0.5427251732101617, + "grad_norm": 62.891338511987655, + "learning_rate": 8.548732014321686e-07, + "loss": 0.4548, + "step": 4230 + }, + { + "epoch": 0.544008211444701, + "grad_norm": 36.87250105695121, + "learning_rate": 8.541405848224197e-07, + "loss": 0.4797, + "step": 4240 + }, + { + "epoch": 0.5452912496792405, + "grad_norm": 32.76964327209872, + "learning_rate": 8.534064393386289e-07, + "loss": 0.4766, + "step": 4250 + }, + { + "epoch": 0.5465742879137798, + "grad_norm": 30.522621987525568, + "learning_rate": 8.526707681502043e-07, + "loss": 0.4931, + "step": 4260 + }, + { + "epoch": 0.5478573261483192, + "grad_norm": 61.80753835325765, + "learning_rate": 8.519335744331409e-07, + "loss": 0.4815, + "step": 4270 + }, + { + "epoch": 0.5491403643828586, + "grad_norm": 48.391328138978466, + "learning_rate": 8.511948613700055e-07, + "loss": 0.5086, + "step": 4280 + }, + { + "epoch": 0.550423402617398, + "grad_norm": 50.5794370523935, + "learning_rate": 8.504546321499254e-07, + "loss": 0.5162, + "step": 4290 + }, + { + "epoch": 0.5517064408519374, + "grad_norm": 69.22696484452543, + "learning_rate": 8.497128899685728e-07, + "loss": 0.487, + "step": 4300 + }, + { + "epoch": 0.5529894790864768, + "grad_norm": 85.38732372568789, + "learning_rate": 8.489696380281515e-07, + "loss": 0.4676, + "step": 4310 + }, + { + "epoch": 0.5542725173210161, + "grad_norm": 40.43244733091365, + "learning_rate": 8.482248795373835e-07, + "loss": 0.5, + "step": 4320 + }, + { + "epoch": 0.5555555555555556, + "grad_norm": 50.65330622057186, + "learning_rate": 8.474786177114941e-07, + "loss": 0.4675, + "step": 4330 + }, + { + "epoch": 0.5568385937900949, + "grad_norm": 61.39205626332214, + "learning_rate": 8.467308557721995e-07, + "loss": 0.5369, + "step": 4340 + }, + { + "epoch": 0.5581216320246344, + "grad_norm": 47.03116027362982, + "learning_rate": 8.459815969476916e-07, + "loss": 0.5189, + "step": 4350 + }, + { + "epoch": 0.5594046702591737, + "grad_norm": 86.85914459551998, + "learning_rate": 8.452308444726248e-07, + "loss": 0.4789, + "step": 4360 + }, + { + "epoch": 0.5606877084937131, + "grad_norm": 93.02036078991378, + "learning_rate": 8.444786015881015e-07, + "loss": 0.4771, + "step": 4370 + }, + { + "epoch": 0.5619707467282525, + "grad_norm": 30.244444537946, + "learning_rate": 8.437248715416589e-07, + "loss": 0.4956, + "step": 4380 + }, + { + "epoch": 0.5632537849627919, + "grad_norm": 72.64523405604973, + "learning_rate": 8.429696575872541e-07, + "loss": 0.4995, + "step": 4390 + }, + { + "epoch": 0.5645368231973312, + "grad_norm": 40.37521481583398, + "learning_rate": 8.422129629852504e-07, + "loss": 0.479, + "step": 4400 + }, + { + "epoch": 0.5658198614318707, + "grad_norm": 31.442891728349256, + "learning_rate": 8.414547910024035e-07, + "loss": 0.4556, + "step": 4410 + }, + { + "epoch": 0.56710289966641, + "grad_norm": 46.035048963026995, + "learning_rate": 8.406951449118469e-07, + "loss": 0.4764, + "step": 4420 + }, + { + "epoch": 0.5683859379009495, + "grad_norm": 36.86925293989069, + "learning_rate": 8.399340279930784e-07, + "loss": 0.5066, + "step": 4430 + }, + { + "epoch": 0.5696689761354888, + "grad_norm": 78.49312124132145, + "learning_rate": 8.391714435319451e-07, + "loss": 0.4862, + "step": 4440 + }, + { + "epoch": 0.5709520143700282, + "grad_norm": 46.46479867431165, + "learning_rate": 8.3840739482063e-07, + "loss": 0.4724, + "step": 4450 + }, + { + "epoch": 0.5722350526045676, + "grad_norm": 61.443366961760944, + "learning_rate": 8.376418851576376e-07, + "loss": 0.4733, + "step": 4460 + }, + { + "epoch": 0.573518090839107, + "grad_norm": 37.02739373707538, + "learning_rate": 8.368749178477792e-07, + "loss": 0.4506, + "step": 4470 + }, + { + "epoch": 0.5748011290736464, + "grad_norm": 57.813438078799, + "learning_rate": 8.36106496202159e-07, + "loss": 0.491, + "step": 4480 + }, + { + "epoch": 0.5760841673081858, + "grad_norm": 47.577663349352164, + "learning_rate": 8.353366235381598e-07, + "loss": 0.5052, + "step": 4490 + }, + { + "epoch": 0.5773672055427251, + "grad_norm": 28.32265015726599, + "learning_rate": 8.34565303179429e-07, + "loss": 0.4521, + "step": 4500 + }, + { + "epoch": 0.5786502437772646, + "grad_norm": 78.080260024836, + "learning_rate": 8.337925384558635e-07, + "loss": 0.4879, + "step": 4510 + }, + { + "epoch": 0.5799332820118039, + "grad_norm": 48.4881908590533, + "learning_rate": 8.330183327035958e-07, + "loss": 0.4871, + "step": 4520 + }, + { + "epoch": 0.5812163202463433, + "grad_norm": 46.86518505271407, + "learning_rate": 8.322426892649794e-07, + "loss": 0.5015, + "step": 4530 + }, + { + "epoch": 0.5824993584808827, + "grad_norm": 75.95495267548664, + "learning_rate": 8.314656114885747e-07, + "loss": 0.4394, + "step": 4540 + }, + { + "epoch": 0.5837823967154221, + "grad_norm": 44.05121843885771, + "learning_rate": 8.306871027291347e-07, + "loss": 0.494, + "step": 4550 + }, + { + "epoch": 0.5850654349499615, + "grad_norm": 67.23619953678096, + "learning_rate": 8.299071663475892e-07, + "loss": 0.5159, + "step": 4560 + }, + { + "epoch": 0.5863484731845009, + "grad_norm": 32.31357541653001, + "learning_rate": 8.291258057110319e-07, + "loss": 0.4535, + "step": 4570 + }, + { + "epoch": 0.5876315114190402, + "grad_norm": 49.79851029559879, + "learning_rate": 8.283430241927052e-07, + "loss": 0.47, + "step": 4580 + }, + { + "epoch": 0.5889145496535797, + "grad_norm": 54.69601512345344, + "learning_rate": 8.275588251719855e-07, + "loss": 0.4965, + "step": 4590 + }, + { + "epoch": 0.590197587888119, + "grad_norm": 37.0955060283378, + "learning_rate": 8.267732120343687e-07, + "loss": 0.493, + "step": 4600 + }, + { + "epoch": 0.5914806261226585, + "grad_norm": 46.85782189104801, + "learning_rate": 8.25986188171456e-07, + "loss": 0.4926, + "step": 4610 + }, + { + "epoch": 0.5927636643571979, + "grad_norm": 81.24427461572259, + "learning_rate": 8.251977569809381e-07, + "loss": 0.4712, + "step": 4620 + }, + { + "epoch": 0.5940467025917372, + "grad_norm": 64.42125855733796, + "learning_rate": 8.244079218665823e-07, + "loss": 0.4492, + "step": 4630 + }, + { + "epoch": 0.5953297408262767, + "grad_norm": 53.858045948982365, + "learning_rate": 8.236166862382162e-07, + "loss": 0.4189, + "step": 4640 + }, + { + "epoch": 0.596612779060816, + "grad_norm": 37.95766496087885, + "learning_rate": 8.228240535117137e-07, + "loss": 0.4829, + "step": 4650 + }, + { + "epoch": 0.5978958172953553, + "grad_norm": 34.462742993750645, + "learning_rate": 8.220300271089806e-07, + "loss": 0.4718, + "step": 4660 + }, + { + "epoch": 0.5991788555298948, + "grad_norm": 45.18432282260115, + "learning_rate": 8.212346104579387e-07, + "loss": 0.4942, + "step": 4670 + }, + { + "epoch": 0.6004618937644342, + "grad_norm": 52.5852904904687, + "learning_rate": 8.20437806992512e-07, + "loss": 0.474, + "step": 4680 + }, + { + "epoch": 0.6017449319989736, + "grad_norm": 44.64027605452197, + "learning_rate": 8.196396201526119e-07, + "loss": 0.4423, + "step": 4690 + }, + { + "epoch": 0.603027970233513, + "grad_norm": 64.36552262196055, + "learning_rate": 8.188400533841216e-07, + "loss": 0.5055, + "step": 4700 + }, + { + "epoch": 0.6043110084680523, + "grad_norm": 51.5888763462088, + "learning_rate": 8.180391101388819e-07, + "loss": 0.4672, + "step": 4710 + }, + { + "epoch": 0.6055940467025918, + "grad_norm": 50.422708948103335, + "learning_rate": 8.172367938746758e-07, + "loss": 0.4815, + "step": 4720 + }, + { + "epoch": 0.6068770849371311, + "grad_norm": 64.16164800090183, + "learning_rate": 8.164331080552138e-07, + "loss": 0.4736, + "step": 4730 + }, + { + "epoch": 0.6081601231716706, + "grad_norm": 61.34335059817513, + "learning_rate": 8.156280561501194e-07, + "loss": 0.5019, + "step": 4740 + }, + { + "epoch": 0.6094431614062099, + "grad_norm": 36.20192098286336, + "learning_rate": 8.148216416349132e-07, + "loss": 0.4452, + "step": 4750 + }, + { + "epoch": 0.6107261996407493, + "grad_norm": 63.511536561270816, + "learning_rate": 8.140138679909984e-07, + "loss": 0.4929, + "step": 4760 + }, + { + "epoch": 0.6120092378752887, + "grad_norm": 42.996134750254086, + "learning_rate": 8.132047387056465e-07, + "loss": 0.4705, + "step": 4770 + }, + { + "epoch": 0.6132922761098281, + "grad_norm": 32.88132990106328, + "learning_rate": 8.123942572719799e-07, + "loss": 0.4899, + "step": 4780 + }, + { + "epoch": 0.6145753143443674, + "grad_norm": 55.2149131835054, + "learning_rate": 8.115824271889604e-07, + "loss": 0.5112, + "step": 4790 + }, + { + "epoch": 0.6158583525789069, + "grad_norm": 90.15922271775864, + "learning_rate": 8.107692519613705e-07, + "loss": 0.4599, + "step": 4800 + }, + { + "epoch": 0.6171413908134462, + "grad_norm": 40.92461097421741, + "learning_rate": 8.099547350998007e-07, + "loss": 0.4793, + "step": 4810 + }, + { + "epoch": 0.6184244290479857, + "grad_norm": 66.61804140035336, + "learning_rate": 8.091388801206333e-07, + "loss": 0.4809, + "step": 4820 + }, + { + "epoch": 0.619707467282525, + "grad_norm": 39.7908546312094, + "learning_rate": 8.083216905460274e-07, + "loss": 0.4489, + "step": 4830 + }, + { + "epoch": 0.6209905055170644, + "grad_norm": 35.10944935246261, + "learning_rate": 8.075031699039036e-07, + "loss": 0.4825, + "step": 4840 + }, + { + "epoch": 0.6222735437516038, + "grad_norm": 25.828014474419653, + "learning_rate": 8.066833217279296e-07, + "loss": 0.4701, + "step": 4850 + }, + { + "epoch": 0.6235565819861432, + "grad_norm": 25.928448134228265, + "learning_rate": 8.058621495575031e-07, + "loss": 0.4566, + "step": 4860 + }, + { + "epoch": 0.6248396202206826, + "grad_norm": 33.158255287133485, + "learning_rate": 8.050396569377388e-07, + "loss": 0.4577, + "step": 4870 + }, + { + "epoch": 0.626122658455222, + "grad_norm": 52.819671922457786, + "learning_rate": 8.042158474194511e-07, + "loss": 0.4769, + "step": 4880 + }, + { + "epoch": 0.6274056966897613, + "grad_norm": 51.03321385865141, + "learning_rate": 8.033907245591402e-07, + "loss": 0.49, + "step": 4890 + }, + { + "epoch": 0.6286887349243008, + "grad_norm": 53.438724584905835, + "learning_rate": 8.025642919189761e-07, + "loss": 0.4457, + "step": 4900 + }, + { + "epoch": 0.6299717731588401, + "grad_norm": 54.24375487190807, + "learning_rate": 8.017365530667829e-07, + "loss": 0.4722, + "step": 4910 + }, + { + "epoch": 0.6312548113933796, + "grad_norm": 51.50785241143754, + "learning_rate": 8.009075115760242e-07, + "loss": 0.4727, + "step": 4920 + }, + { + "epoch": 0.6325378496279189, + "grad_norm": 29.293462528973613, + "learning_rate": 8.00077171025787e-07, + "loss": 0.4445, + "step": 4930 + }, + { + "epoch": 0.6338208878624583, + "grad_norm": 41.74464890384665, + "learning_rate": 7.992455350007668e-07, + "loss": 0.4636, + "step": 4940 + }, + { + "epoch": 0.6351039260969977, + "grad_norm": 35.074019680042426, + "learning_rate": 7.984126070912518e-07, + "loss": 0.4384, + "step": 4950 + }, + { + "epoch": 0.6363869643315371, + "grad_norm": 40.90544171515429, + "learning_rate": 7.975783908931073e-07, + "loss": 0.4453, + "step": 4960 + }, + { + "epoch": 0.6376700025660764, + "grad_norm": 25.373707321848155, + "learning_rate": 7.967428900077602e-07, + "loss": 0.4752, + "step": 4970 + }, + { + "epoch": 0.6389530408006159, + "grad_norm": 42.190418501317275, + "learning_rate": 7.959061080421838e-07, + "loss": 0.4919, + "step": 4980 + }, + { + "epoch": 0.6402360790351552, + "grad_norm": 36.74599270216001, + "learning_rate": 7.950680486088822e-07, + "loss": 0.4717, + "step": 4990 + }, + { + "epoch": 0.6415191172696947, + "grad_norm": 51.93503906451015, + "learning_rate": 7.942287153258739e-07, + "loss": 0.4312, + "step": 5000 + }, + { + "epoch": 0.642802155504234, + "grad_norm": 47.69724820649352, + "learning_rate": 7.933881118166776e-07, + "loss": 0.509, + "step": 5010 + }, + { + "epoch": 0.6440851937387734, + "grad_norm": 47.57546071324281, + "learning_rate": 7.925462417102947e-07, + "loss": 0.4344, + "step": 5020 + }, + { + "epoch": 0.6453682319733128, + "grad_norm": 44.35029668440026, + "learning_rate": 7.917031086411957e-07, + "loss": 0.4857, + "step": 5030 + }, + { + "epoch": 0.6466512702078522, + "grad_norm": 55.92891047267402, + "learning_rate": 7.908587162493028e-07, + "loss": 0.5007, + "step": 5040 + }, + { + "epoch": 0.6479343084423916, + "grad_norm": 31.20784489670834, + "learning_rate": 7.900130681799753e-07, + "loss": 0.449, + "step": 5050 + }, + { + "epoch": 0.649217346676931, + "grad_norm": 41.09729632153388, + "learning_rate": 7.891661680839932e-07, + "loss": 0.5041, + "step": 5060 + }, + { + "epoch": 0.6505003849114703, + "grad_norm": 35.56598033783615, + "learning_rate": 7.883180196175417e-07, + "loss": 0.522, + "step": 5070 + }, + { + "epoch": 0.6517834231460098, + "grad_norm": 27.826873293501215, + "learning_rate": 7.874686264421953e-07, + "loss": 0.5315, + "step": 5080 + }, + { + "epoch": 0.6530664613805491, + "grad_norm": 40.686564629917854, + "learning_rate": 7.866179922249023e-07, + "loss": 0.4712, + "step": 5090 + }, + { + "epoch": 0.6543494996150885, + "grad_norm": 41.12866912124451, + "learning_rate": 7.857661206379687e-07, + "loss": 0.4312, + "step": 5100 + }, + { + "epoch": 0.6556325378496279, + "grad_norm": 32.75915907932429, + "learning_rate": 7.849130153590422e-07, + "loss": 0.4751, + "step": 5110 + }, + { + "epoch": 0.6569155760841673, + "grad_norm": 38.71380707131648, + "learning_rate": 7.840586800710968e-07, + "loss": 0.4637, + "step": 5120 + }, + { + "epoch": 0.6581986143187067, + "grad_norm": 40.753954506391715, + "learning_rate": 7.832031184624164e-07, + "loss": 0.4548, + "step": 5130 + }, + { + "epoch": 0.6594816525532461, + "grad_norm": 26.90377192375261, + "learning_rate": 7.823463342265791e-07, + "loss": 0.4652, + "step": 5140 + }, + { + "epoch": 0.6607646907877854, + "grad_norm": 54.75898642054998, + "learning_rate": 7.814883310624415e-07, + "loss": 0.4532, + "step": 5150 + }, + { + "epoch": 0.6620477290223249, + "grad_norm": 36.35096372129136, + "learning_rate": 7.806291126741221e-07, + "loss": 0.4704, + "step": 5160 + }, + { + "epoch": 0.6633307672568642, + "grad_norm": 48.41553162525986, + "learning_rate": 7.797686827709862e-07, + "loss": 0.5057, + "step": 5170 + }, + { + "epoch": 0.6646138054914037, + "grad_norm": 43.16372169266977, + "learning_rate": 7.789070450676286e-07, + "loss": 0.4453, + "step": 5180 + }, + { + "epoch": 0.665896843725943, + "grad_norm": 22.83321902634232, + "learning_rate": 7.780442032838592e-07, + "loss": 0.457, + "step": 5190 + }, + { + "epoch": 0.6671798819604824, + "grad_norm": 26.976282351806642, + "learning_rate": 7.771801611446858e-07, + "loss": 0.4644, + "step": 5200 + }, + { + "epoch": 0.6684629201950218, + "grad_norm": 31.195511981648295, + "learning_rate": 7.763149223802978e-07, + "loss": 0.497, + "step": 5210 + }, + { + "epoch": 0.6697459584295612, + "grad_norm": 59.07875489664759, + "learning_rate": 7.754484907260512e-07, + "loss": 0.4372, + "step": 5220 + }, + { + "epoch": 0.6710289966641005, + "grad_norm": 40.205401628251806, + "learning_rate": 7.745808699224518e-07, + "loss": 0.4491, + "step": 5230 + }, + { + "epoch": 0.67231203489864, + "grad_norm": 43.60566077446433, + "learning_rate": 7.737120637151387e-07, + "loss": 0.4705, + "step": 5240 + }, + { + "epoch": 0.6735950731331793, + "grad_norm": 31.4612181829617, + "learning_rate": 7.728420758548692e-07, + "loss": 0.4776, + "step": 5250 + }, + { + "epoch": 0.6748781113677188, + "grad_norm": 33.4515266214021, + "learning_rate": 7.719709100975011e-07, + "loss": 0.4684, + "step": 5260 + }, + { + "epoch": 0.6761611496022581, + "grad_norm": 52.98995202331823, + "learning_rate": 7.710985702039785e-07, + "loss": 0.4735, + "step": 5270 + }, + { + "epoch": 0.6774441878367975, + "grad_norm": 29.609729415564534, + "learning_rate": 7.702250599403133e-07, + "loss": 0.4729, + "step": 5280 + }, + { + "epoch": 0.678727226071337, + "grad_norm": 34.19196717005476, + "learning_rate": 7.693503830775705e-07, + "loss": 0.4221, + "step": 5290 + }, + { + "epoch": 0.6800102643058763, + "grad_norm": 57.58240771321304, + "learning_rate": 7.684745433918516e-07, + "loss": 0.5289, + "step": 5300 + }, + { + "epoch": 0.6812933025404158, + "grad_norm": 47.52125634630053, + "learning_rate": 7.675975446642784e-07, + "loss": 0.4573, + "step": 5310 + }, + { + "epoch": 0.6825763407749551, + "grad_norm": 96.15680586858748, + "learning_rate": 7.667193906809752e-07, + "loss": 0.5041, + "step": 5320 + }, + { + "epoch": 0.6838593790094945, + "grad_norm": 39.65819144336563, + "learning_rate": 7.658400852330556e-07, + "loss": 0.4698, + "step": 5330 + }, + { + "epoch": 0.6851424172440339, + "grad_norm": 28.191619026715653, + "learning_rate": 7.649596321166024e-07, + "loss": 0.4777, + "step": 5340 + }, + { + "epoch": 0.6864254554785733, + "grad_norm": 39.58755067683703, + "learning_rate": 7.640780351326543e-07, + "loss": 0.4957, + "step": 5350 + }, + { + "epoch": 0.6877084937131126, + "grad_norm": 42.72684481649047, + "learning_rate": 7.631952980871879e-07, + "loss": 0.5101, + "step": 5360 + }, + { + "epoch": 0.6889915319476521, + "grad_norm": 32.38769694536021, + "learning_rate": 7.623114247911011e-07, + "loss": 0.4703, + "step": 5370 + }, + { + "epoch": 0.6902745701821914, + "grad_norm": 35.43244166876834, + "learning_rate": 7.61426419060198e-07, + "loss": 0.4595, + "step": 5380 + }, + { + "epoch": 0.6915576084167309, + "grad_norm": 20.82564201823901, + "learning_rate": 7.605402847151708e-07, + "loss": 0.4928, + "step": 5390 + }, + { + "epoch": 0.6928406466512702, + "grad_norm": 67.5526315026267, + "learning_rate": 7.596530255815845e-07, + "loss": 0.4613, + "step": 5400 + }, + { + "epoch": 0.6941236848858096, + "grad_norm": 25.719765989732426, + "learning_rate": 7.587646454898599e-07, + "loss": 0.4538, + "step": 5410 + }, + { + "epoch": 0.695406723120349, + "grad_norm": 27.760703978970383, + "learning_rate": 7.578751482752572e-07, + "loss": 0.4349, + "step": 5420 + }, + { + "epoch": 0.6966897613548884, + "grad_norm": 24.39110497890649, + "learning_rate": 7.569845377778592e-07, + "loss": 0.4506, + "step": 5430 + }, + { + "epoch": 0.6979727995894278, + "grad_norm": 30.324128550405902, + "learning_rate": 7.560928178425549e-07, + "loss": 0.4884, + "step": 5440 + }, + { + "epoch": 0.6992558378239672, + "grad_norm": 68.4474004372979, + "learning_rate": 7.551999923190233e-07, + "loss": 0.5236, + "step": 5450 + }, + { + "epoch": 0.7005388760585065, + "grad_norm": 58.42667021369657, + "learning_rate": 7.543060650617158e-07, + "loss": 0.4776, + "step": 5460 + }, + { + "epoch": 0.701821914293046, + "grad_norm": 34.85917003827046, + "learning_rate": 7.534110399298405e-07, + "loss": 0.5072, + "step": 5470 + }, + { + "epoch": 0.7031049525275853, + "grad_norm": 39.174098304810045, + "learning_rate": 7.52514920787345e-07, + "loss": 0.4178, + "step": 5480 + }, + { + "epoch": 0.7043879907621247, + "grad_norm": 34.85007450638585, + "learning_rate": 7.516177115029001e-07, + "loss": 0.4265, + "step": 5490 + }, + { + "epoch": 0.7056710289966641, + "grad_norm": 43.70644139496384, + "learning_rate": 7.507194159498826e-07, + "loss": 0.4304, + "step": 5500 + }, + { + "epoch": 0.7069540672312035, + "grad_norm": 48.24428388371721, + "learning_rate": 7.498200380063592e-07, + "loss": 0.4475, + "step": 5510 + }, + { + "epoch": 0.7082371054657429, + "grad_norm": 43.80801850462909, + "learning_rate": 7.489195815550691e-07, + "loss": 0.5005, + "step": 5520 + }, + { + "epoch": 0.7095201437002823, + "grad_norm": 34.762289077312985, + "learning_rate": 7.48018050483408e-07, + "loss": 0.441, + "step": 5530 + }, + { + "epoch": 0.7108031819348216, + "grad_norm": 45.41847188050982, + "learning_rate": 7.471154486834104e-07, + "loss": 0.4336, + "step": 5540 + }, + { + "epoch": 0.7120862201693611, + "grad_norm": 34.66528892878694, + "learning_rate": 7.462117800517336e-07, + "loss": 0.5011, + "step": 5550 + }, + { + "epoch": 0.7133692584039004, + "grad_norm": 32.538566267876085, + "learning_rate": 7.453070484896403e-07, + "loss": 0.4711, + "step": 5560 + }, + { + "epoch": 0.7146522966384399, + "grad_norm": 22.846286303830126, + "learning_rate": 7.444012579029826e-07, + "loss": 0.4551, + "step": 5570 + }, + { + "epoch": 0.7159353348729792, + "grad_norm": 44.64140234459555, + "learning_rate": 7.434944122021836e-07, + "loss": 0.4673, + "step": 5580 + }, + { + "epoch": 0.7172183731075186, + "grad_norm": 45.7348144672235, + "learning_rate": 7.425865153022224e-07, + "loss": 0.4478, + "step": 5590 + }, + { + "epoch": 0.718501411342058, + "grad_norm": 26.06959702311973, + "learning_rate": 7.416775711226158e-07, + "loss": 0.4526, + "step": 5600 + }, + { + "epoch": 0.7197844495765974, + "grad_norm": 42.03311990753281, + "learning_rate": 7.407675835874019e-07, + "loss": 0.4518, + "step": 5610 + }, + { + "epoch": 0.7210674878111367, + "grad_norm": 22.128346271066064, + "learning_rate": 7.398565566251232e-07, + "loss": 0.4495, + "step": 5620 + }, + { + "epoch": 0.7223505260456762, + "grad_norm": 72.68428923626436, + "learning_rate": 7.389444941688092e-07, + "loss": 0.4361, + "step": 5630 + }, + { + "epoch": 0.7236335642802155, + "grad_norm": 38.18162948667469, + "learning_rate": 7.380314001559605e-07, + "loss": 0.4713, + "step": 5640 + }, + { + "epoch": 0.724916602514755, + "grad_norm": 33.49574142588956, + "learning_rate": 7.371172785285307e-07, + "loss": 0.4898, + "step": 5650 + }, + { + "epoch": 0.7261996407492943, + "grad_norm": 30.083767902776017, + "learning_rate": 7.362021332329091e-07, + "loss": 0.4714, + "step": 5660 + }, + { + "epoch": 0.7274826789838337, + "grad_norm": 62.066984747563055, + "learning_rate": 7.352859682199057e-07, + "loss": 0.4906, + "step": 5670 + }, + { + "epoch": 0.7287657172183731, + "grad_norm": 28.84357814288708, + "learning_rate": 7.343687874447313e-07, + "loss": 0.4546, + "step": 5680 + }, + { + "epoch": 0.7300487554529125, + "grad_norm": 32.48272999856781, + "learning_rate": 7.334505948669829e-07, + "loss": 0.4579, + "step": 5690 + }, + { + "epoch": 0.7313317936874519, + "grad_norm": 32.62530234443397, + "learning_rate": 7.325313944506253e-07, + "loss": 0.4747, + "step": 5700 + }, + { + "epoch": 0.7326148319219913, + "grad_norm": 19.28433081386963, + "learning_rate": 7.316111901639739e-07, + "loss": 0.4386, + "step": 5710 + }, + { + "epoch": 0.7338978701565306, + "grad_norm": 53.63398937712851, + "learning_rate": 7.306899859796788e-07, + "loss": 0.4455, + "step": 5720 + }, + { + "epoch": 0.7351809083910701, + "grad_norm": 51.9199977891092, + "learning_rate": 7.297677858747057e-07, + "loss": 0.4497, + "step": 5730 + }, + { + "epoch": 0.7364639466256094, + "grad_norm": 38.80034056113416, + "learning_rate": 7.28844593830321e-07, + "loss": 0.4726, + "step": 5740 + }, + { + "epoch": 0.7377469848601488, + "grad_norm": 52.44029218909403, + "learning_rate": 7.279204138320724e-07, + "loss": 0.4704, + "step": 5750 + }, + { + "epoch": 0.7390300230946882, + "grad_norm": 31.343024377001566, + "learning_rate": 7.269952498697734e-07, + "loss": 0.4776, + "step": 5760 + }, + { + "epoch": 0.7403130613292276, + "grad_norm": 20.154448729971797, + "learning_rate": 7.26069105937485e-07, + "loss": 0.4411, + "step": 5770 + }, + { + "epoch": 0.741596099563767, + "grad_norm": 35.93586756299603, + "learning_rate": 7.251419860334993e-07, + "loss": 0.4646, + "step": 5780 + }, + { + "epoch": 0.7428791377983064, + "grad_norm": 30.22295739019751, + "learning_rate": 7.242138941603215e-07, + "loss": 0.4381, + "step": 5790 + }, + { + "epoch": 0.7441621760328457, + "grad_norm": 32.161932975702655, + "learning_rate": 7.23284834324653e-07, + "loss": 0.433, + "step": 5800 + }, + { + "epoch": 0.7454452142673852, + "grad_norm": 26.279385641044612, + "learning_rate": 7.223548105373737e-07, + "loss": 0.4172, + "step": 5810 + }, + { + "epoch": 0.7467282525019245, + "grad_norm": 34.629287065815966, + "learning_rate": 7.214238268135257e-07, + "loss": 0.4228, + "step": 5820 + }, + { + "epoch": 0.748011290736464, + "grad_norm": 40.38836002803532, + "learning_rate": 7.204918871722949e-07, + "loss": 0.4795, + "step": 5830 + }, + { + "epoch": 0.7492943289710033, + "grad_norm": 29.245412979265474, + "learning_rate": 7.19558995636994e-07, + "loss": 0.4836, + "step": 5840 + }, + { + "epoch": 0.7505773672055427, + "grad_norm": 39.743707325849364, + "learning_rate": 7.186251562350448e-07, + "loss": 0.4586, + "step": 5850 + }, + { + "epoch": 0.7518604054400821, + "grad_norm": 31.06321910125638, + "learning_rate": 7.176903729979621e-07, + "loss": 0.4877, + "step": 5860 + }, + { + "epoch": 0.7531434436746215, + "grad_norm": 28.10706275645348, + "learning_rate": 7.167546499613346e-07, + "loss": 0.4447, + "step": 5870 + }, + { + "epoch": 0.7544264819091608, + "grad_norm": 33.62210476923362, + "learning_rate": 7.158179911648086e-07, + "loss": 0.4932, + "step": 5880 + }, + { + "epoch": 0.7557095201437003, + "grad_norm": 35.50416746199111, + "learning_rate": 7.148804006520701e-07, + "loss": 0.4641, + "step": 5890 + }, + { + "epoch": 0.7569925583782396, + "grad_norm": 25.75895616653588, + "learning_rate": 7.139418824708271e-07, + "loss": 0.4557, + "step": 5900 + }, + { + "epoch": 0.7582755966127791, + "grad_norm": 37.147405982799675, + "learning_rate": 7.130024406727932e-07, + "loss": 0.4478, + "step": 5910 + }, + { + "epoch": 0.7595586348473184, + "grad_norm": 39.13144250776751, + "learning_rate": 7.120620793136688e-07, + "loss": 0.4396, + "step": 5920 + }, + { + "epoch": 0.7608416730818578, + "grad_norm": 64.87673793985412, + "learning_rate": 7.111208024531243e-07, + "loss": 0.4457, + "step": 5930 + }, + { + "epoch": 0.7621247113163973, + "grad_norm": 28.176402242615463, + "learning_rate": 7.101786141547828e-07, + "loss": 0.4795, + "step": 5940 + }, + { + "epoch": 0.7634077495509366, + "grad_norm": 45.237586156623216, + "learning_rate": 7.092355184862015e-07, + "loss": 0.4381, + "step": 5950 + }, + { + "epoch": 0.7646907877854761, + "grad_norm": 40.985218178198714, + "learning_rate": 7.082915195188556e-07, + "loss": 0.4398, + "step": 5960 + }, + { + "epoch": 0.7659738260200154, + "grad_norm": 27.075329807923193, + "learning_rate": 7.073466213281195e-07, + "loss": 0.4538, + "step": 5970 + }, + { + "epoch": 0.7672568642545547, + "grad_norm": 38.5148557172641, + "learning_rate": 7.064008279932498e-07, + "loss": 0.4608, + "step": 5980 + }, + { + "epoch": 0.7685399024890942, + "grad_norm": 34.995351031534966, + "learning_rate": 7.054541435973675e-07, + "loss": 0.4524, + "step": 5990 + }, + { + "epoch": 0.7698229407236336, + "grad_norm": 36.81350336528763, + "learning_rate": 7.045065722274406e-07, + "loss": 0.456, + "step": 6000 + }, + { + "epoch": 0.7711059789581729, + "grad_norm": 27.485135114110467, + "learning_rate": 7.03558117974266e-07, + "loss": 0.4638, + "step": 6010 + }, + { + "epoch": 0.7723890171927124, + "grad_norm": 48.528310915401036, + "learning_rate": 7.026087849324526e-07, + "loss": 0.4136, + "step": 6020 + }, + { + "epoch": 0.7736720554272517, + "grad_norm": 38.63357954422182, + "learning_rate": 7.016585772004026e-07, + "loss": 0.4548, + "step": 6030 + }, + { + "epoch": 0.7749550936617912, + "grad_norm": 26.970354243675786, + "learning_rate": 7.007074988802946e-07, + "loss": 0.4945, + "step": 6040 + }, + { + "epoch": 0.7762381318963305, + "grad_norm": 23.282122498045965, + "learning_rate": 6.997555540780658e-07, + "loss": 0.4006, + "step": 6050 + }, + { + "epoch": 0.7775211701308699, + "grad_norm": 43.17729016150936, + "learning_rate": 6.988027469033941e-07, + "loss": 0.4451, + "step": 6060 + }, + { + "epoch": 0.7788042083654093, + "grad_norm": 25.836461672899393, + "learning_rate": 6.978490814696801e-07, + "loss": 0.4411, + "step": 6070 + }, + { + "epoch": 0.7800872465999487, + "grad_norm": 47.346151301500576, + "learning_rate": 6.968945618940298e-07, + "loss": 0.4267, + "step": 6080 + }, + { + "epoch": 0.7813702848344881, + "grad_norm": 39.79870377915692, + "learning_rate": 6.959391922972367e-07, + "loss": 0.4489, + "step": 6090 + }, + { + "epoch": 0.7826533230690275, + "grad_norm": 44.47821550845711, + "learning_rate": 6.94982976803764e-07, + "loss": 0.4694, + "step": 6100 + }, + { + "epoch": 0.7839363613035668, + "grad_norm": 39.91157742088441, + "learning_rate": 6.940259195417264e-07, + "loss": 0.4654, + "step": 6110 + }, + { + "epoch": 0.7852193995381063, + "grad_norm": 62.74727717417218, + "learning_rate": 6.93068024642873e-07, + "loss": 0.4512, + "step": 6120 + }, + { + "epoch": 0.7865024377726456, + "grad_norm": 49.582094697631845, + "learning_rate": 6.921092962425694e-07, + "loss": 0.4686, + "step": 6130 + }, + { + "epoch": 0.787785476007185, + "grad_norm": 34.90688719798809, + "learning_rate": 6.911497384797785e-07, + "loss": 0.4904, + "step": 6140 + }, + { + "epoch": 0.7890685142417244, + "grad_norm": 40.497367155064005, + "learning_rate": 6.901893554970444e-07, + "loss": 0.4961, + "step": 6150 + }, + { + "epoch": 0.7903515524762638, + "grad_norm": 23.07554553690874, + "learning_rate": 6.892281514404742e-07, + "loss": 0.4404, + "step": 6160 + }, + { + "epoch": 0.7916345907108032, + "grad_norm": 26.504414505115324, + "learning_rate": 6.882661304597186e-07, + "loss": 0.4683, + "step": 6170 + }, + { + "epoch": 0.7929176289453426, + "grad_norm": 34.50630834328466, + "learning_rate": 6.87303296707956e-07, + "loss": 0.4545, + "step": 6180 + }, + { + "epoch": 0.7942006671798819, + "grad_norm": 38.008010538981395, + "learning_rate": 6.863396543418731e-07, + "loss": 0.4882, + "step": 6190 + }, + { + "epoch": 0.7954837054144214, + "grad_norm": 33.22395152413681, + "learning_rate": 6.853752075216479e-07, + "loss": 0.5008, + "step": 6200 + }, + { + "epoch": 0.7967667436489607, + "grad_norm": 32.58414597018976, + "learning_rate": 6.84409960410931e-07, + "loss": 0.4869, + "step": 6210 + }, + { + "epoch": 0.7980497818835002, + "grad_norm": 21.40032076277771, + "learning_rate": 6.83443917176828e-07, + "loss": 0.4799, + "step": 6220 + }, + { + "epoch": 0.7993328201180395, + "grad_norm": 29.080859542509973, + "learning_rate": 6.82477081989882e-07, + "loss": 0.4709, + "step": 6230 + }, + { + "epoch": 0.8006158583525789, + "grad_norm": 77.54735009605663, + "learning_rate": 6.815094590240541e-07, + "loss": 0.4651, + "step": 6240 + }, + { + "epoch": 0.8018988965871183, + "grad_norm": 25.382184117906387, + "learning_rate": 6.805410524567073e-07, + "loss": 0.4515, + "step": 6250 + }, + { + "epoch": 0.8031819348216577, + "grad_norm": 30.267262798539388, + "learning_rate": 6.795718664685868e-07, + "loss": 0.4945, + "step": 6260 + }, + { + "epoch": 0.8044649730561971, + "grad_norm": 83.24060405822647, + "learning_rate": 6.786019052438033e-07, + "loss": 0.4259, + "step": 6270 + }, + { + "epoch": 0.8057480112907365, + "grad_norm": 35.19245238465059, + "learning_rate": 6.776311729698138e-07, + "loss": 0.486, + "step": 6280 + }, + { + "epoch": 0.8070310495252758, + "grad_norm": 33.5543684153452, + "learning_rate": 6.766596738374043e-07, + "loss": 0.4513, + "step": 6290 + }, + { + "epoch": 0.8083140877598153, + "grad_norm": 39.3040905765322, + "learning_rate": 6.756874120406714e-07, + "loss": 0.4766, + "step": 6300 + }, + { + "epoch": 0.8095971259943546, + "grad_norm": 51.063107043561885, + "learning_rate": 6.74714391777004e-07, + "loss": 0.4078, + "step": 6310 + }, + { + "epoch": 0.810880164228894, + "grad_norm": 45.51310262937702, + "learning_rate": 6.737406172470657e-07, + "loss": 0.4838, + "step": 6320 + }, + { + "epoch": 0.8121632024634334, + "grad_norm": 18.3701090342424, + "learning_rate": 6.727660926547762e-07, + "loss": 0.4174, + "step": 6330 + }, + { + "epoch": 0.8134462406979728, + "grad_norm": 28.833464100805774, + "learning_rate": 6.717908222072934e-07, + "loss": 0.4591, + "step": 6340 + }, + { + "epoch": 0.8147292789325122, + "grad_norm": 44.10187230132924, + "learning_rate": 6.708148101149954e-07, + "loss": 0.4571, + "step": 6350 + }, + { + "epoch": 0.8160123171670516, + "grad_norm": 31.70030927185124, + "learning_rate": 6.698380605914613e-07, + "loss": 0.4617, + "step": 6360 + }, + { + "epoch": 0.8172953554015909, + "grad_norm": 46.09222201987856, + "learning_rate": 6.688605778534548e-07, + "loss": 0.4414, + "step": 6370 + }, + { + "epoch": 0.8185783936361304, + "grad_norm": 59.19647799494779, + "learning_rate": 6.678823661209042e-07, + "loss": 0.4426, + "step": 6380 + }, + { + "epoch": 0.8198614318706697, + "grad_norm": 35.37717874547178, + "learning_rate": 6.669034296168854e-07, + "loss": 0.503, + "step": 6390 + }, + { + "epoch": 0.8211444701052092, + "grad_norm": 41.05886425510951, + "learning_rate": 6.659237725676032e-07, + "loss": 0.4383, + "step": 6400 + }, + { + "epoch": 0.8224275083397485, + "grad_norm": 30.60790396778759, + "learning_rate": 6.649433992023728e-07, + "loss": 0.4496, + "step": 6410 + }, + { + "epoch": 0.8237105465742879, + "grad_norm": 66.3787896790375, + "learning_rate": 6.639623137536022e-07, + "loss": 0.4169, + "step": 6420 + }, + { + "epoch": 0.8249935848088273, + "grad_norm": 31.622035521934535, + "learning_rate": 6.629805204567733e-07, + "loss": 0.4508, + "step": 6430 + }, + { + "epoch": 0.8262766230433667, + "grad_norm": 24.206851517000135, + "learning_rate": 6.619980235504241e-07, + "loss": 0.439, + "step": 6440 + }, + { + "epoch": 0.827559661277906, + "grad_norm": 46.578063519561404, + "learning_rate": 6.6101482727613e-07, + "loss": 0.4808, + "step": 6450 + }, + { + "epoch": 0.8288426995124455, + "grad_norm": 33.81561112885494, + "learning_rate": 6.600309358784857e-07, + "loss": 0.4686, + "step": 6460 + }, + { + "epoch": 0.8301257377469848, + "grad_norm": 21.74797977723323, + "learning_rate": 6.590463536050871e-07, + "loss": 0.4132, + "step": 6470 + }, + { + "epoch": 0.8314087759815243, + "grad_norm": 24.740495757200947, + "learning_rate": 6.580610847065123e-07, + "loss": 0.4275, + "step": 6480 + }, + { + "epoch": 0.8326918142160636, + "grad_norm": 30.53205101946798, + "learning_rate": 6.570751334363036e-07, + "loss": 0.485, + "step": 6490 + }, + { + "epoch": 0.833974852450603, + "grad_norm": 26.614993429357714, + "learning_rate": 6.560885040509498e-07, + "loss": 0.4321, + "step": 6500 + }, + { + "epoch": 0.8352578906851424, + "grad_norm": 32.268365913500006, + "learning_rate": 6.551012008098667e-07, + "loss": 0.4342, + "step": 6510 + }, + { + "epoch": 0.8365409289196818, + "grad_norm": 39.5302368196487, + "learning_rate": 6.541132279753796e-07, + "loss": 0.4484, + "step": 6520 + }, + { + "epoch": 0.8378239671542212, + "grad_norm": 27.781255368724512, + "learning_rate": 6.531245898127041e-07, + "loss": 0.4436, + "step": 6530 + }, + { + "epoch": 0.8391070053887606, + "grad_norm": 34.250187520941154, + "learning_rate": 6.521352905899282e-07, + "loss": 0.4558, + "step": 6540 + }, + { + "epoch": 0.8403900436232999, + "grad_norm": 22.293818007132735, + "learning_rate": 6.511453345779941e-07, + "loss": 0.4274, + "step": 6550 + }, + { + "epoch": 0.8416730818578394, + "grad_norm": 37.68489685438498, + "learning_rate": 6.501547260506793e-07, + "loss": 0.5056, + "step": 6560 + }, + { + "epoch": 0.8429561200923787, + "grad_norm": 42.78964577207527, + "learning_rate": 6.49163469284578e-07, + "loss": 0.4662, + "step": 6570 + }, + { + "epoch": 0.8442391583269181, + "grad_norm": 18.685919621437602, + "learning_rate": 6.481715685590836e-07, + "loss": 0.4566, + "step": 6580 + }, + { + "epoch": 0.8455221965614576, + "grad_norm": 30.15561131293439, + "learning_rate": 6.471790281563687e-07, + "loss": 0.4084, + "step": 6590 + }, + { + "epoch": 0.8468052347959969, + "grad_norm": 26.842057436717152, + "learning_rate": 6.461858523613684e-07, + "loss": 0.4595, + "step": 6600 + }, + { + "epoch": 0.8480882730305364, + "grad_norm": 46.89129288112349, + "learning_rate": 6.451920454617599e-07, + "loss": 0.4739, + "step": 6610 + }, + { + "epoch": 0.8493713112650757, + "grad_norm": 43.96101248415759, + "learning_rate": 6.44197611747946e-07, + "loss": 0.4099, + "step": 6620 + }, + { + "epoch": 0.850654349499615, + "grad_norm": 52.00116065893711, + "learning_rate": 6.432025555130347e-07, + "loss": 0.4165, + "step": 6630 + }, + { + "epoch": 0.8519373877341545, + "grad_norm": 19.224864259677314, + "learning_rate": 6.42206881052822e-07, + "loss": 0.4475, + "step": 6640 + }, + { + "epoch": 0.8532204259686939, + "grad_norm": 33.941647902468134, + "learning_rate": 6.412105926657723e-07, + "loss": 0.4772, + "step": 6650 + }, + { + "epoch": 0.8545034642032333, + "grad_norm": 29.415890824302895, + "learning_rate": 6.402136946530014e-07, + "loss": 0.4461, + "step": 6660 + }, + { + "epoch": 0.8557865024377727, + "grad_norm": 27.580081345123634, + "learning_rate": 6.392161913182559e-07, + "loss": 0.449, + "step": 6670 + }, + { + "epoch": 0.857069540672312, + "grad_norm": 33.036266048866565, + "learning_rate": 6.382180869678961e-07, + "loss": 0.466, + "step": 6680 + }, + { + "epoch": 0.8583525789068515, + "grad_norm": 19.92154035829535, + "learning_rate": 6.372193859108775e-07, + "loss": 0.4397, + "step": 6690 + }, + { + "epoch": 0.8596356171413908, + "grad_norm": 45.326003702640705, + "learning_rate": 6.362200924587305e-07, + "loss": 0.4445, + "step": 6700 + }, + { + "epoch": 0.8609186553759302, + "grad_norm": 45.69949308122071, + "learning_rate": 6.352202109255438e-07, + "loss": 0.4589, + "step": 6710 + }, + { + "epoch": 0.8622016936104696, + "grad_norm": 21.017370975023802, + "learning_rate": 6.342197456279448e-07, + "loss": 0.4992, + "step": 6720 + }, + { + "epoch": 0.863484731845009, + "grad_norm": 23.486283006568794, + "learning_rate": 6.332187008850808e-07, + "loss": 0.46, + "step": 6730 + }, + { + "epoch": 0.8647677700795484, + "grad_norm": 20.780839580882127, + "learning_rate": 6.322170810186011e-07, + "loss": 0.4232, + "step": 6740 + }, + { + "epoch": 0.8660508083140878, + "grad_norm": 26.184442395087892, + "learning_rate": 6.312148903526374e-07, + "loss": 0.4457, + "step": 6750 + }, + { + "epoch": 0.8673338465486271, + "grad_norm": 32.95586014980141, + "learning_rate": 6.302121332137863e-07, + "loss": 0.4553, + "step": 6760 + }, + { + "epoch": 0.8686168847831666, + "grad_norm": 33.19149556103387, + "learning_rate": 6.292088139310889e-07, + "loss": 0.4519, + "step": 6770 + }, + { + "epoch": 0.8698999230177059, + "grad_norm": 27.686417621307868, + "learning_rate": 6.282049368360142e-07, + "loss": 0.4495, + "step": 6780 + }, + { + "epoch": 0.8711829612522454, + "grad_norm": 18.830663433718435, + "learning_rate": 6.272005062624387e-07, + "loss": 0.4248, + "step": 6790 + }, + { + "epoch": 0.8724659994867847, + "grad_norm": 48.70976943086503, + "learning_rate": 6.261955265466286e-07, + "loss": 0.4334, + "step": 6800 + }, + { + "epoch": 0.8737490377213241, + "grad_norm": 47.5810729951281, + "learning_rate": 6.251900020272207e-07, + "loss": 0.4555, + "step": 6810 + }, + { + "epoch": 0.8750320759558635, + "grad_norm": 18.05668929934205, + "learning_rate": 6.24183937045204e-07, + "loss": 0.3789, + "step": 6820 + }, + { + "epoch": 0.8763151141904029, + "grad_norm": 42.44521735939729, + "learning_rate": 6.231773359439006e-07, + "loss": 0.475, + "step": 6830 + }, + { + "epoch": 0.8775981524249422, + "grad_norm": 28.894307746660274, + "learning_rate": 6.22170203068947e-07, + "loss": 0.4633, + "step": 6840 + }, + { + "epoch": 0.8788811906594817, + "grad_norm": 43.399587110100406, + "learning_rate": 6.211625427682758e-07, + "loss": 0.4397, + "step": 6850 + }, + { + "epoch": 0.880164228894021, + "grad_norm": 18.55424078507953, + "learning_rate": 6.201543593920959e-07, + "loss": 0.4618, + "step": 6860 + }, + { + "epoch": 0.8814472671285605, + "grad_norm": 33.00065683391829, + "learning_rate": 6.191456572928753e-07, + "loss": 0.4374, + "step": 6870 + }, + { + "epoch": 0.8827303053630998, + "grad_norm": 23.63085524168053, + "learning_rate": 6.181364408253207e-07, + "loss": 0.393, + "step": 6880 + }, + { + "epoch": 0.8840133435976392, + "grad_norm": 36.89912992181461, + "learning_rate": 6.171267143463595e-07, + "loss": 0.4371, + "step": 6890 + }, + { + "epoch": 0.8852963818321786, + "grad_norm": 37.39933445503004, + "learning_rate": 6.161164822151213e-07, + "loss": 0.4316, + "step": 6900 + }, + { + "epoch": 0.886579420066718, + "grad_norm": 23.6651969708587, + "learning_rate": 6.151057487929181e-07, + "loss": 0.4577, + "step": 6910 + }, + { + "epoch": 0.8878624583012574, + "grad_norm": 44.34959350922, + "learning_rate": 6.140945184432265e-07, + "loss": 0.4549, + "step": 6920 + }, + { + "epoch": 0.8891454965357968, + "grad_norm": 31.802028145909095, + "learning_rate": 6.130827955316682e-07, + "loss": 0.4589, + "step": 6930 + }, + { + "epoch": 0.8904285347703361, + "grad_norm": 34.125781467079484, + "learning_rate": 6.120705844259912e-07, + "loss": 0.4605, + "step": 6940 + }, + { + "epoch": 0.8917115730048756, + "grad_norm": 85.67825438780527, + "learning_rate": 6.110578894960516e-07, + "loss": 0.5264, + "step": 6950 + }, + { + "epoch": 0.8929946112394149, + "grad_norm": 21.689679284713595, + "learning_rate": 6.100447151137939e-07, + "loss": 0.4586, + "step": 6960 + }, + { + "epoch": 0.8942776494739543, + "grad_norm": 31.517683627953975, + "learning_rate": 6.090310656532321e-07, + "loss": 0.4272, + "step": 6970 + }, + { + "epoch": 0.8955606877084937, + "grad_norm": 20.343226592099693, + "learning_rate": 6.08016945490432e-07, + "loss": 0.429, + "step": 6980 + }, + { + "epoch": 0.8968437259430331, + "grad_norm": 28.53655448692546, + "learning_rate": 6.070023590034906e-07, + "loss": 0.4656, + "step": 6990 + }, + { + "epoch": 0.8981267641775725, + "grad_norm": 40.95926031963293, + "learning_rate": 6.05987310572519e-07, + "loss": 0.4327, + "step": 7000 + }, + { + "epoch": 0.8994098024121119, + "grad_norm": 45.59264864990491, + "learning_rate": 6.04971804579622e-07, + "loss": 0.4981, + "step": 7010 + }, + { + "epoch": 0.9006928406466512, + "grad_norm": 20.198585508587936, + "learning_rate": 6.039558454088795e-07, + "loss": 0.4348, + "step": 7020 + }, + { + "epoch": 0.9019758788811907, + "grad_norm": 37.55980145110452, + "learning_rate": 6.029394374463289e-07, + "loss": 0.4579, + "step": 7030 + }, + { + "epoch": 0.90325891711573, + "grad_norm": 27.175443524228466, + "learning_rate": 6.019225850799439e-07, + "loss": 0.4631, + "step": 7040 + }, + { + "epoch": 0.9045419553502695, + "grad_norm": 26.054907559980045, + "learning_rate": 6.009052926996172e-07, + "loss": 0.4461, + "step": 7050 + }, + { + "epoch": 0.9058249935848088, + "grad_norm": 27.181106826695874, + "learning_rate": 5.998875646971413e-07, + "loss": 0.4465, + "step": 7060 + }, + { + "epoch": 0.9071080318193482, + "grad_norm": 31.13039507272809, + "learning_rate": 5.988694054661893e-07, + "loss": 0.4194, + "step": 7070 + }, + { + "epoch": 0.9083910700538876, + "grad_norm": 15.698030078091408, + "learning_rate": 5.978508194022958e-07, + "loss": 0.4504, + "step": 7080 + }, + { + "epoch": 0.909674108288427, + "grad_norm": 32.32099478003905, + "learning_rate": 5.96831810902838e-07, + "loss": 0.4198, + "step": 7090 + }, + { + "epoch": 0.9109571465229663, + "grad_norm": 31.321907871544507, + "learning_rate": 5.958123843670173e-07, + "loss": 0.4353, + "step": 7100 + }, + { + "epoch": 0.9122401847575058, + "grad_norm": 33.32410190096123, + "learning_rate": 5.947925441958393e-07, + "loss": 0.4152, + "step": 7110 + }, + { + "epoch": 0.9135232229920451, + "grad_norm": 19.20467040362234, + "learning_rate": 5.937722947920955e-07, + "loss": 0.4463, + "step": 7120 + }, + { + "epoch": 0.9148062612265846, + "grad_norm": 34.509133631092446, + "learning_rate": 5.927516405603441e-07, + "loss": 0.4102, + "step": 7130 + }, + { + "epoch": 0.9160892994611239, + "grad_norm": 28.11864851795215, + "learning_rate": 5.917305859068911e-07, + "loss": 0.4819, + "step": 7140 + }, + { + "epoch": 0.9173723376956633, + "grad_norm": 22.5002126723046, + "learning_rate": 5.907091352397713e-07, + "loss": 0.4507, + "step": 7150 + }, + { + "epoch": 0.9186553759302027, + "grad_norm": 22.343249244111423, + "learning_rate": 5.896872929687286e-07, + "loss": 0.451, + "step": 7160 + }, + { + "epoch": 0.9199384141647421, + "grad_norm": 42.74138675145006, + "learning_rate": 5.886650635051983e-07, + "loss": 0.4709, + "step": 7170 + }, + { + "epoch": 0.9212214523992815, + "grad_norm": 28.821709263189213, + "learning_rate": 5.876424512622863e-07, + "loss": 0.4657, + "step": 7180 + }, + { + "epoch": 0.9225044906338209, + "grad_norm": 23.873867369232162, + "learning_rate": 5.866194606547519e-07, + "loss": 0.4723, + "step": 7190 + }, + { + "epoch": 0.9237875288683602, + "grad_norm": 21.1757621425987, + "learning_rate": 5.855960960989876e-07, + "loss": 0.4563, + "step": 7200 + }, + { + "epoch": 0.9250705671028997, + "grad_norm": 36.908121644860124, + "learning_rate": 5.845723620129996e-07, + "loss": 0.3843, + "step": 7210 + }, + { + "epoch": 0.926353605337439, + "grad_norm": 22.630167665485146, + "learning_rate": 5.835482628163908e-07, + "loss": 0.4251, + "step": 7220 + }, + { + "epoch": 0.9276366435719784, + "grad_norm": 31.507896542379424, + "learning_rate": 5.825238029303387e-07, + "loss": 0.4676, + "step": 7230 + }, + { + "epoch": 0.9289196818065178, + "grad_norm": 21.373706562518684, + "learning_rate": 5.814989867775795e-07, + "loss": 0.4318, + "step": 7240 + }, + { + "epoch": 0.9302027200410572, + "grad_norm": 29.381571300923696, + "learning_rate": 5.804738187823863e-07, + "loss": 0.4162, + "step": 7250 + }, + { + "epoch": 0.9314857582755967, + "grad_norm": 26.343752608053503, + "learning_rate": 5.794483033705516e-07, + "loss": 0.4567, + "step": 7260 + }, + { + "epoch": 0.932768796510136, + "grad_norm": 28.351719048844895, + "learning_rate": 5.784224449693679e-07, + "loss": 0.4396, + "step": 7270 + }, + { + "epoch": 0.9340518347446753, + "grad_norm": 24.26762015858682, + "learning_rate": 5.77396248007608e-07, + "loss": 0.4167, + "step": 7280 + }, + { + "epoch": 0.9353348729792148, + "grad_norm": 35.81397989743118, + "learning_rate": 5.763697169155069e-07, + "loss": 0.4427, + "step": 7290 + }, + { + "epoch": 0.9366179112137542, + "grad_norm": 22.1396581738949, + "learning_rate": 5.753428561247415e-07, + "loss": 0.4803, + "step": 7300 + }, + { + "epoch": 0.9379009494482936, + "grad_norm": 24.389462570313153, + "learning_rate": 5.743156700684125e-07, + "loss": 0.4039, + "step": 7310 + }, + { + "epoch": 0.939183987682833, + "grad_norm": 26.7051711507577, + "learning_rate": 5.732881631810244e-07, + "loss": 0.4112, + "step": 7320 + }, + { + "epoch": 0.9404670259173723, + "grad_norm": 41.087633048905786, + "learning_rate": 5.722603398984671e-07, + "loss": 0.452, + "step": 7330 + }, + { + "epoch": 0.9417500641519118, + "grad_norm": 24.012345328230133, + "learning_rate": 5.712322046579964e-07, + "loss": 0.4129, + "step": 7340 + }, + { + "epoch": 0.9430331023864511, + "grad_norm": 25.578052282135417, + "learning_rate": 5.702037618982147e-07, + "loss": 0.4465, + "step": 7350 + }, + { + "epoch": 0.9443161406209905, + "grad_norm": 32.002009065688924, + "learning_rate": 5.691750160590522e-07, + "loss": 0.4701, + "step": 7360 + }, + { + "epoch": 0.9455991788555299, + "grad_norm": 26.287569530320525, + "learning_rate": 5.681459715817472e-07, + "loss": 0.4046, + "step": 7370 + }, + { + "epoch": 0.9468822170900693, + "grad_norm": 18.685791241314956, + "learning_rate": 5.671166329088277e-07, + "loss": 0.4315, + "step": 7380 + }, + { + "epoch": 0.9481652553246087, + "grad_norm": 31.65810599965732, + "learning_rate": 5.660870044840916e-07, + "loss": 0.4535, + "step": 7390 + }, + { + "epoch": 0.9494482935591481, + "grad_norm": 27.233180023210643, + "learning_rate": 5.650570907525875e-07, + "loss": 0.4575, + "step": 7400 + }, + { + "epoch": 0.9507313317936874, + "grad_norm": 22.22874502628049, + "learning_rate": 5.640268961605958e-07, + "loss": 0.4647, + "step": 7410 + }, + { + "epoch": 0.9520143700282269, + "grad_norm": 17.430580761117465, + "learning_rate": 5.629964251556099e-07, + "loss": 0.4303, + "step": 7420 + }, + { + "epoch": 0.9532974082627662, + "grad_norm": 23.88761868322909, + "learning_rate": 5.619656821863157e-07, + "loss": 0.4173, + "step": 7430 + }, + { + "epoch": 0.9545804464973057, + "grad_norm": 19.6259283264678, + "learning_rate": 5.609346717025737e-07, + "loss": 0.4611, + "step": 7440 + }, + { + "epoch": 0.955863484731845, + "grad_norm": 33.75827248561883, + "learning_rate": 5.599033981553993e-07, + "loss": 0.4753, + "step": 7450 + }, + { + "epoch": 0.9571465229663844, + "grad_norm": 34.56116462938692, + "learning_rate": 5.588718659969437e-07, + "loss": 0.4407, + "step": 7460 + }, + { + "epoch": 0.9584295612009238, + "grad_norm": 29.078994242135263, + "learning_rate": 5.578400796804739e-07, + "loss": 0.4277, + "step": 7470 + }, + { + "epoch": 0.9597125994354632, + "grad_norm": 27.121417400332838, + "learning_rate": 5.568080436603548e-07, + "loss": 0.4558, + "step": 7480 + }, + { + "epoch": 0.9609956376700025, + "grad_norm": 25.455859505213052, + "learning_rate": 5.557757623920293e-07, + "loss": 0.4083, + "step": 7490 + }, + { + "epoch": 0.962278675904542, + "grad_norm": 18.347983088967062, + "learning_rate": 5.547432403319985e-07, + "loss": 0.4531, + "step": 7500 + }, + { + "epoch": 0.9635617141390813, + "grad_norm": 16.83396044286444, + "learning_rate": 5.537104819378036e-07, + "loss": 0.4526, + "step": 7510 + }, + { + "epoch": 0.9648447523736208, + "grad_norm": 30.859210455419966, + "learning_rate": 5.526774916680059e-07, + "loss": 0.4597, + "step": 7520 + }, + { + "epoch": 0.9661277906081601, + "grad_norm": 22.323510056877154, + "learning_rate": 5.516442739821675e-07, + "loss": 0.4204, + "step": 7530 + }, + { + "epoch": 0.9674108288426995, + "grad_norm": 25.142352189955325, + "learning_rate": 5.506108333408329e-07, + "loss": 0.3904, + "step": 7540 + }, + { + "epoch": 0.9686938670772389, + "grad_norm": 18.92295085919245, + "learning_rate": 5.495771742055083e-07, + "loss": 0.4631, + "step": 7550 + }, + { + "epoch": 0.9699769053117783, + "grad_norm": 64.5585079720544, + "learning_rate": 5.48543301038644e-07, + "loss": 0.4495, + "step": 7560 + }, + { + "epoch": 0.9712599435463177, + "grad_norm": 21.57822735700948, + "learning_rate": 5.475092183036137e-07, + "loss": 0.4468, + "step": 7570 + }, + { + "epoch": 0.9725429817808571, + "grad_norm": 23.738692152546893, + "learning_rate": 5.464749304646961e-07, + "loss": 0.4155, + "step": 7580 + }, + { + "epoch": 0.9738260200153964, + "grad_norm": 22.312813694184896, + "learning_rate": 5.454404419870553e-07, + "loss": 0.4207, + "step": 7590 + }, + { + "epoch": 0.9751090582499359, + "grad_norm": 20.42469483755779, + "learning_rate": 5.444057573367215e-07, + "loss": 0.4161, + "step": 7600 + }, + { + "epoch": 0.9763920964844752, + "grad_norm": 17.576103547210085, + "learning_rate": 5.43370880980572e-07, + "loss": 0.4122, + "step": 7610 + }, + { + "epoch": 0.9776751347190146, + "grad_norm": 16.2284206888642, + "learning_rate": 5.423358173863116e-07, + "loss": 0.4445, + "step": 7620 + }, + { + "epoch": 0.978958172953554, + "grad_norm": 22.15178605678329, + "learning_rate": 5.413005710224535e-07, + "loss": 0.4161, + "step": 7630 + }, + { + "epoch": 0.9802412111880934, + "grad_norm": 27.840267185016412, + "learning_rate": 5.402651463582998e-07, + "loss": 0.4039, + "step": 7640 + }, + { + "epoch": 0.9815242494226328, + "grad_norm": 30.359077059416204, + "learning_rate": 5.392295478639225e-07, + "loss": 0.4227, + "step": 7650 + }, + { + "epoch": 0.9828072876571722, + "grad_norm": 20.77802812674331, + "learning_rate": 5.381937800101439e-07, + "loss": 0.434, + "step": 7660 + }, + { + "epoch": 0.9840903258917115, + "grad_norm": 23.51752852642399, + "learning_rate": 5.371578472685177e-07, + "loss": 0.4241, + "step": 7670 + }, + { + "epoch": 0.985373364126251, + "grad_norm": 22.677680748857224, + "learning_rate": 5.361217541113092e-07, + "loss": 0.4465, + "step": 7680 + }, + { + "epoch": 0.9866564023607903, + "grad_norm": 31.57453537504272, + "learning_rate": 5.350855050114762e-07, + "loss": 0.399, + "step": 7690 + }, + { + "epoch": 0.9879394405953298, + "grad_norm": 27.374134115394686, + "learning_rate": 5.3404910444265e-07, + "loss": 0.4156, + "step": 7700 + }, + { + "epoch": 0.9892224788298691, + "grad_norm": 29.622451033830277, + "learning_rate": 5.330125568791157e-07, + "loss": 0.4576, + "step": 7710 + }, + { + "epoch": 0.9905055170644085, + "grad_norm": 41.21790016892921, + "learning_rate": 5.319758667957927e-07, + "loss": 0.4782, + "step": 7720 + }, + { + "epoch": 0.9917885552989479, + "grad_norm": 30.552701568245332, + "learning_rate": 5.309390386682165e-07, + "loss": 0.4205, + "step": 7730 + }, + { + "epoch": 0.9930715935334873, + "grad_norm": 29.099078490380247, + "learning_rate": 5.299020769725171e-07, + "loss": 0.3967, + "step": 7740 + }, + { + "epoch": 0.9943546317680267, + "grad_norm": 45.11753524413772, + "learning_rate": 5.288649861854027e-07, + "loss": 0.4487, + "step": 7750 + }, + { + "epoch": 0.9956376700025661, + "grad_norm": 41.39669349245314, + "learning_rate": 5.278277707841378e-07, + "loss": 0.4337, + "step": 7760 + }, + { + "epoch": 0.9969207082371054, + "grad_norm": 22.288602594753083, + "learning_rate": 5.267904352465255e-07, + "loss": 0.4655, + "step": 7770 + }, + { + "epoch": 0.9982037464716449, + "grad_norm": 35.46690586277827, + "learning_rate": 5.25752984050887e-07, + "loss": 0.4993, + "step": 7780 + }, + { + "epoch": 0.9994867847061842, + "grad_norm": 21.71569737197539, + "learning_rate": 5.24715421676043e-07, + "loss": 0.4111, + "step": 7790 + }, + { + "epoch": 1.0007698229407236, + "grad_norm": 21.733610400527485, + "learning_rate": 5.236777526012945e-07, + "loss": 0.4301, + "step": 7800 + }, + { + "epoch": 1.002052861175263, + "grad_norm": 29.97763651146444, + "learning_rate": 5.226399813064027e-07, + "loss": 0.4285, + "step": 7810 + }, + { + "epoch": 1.0033358994098025, + "grad_norm": 19.822233891762057, + "learning_rate": 5.216021122715702e-07, + "loss": 0.4061, + "step": 7820 + }, + { + "epoch": 1.0046189376443417, + "grad_norm": 35.10536113257614, + "learning_rate": 5.20564149977422e-07, + "loss": 0.414, + "step": 7830 + }, + { + "epoch": 1.0059019758788812, + "grad_norm": 23.194734309203575, + "learning_rate": 5.195260989049848e-07, + "loss": 0.3758, + "step": 7840 + }, + { + "epoch": 1.0071850141134207, + "grad_norm": 31.44737098842723, + "learning_rate": 5.184879635356696e-07, + "loss": 0.4272, + "step": 7850 + }, + { + "epoch": 1.0084680523479599, + "grad_norm": 20.23784478928909, + "learning_rate": 5.174497483512505e-07, + "loss": 0.3907, + "step": 7860 + }, + { + "epoch": 1.0097510905824993, + "grad_norm": 26.109193076008193, + "learning_rate": 5.164114578338467e-07, + "loss": 0.4164, + "step": 7870 + }, + { + "epoch": 1.0110341288170388, + "grad_norm": 27.794539797906825, + "learning_rate": 5.153730964659022e-07, + "loss": 0.4174, + "step": 7880 + }, + { + "epoch": 1.012317167051578, + "grad_norm": 28.133199793800824, + "learning_rate": 5.143346687301672e-07, + "loss": 0.4397, + "step": 7890 + }, + { + "epoch": 1.0136002052861175, + "grad_norm": 26.471339179848712, + "learning_rate": 5.132961791096786e-07, + "loss": 0.406, + "step": 7900 + }, + { + "epoch": 1.014883243520657, + "grad_norm": 25.543837860848726, + "learning_rate": 5.122576320877398e-07, + "loss": 0.4252, + "step": 7910 + }, + { + "epoch": 1.0161662817551964, + "grad_norm": 19.84964952370872, + "learning_rate": 5.112190321479025e-07, + "loss": 0.4103, + "step": 7920 + }, + { + "epoch": 1.0174493199897356, + "grad_norm": 20.66224416270141, + "learning_rate": 5.101803837739468e-07, + "loss": 0.4364, + "step": 7930 + }, + { + "epoch": 1.018732358224275, + "grad_norm": 17.864925195850017, + "learning_rate": 5.091416914498618e-07, + "loss": 0.4137, + "step": 7940 + }, + { + "epoch": 1.0200153964588146, + "grad_norm": 29.746259578339377, + "learning_rate": 5.081029596598264e-07, + "loss": 0.4235, + "step": 7950 + }, + { + "epoch": 1.0212984346933538, + "grad_norm": 30.11517011570124, + "learning_rate": 5.0706419288819e-07, + "loss": 0.4327, + "step": 7960 + }, + { + "epoch": 1.0225814729278933, + "grad_norm": 23.840773252447082, + "learning_rate": 5.060253956194527e-07, + "loss": 0.4144, + "step": 7970 + }, + { + "epoch": 1.0238645111624327, + "grad_norm": 25.394514742446088, + "learning_rate": 5.049865723382462e-07, + "loss": 0.424, + "step": 7980 + }, + { + "epoch": 1.025147549396972, + "grad_norm": 22.802950087127062, + "learning_rate": 5.039477275293154e-07, + "loss": 0.4063, + "step": 7990 + }, + { + "epoch": 1.0264305876315114, + "grad_norm": 32.687008546576884, + "learning_rate": 5.029088656774969e-07, + "loss": 0.463, + "step": 8000 + }, + { + "epoch": 1.0277136258660509, + "grad_norm": 25.13255098285243, + "learning_rate": 5.018699912677017e-07, + "loss": 0.4125, + "step": 8010 + }, + { + "epoch": 1.02899666410059, + "grad_norm": 21.22671390994891, + "learning_rate": 5.008311087848948e-07, + "loss": 0.4208, + "step": 8020 + }, + { + "epoch": 1.0302797023351296, + "grad_norm": 24.546248691433927, + "learning_rate": 4.99792222714076e-07, + "loss": 0.4379, + "step": 8030 + }, + { + "epoch": 1.031562740569669, + "grad_norm": 33.33174453778059, + "learning_rate": 4.987533375402604e-07, + "loss": 0.3964, + "step": 8040 + }, + { + "epoch": 1.0328457788042085, + "grad_norm": 38.93925427606791, + "learning_rate": 4.977144577484597e-07, + "loss": 0.4176, + "step": 8050 + }, + { + "epoch": 1.0341288170387477, + "grad_norm": 18.04265532101691, + "learning_rate": 4.966755878236621e-07, + "loss": 0.3596, + "step": 8060 + }, + { + "epoch": 1.0354118552732872, + "grad_norm": 26.604136359854163, + "learning_rate": 4.956367322508131e-07, + "loss": 0.3939, + "step": 8070 + }, + { + "epoch": 1.0366948935078266, + "grad_norm": 31.0226805335182, + "learning_rate": 4.945978955147962e-07, + "loss": 0.4542, + "step": 8080 + }, + { + "epoch": 1.0379779317423659, + "grad_norm": 25.705711831610977, + "learning_rate": 4.935590821004141e-07, + "loss": 0.4339, + "step": 8090 + }, + { + "epoch": 1.0392609699769053, + "grad_norm": 28.29171312236222, + "learning_rate": 4.925202964923683e-07, + "loss": 0.4289, + "step": 8100 + }, + { + "epoch": 1.0405440082114448, + "grad_norm": 27.73190249085465, + "learning_rate": 4.9148154317524e-07, + "loss": 0.4371, + "step": 8110 + }, + { + "epoch": 1.041827046445984, + "grad_norm": 26.63770613061821, + "learning_rate": 4.90442826633472e-07, + "loss": 0.4103, + "step": 8120 + }, + { + "epoch": 1.0431100846805235, + "grad_norm": 29.239925702794004, + "learning_rate": 4.894041513513476e-07, + "loss": 0.4224, + "step": 8130 + }, + { + "epoch": 1.044393122915063, + "grad_norm": 36.43564943561658, + "learning_rate": 4.883655218129719e-07, + "loss": 0.3843, + "step": 8140 + }, + { + "epoch": 1.0456761611496022, + "grad_norm": 28.975912890908987, + "learning_rate": 4.873269425022526e-07, + "loss": 0.4489, + "step": 8150 + }, + { + "epoch": 1.0469591993841416, + "grad_norm": 19.277848710646314, + "learning_rate": 4.862884179028812e-07, + "loss": 0.4247, + "step": 8160 + }, + { + "epoch": 1.048242237618681, + "grad_norm": 17.76280422898058, + "learning_rate": 4.852499524983121e-07, + "loss": 0.4585, + "step": 8170 + }, + { + "epoch": 1.0495252758532205, + "grad_norm": 38.761170145780845, + "learning_rate": 4.842115507717445e-07, + "loss": 0.4501, + "step": 8180 + }, + { + "epoch": 1.0508083140877598, + "grad_norm": 23.709568247306287, + "learning_rate": 4.831732172061032e-07, + "loss": 0.3919, + "step": 8190 + }, + { + "epoch": 1.0520913523222992, + "grad_norm": 41.50161362671521, + "learning_rate": 4.821349562840175e-07, + "loss": 0.3605, + "step": 8200 + }, + { + "epoch": 1.0533743905568387, + "grad_norm": 23.183380803656508, + "learning_rate": 4.810967724878046e-07, + "loss": 0.39, + "step": 8210 + }, + { + "epoch": 1.054657428791378, + "grad_norm": 28.329959333775793, + "learning_rate": 4.800586702994476e-07, + "loss": 0.3993, + "step": 8220 + }, + { + "epoch": 1.0559404670259174, + "grad_norm": 28.055538264663696, + "learning_rate": 4.790206542005777e-07, + "loss": 0.3973, + "step": 8230 + }, + { + "epoch": 1.0572235052604568, + "grad_norm": 52.50059502035498, + "learning_rate": 4.779827286724546e-07, + "loss": 0.4647, + "step": 8240 + }, + { + "epoch": 1.058506543494996, + "grad_norm": 22.039132606637878, + "learning_rate": 4.769448981959467e-07, + "loss": 0.3736, + "step": 8250 + }, + { + "epoch": 1.0597895817295355, + "grad_norm": 19.789330785658237, + "learning_rate": 4.7590716725151227e-07, + "loss": 0.4317, + "step": 8260 + }, + { + "epoch": 1.061072619964075, + "grad_norm": 17.430832968873897, + "learning_rate": 4.7486954031917947e-07, + "loss": 0.3449, + "step": 8270 + }, + { + "epoch": 1.0623556581986142, + "grad_norm": 29.227468063649315, + "learning_rate": 4.7383202187852804e-07, + "loss": 0.3949, + "step": 8280 + }, + { + "epoch": 1.0636386964331537, + "grad_norm": 20.666180429023427, + "learning_rate": 4.7279461640866915e-07, + "loss": 0.4267, + "step": 8290 + }, + { + "epoch": 1.0649217346676931, + "grad_norm": 27.07338728589568, + "learning_rate": 4.7175732838822567e-07, + "loss": 0.4229, + "step": 8300 + }, + { + "epoch": 1.0662047729022326, + "grad_norm": 64.46471053058796, + "learning_rate": 4.707201622953144e-07, + "loss": 0.4163, + "step": 8310 + }, + { + "epoch": 1.0674878111367718, + "grad_norm": 29.53107756511278, + "learning_rate": 4.696831226075252e-07, + "loss": 0.4549, + "step": 8320 + }, + { + "epoch": 1.0687708493713113, + "grad_norm": 21.401572625560654, + "learning_rate": 4.6864621380190185e-07, + "loss": 0.3906, + "step": 8330 + }, + { + "epoch": 1.0700538876058507, + "grad_norm": 25.68064157495705, + "learning_rate": 4.67609440354924e-07, + "loss": 0.4097, + "step": 8340 + }, + { + "epoch": 1.07133692584039, + "grad_norm": 29.204815823185196, + "learning_rate": 4.6657280674248647e-07, + "loss": 0.3968, + "step": 8350 + }, + { + "epoch": 1.0726199640749294, + "grad_norm": 19.349733181722936, + "learning_rate": 4.6553631743988015e-07, + "loss": 0.4541, + "step": 8360 + }, + { + "epoch": 1.073903002309469, + "grad_norm": 23.88235965612456, + "learning_rate": 4.644999769217731e-07, + "loss": 0.4336, + "step": 8370 + }, + { + "epoch": 1.0751860405440081, + "grad_norm": 20.716537070735193, + "learning_rate": 4.634637896621916e-07, + "loss": 0.453, + "step": 8380 + }, + { + "epoch": 1.0764690787785476, + "grad_norm": 21.907079841387258, + "learning_rate": 4.624277601344994e-07, + "loss": 0.4023, + "step": 8390 + }, + { + "epoch": 1.077752117013087, + "grad_norm": 31.0077552228138, + "learning_rate": 4.613918928113797e-07, + "loss": 0.4077, + "step": 8400 + }, + { + "epoch": 1.0790351552476263, + "grad_norm": 24.814817763941488, + "learning_rate": 4.6035619216481597e-07, + "loss": 0.387, + "step": 8410 + }, + { + "epoch": 1.0803181934821657, + "grad_norm": 22.916020047978275, + "learning_rate": 4.593206626660709e-07, + "loss": 0.4038, + "step": 8420 + }, + { + "epoch": 1.0816012317167052, + "grad_norm": 34.52067995368807, + "learning_rate": 4.5828530878566947e-07, + "loss": 0.4167, + "step": 8430 + }, + { + "epoch": 1.0828842699512446, + "grad_norm": 26.818332859388466, + "learning_rate": 4.572501349933778e-07, + "loss": 0.4134, + "step": 8440 + }, + { + "epoch": 1.0841673081857839, + "grad_norm": 38.61770375410947, + "learning_rate": 4.5621514575818513e-07, + "loss": 0.3922, + "step": 8450 + }, + { + "epoch": 1.0854503464203233, + "grad_norm": 32.86279234960328, + "learning_rate": 4.5518034554828327e-07, + "loss": 0.4353, + "step": 8460 + }, + { + "epoch": 1.0867333846548628, + "grad_norm": 17.021630749961787, + "learning_rate": 4.541457388310483e-07, + "loss": 0.4301, + "step": 8470 + }, + { + "epoch": 1.088016422889402, + "grad_norm": 15.569877228668638, + "learning_rate": 4.531113300730214e-07, + "loss": 0.3665, + "step": 8480 + }, + { + "epoch": 1.0892994611239415, + "grad_norm": 20.32281432171883, + "learning_rate": 4.52077123739888e-07, + "loss": 0.3952, + "step": 8490 + }, + { + "epoch": 1.090582499358481, + "grad_norm": 28.57556449062946, + "learning_rate": 4.5104312429646084e-07, + "loss": 0.4014, + "step": 8500 + }, + { + "epoch": 1.0918655375930202, + "grad_norm": 24.060081177017913, + "learning_rate": 4.50009336206659e-07, + "loss": 0.4189, + "step": 8510 + }, + { + "epoch": 1.0931485758275596, + "grad_norm": 61.887573372064445, + "learning_rate": 4.489757639334887e-07, + "loss": 0.4197, + "step": 8520 + }, + { + "epoch": 1.094431614062099, + "grad_norm": 22.86581598887989, + "learning_rate": 4.4794241193902526e-07, + "loss": 0.3657, + "step": 8530 + }, + { + "epoch": 1.0957146522966383, + "grad_norm": 19.205641078587664, + "learning_rate": 4.469092846843926e-07, + "loss": 0.4073, + "step": 8540 + }, + { + "epoch": 1.0969976905311778, + "grad_norm": 22.150008793493797, + "learning_rate": 4.4587638662974405e-07, + "loss": 0.4166, + "step": 8550 + }, + { + "epoch": 1.0982807287657173, + "grad_norm": 37.80514708444142, + "learning_rate": 4.448437222342441e-07, + "loss": 0.4094, + "step": 8560 + }, + { + "epoch": 1.0995637670002567, + "grad_norm": 27.785704463118815, + "learning_rate": 4.438112959560484e-07, + "loss": 0.4563, + "step": 8570 + }, + { + "epoch": 1.100846805234796, + "grad_norm": 24.725406681497297, + "learning_rate": 4.427791122522841e-07, + "loss": 0.416, + "step": 8580 + }, + { + "epoch": 1.1021298434693354, + "grad_norm": 18.011322311395915, + "learning_rate": 4.417471755790315e-07, + "loss": 0.4527, + "step": 8590 + }, + { + "epoch": 1.1034128817038749, + "grad_norm": 21.827477068122878, + "learning_rate": 4.4071549039130454e-07, + "loss": 0.4103, + "step": 8600 + }, + { + "epoch": 1.104695919938414, + "grad_norm": 23.687973182277076, + "learning_rate": 4.3968406114303146e-07, + "loss": 0.3922, + "step": 8610 + }, + { + "epoch": 1.1059789581729536, + "grad_norm": 39.61772231465808, + "learning_rate": 4.3865289228703504e-07, + "loss": 0.4134, + "step": 8620 + }, + { + "epoch": 1.107261996407493, + "grad_norm": 33.04757878745131, + "learning_rate": 4.3762198827501464e-07, + "loss": 0.4148, + "step": 8630 + }, + { + "epoch": 1.1085450346420322, + "grad_norm": 23.072864794965415, + "learning_rate": 4.3659135355752593e-07, + "loss": 0.3901, + "step": 8640 + }, + { + "epoch": 1.1098280728765717, + "grad_norm": 36.000322415907696, + "learning_rate": 4.355609925839617e-07, + "loss": 0.4115, + "step": 8650 + }, + { + "epoch": 1.1111111111111112, + "grad_norm": 24.14465954632059, + "learning_rate": 4.3453090980253325e-07, + "loss": 0.4132, + "step": 8660 + }, + { + "epoch": 1.1123941493456506, + "grad_norm": 22.28343764640214, + "learning_rate": 4.3350110966025134e-07, + "loss": 0.402, + "step": 8670 + }, + { + "epoch": 1.1136771875801899, + "grad_norm": 23.19104068729757, + "learning_rate": 4.3247159660290553e-07, + "loss": 0.4191, + "step": 8680 + }, + { + "epoch": 1.1149602258147293, + "grad_norm": 26.870602004176426, + "learning_rate": 4.314423750750468e-07, + "loss": 0.4394, + "step": 8690 + }, + { + "epoch": 1.1162432640492688, + "grad_norm": 24.17036684533003, + "learning_rate": 4.304134495199674e-07, + "loss": 0.4035, + "step": 8700 + }, + { + "epoch": 1.117526302283808, + "grad_norm": 24.867079178805373, + "learning_rate": 4.2938482437968133e-07, + "loss": 0.431, + "step": 8710 + }, + { + "epoch": 1.1188093405183475, + "grad_norm": 20.494639172813187, + "learning_rate": 4.283565040949063e-07, + "loss": 0.4327, + "step": 8720 + }, + { + "epoch": 1.120092378752887, + "grad_norm": 32.009129944870274, + "learning_rate": 4.273284931050438e-07, + "loss": 0.4276, + "step": 8730 + }, + { + "epoch": 1.1213754169874262, + "grad_norm": 22.532918324746255, + "learning_rate": 4.2630079584815947e-07, + "loss": 0.4045, + "step": 8740 + }, + { + "epoch": 1.1226584552219656, + "grad_norm": 30.323864932906677, + "learning_rate": 4.2527341676096535e-07, + "loss": 0.4256, + "step": 8750 + }, + { + "epoch": 1.123941493456505, + "grad_norm": 33.441393697961836, + "learning_rate": 4.2424636027879924e-07, + "loss": 0.4009, + "step": 8760 + }, + { + "epoch": 1.1252245316910443, + "grad_norm": 20.53035399560357, + "learning_rate": 4.23219630835607e-07, + "loss": 0.3949, + "step": 8770 + }, + { + "epoch": 1.1265075699255838, + "grad_norm": 18.49082461220664, + "learning_rate": 4.221932328639213e-07, + "loss": 0.4291, + "step": 8780 + }, + { + "epoch": 1.1277906081601232, + "grad_norm": 53.10950789586209, + "learning_rate": 4.211671707948452e-07, + "loss": 0.3946, + "step": 8790 + }, + { + "epoch": 1.1290736463946627, + "grad_norm": 23.794217633107394, + "learning_rate": 4.2014144905803093e-07, + "loss": 0.4361, + "step": 8800 + }, + { + "epoch": 1.130356684629202, + "grad_norm": 19.074499990270926, + "learning_rate": 4.191160720816613e-07, + "loss": 0.4219, + "step": 8810 + }, + { + "epoch": 1.1316397228637414, + "grad_norm": 34.06848091833755, + "learning_rate": 4.180910442924311e-07, + "loss": 0.4866, + "step": 8820 + }, + { + "epoch": 1.1329227610982806, + "grad_norm": 32.484482040668546, + "learning_rate": 4.1706637011552775e-07, + "loss": 0.4013, + "step": 8830 + }, + { + "epoch": 1.13420579933282, + "grad_norm": 37.64903866910102, + "learning_rate": 4.1604205397461146e-07, + "loss": 0.3864, + "step": 8840 + }, + { + "epoch": 1.1354888375673595, + "grad_norm": 26.02779435259084, + "learning_rate": 4.1501810029179735e-07, + "loss": 0.4064, + "step": 8850 + }, + { + "epoch": 1.136771875801899, + "grad_norm": 29.262990092122106, + "learning_rate": 4.139945134876357e-07, + "loss": 0.4076, + "step": 8860 + }, + { + "epoch": 1.1380549140364382, + "grad_norm": 38.427302337611145, + "learning_rate": 4.1297129798109245e-07, + "loss": 0.4285, + "step": 8870 + }, + { + "epoch": 1.1393379522709777, + "grad_norm": 21.8656979268188, + "learning_rate": 4.119484581895308e-07, + "loss": 0.4035, + "step": 8880 + }, + { + "epoch": 1.1406209905055171, + "grad_norm": 31.002904348083018, + "learning_rate": 4.109259985286927e-07, + "loss": 0.4311, + "step": 8890 + }, + { + "epoch": 1.1419040287400564, + "grad_norm": 21.795370463844673, + "learning_rate": 4.0990392341267776e-07, + "loss": 0.4105, + "step": 8900 + }, + { + "epoch": 1.1431870669745958, + "grad_norm": 13.85464142841709, + "learning_rate": 4.0888223725392624e-07, + "loss": 0.4339, + "step": 8910 + }, + { + "epoch": 1.1444701052091353, + "grad_norm": 17.52669688314415, + "learning_rate": 4.078609444631991e-07, + "loss": 0.3961, + "step": 8920 + }, + { + "epoch": 1.1457531434436747, + "grad_norm": 19.308667610972368, + "learning_rate": 4.0684004944955916e-07, + "loss": 0.425, + "step": 8930 + }, + { + "epoch": 1.147036181678214, + "grad_norm": 24.403304649015908, + "learning_rate": 4.058195566203516e-07, + "loss": 0.4157, + "step": 8940 + }, + { + "epoch": 1.1483192199127534, + "grad_norm": 25.8650649243673, + "learning_rate": 4.0479947038118543e-07, + "loss": 0.3974, + "step": 8950 + }, + { + "epoch": 1.1496022581472927, + "grad_norm": 22.51279269909686, + "learning_rate": 4.0377979513591504e-07, + "loss": 0.4184, + "step": 8960 + }, + { + "epoch": 1.1508852963818321, + "grad_norm": 27.881612718688665, + "learning_rate": 4.0276053528661946e-07, + "loss": 0.4347, + "step": 8970 + }, + { + "epoch": 1.1521683346163716, + "grad_norm": 39.376622406711824, + "learning_rate": 4.0174169523358485e-07, + "loss": 0.37, + "step": 8980 + }, + { + "epoch": 1.153451372850911, + "grad_norm": 23.06156953530335, + "learning_rate": 4.007232793752856e-07, + "loss": 0.4307, + "step": 8990 + }, + { + "epoch": 1.1547344110854503, + "grad_norm": 31.028929051393963, + "learning_rate": 3.997052921083636e-07, + "loss": 0.4317, + "step": 9000 + }, + { + "epoch": 1.1560174493199897, + "grad_norm": 23.945824824359477, + "learning_rate": 3.986877378276116e-07, + "loss": 0.3734, + "step": 9010 + }, + { + "epoch": 1.1573004875545292, + "grad_norm": 31.508588112389535, + "learning_rate": 3.9767062092595255e-07, + "loss": 0.4098, + "step": 9020 + }, + { + "epoch": 1.1585835257890684, + "grad_norm": 26.231073044192854, + "learning_rate": 3.9665394579442094e-07, + "loss": 0.3896, + "step": 9030 + }, + { + "epoch": 1.1598665640236079, + "grad_norm": 19.901188439260828, + "learning_rate": 3.956377168221448e-07, + "loss": 0.4058, + "step": 9040 + }, + { + "epoch": 1.1611496022581473, + "grad_norm": 28.614306676157124, + "learning_rate": 3.9462193839632535e-07, + "loss": 0.3857, + "step": 9050 + }, + { + "epoch": 1.1624326404926868, + "grad_norm": 25.1816033632786, + "learning_rate": 3.9360661490221904e-07, + "loss": 0.425, + "step": 9060 + }, + { + "epoch": 1.163715678727226, + "grad_norm": 20.912402300049195, + "learning_rate": 3.925917507231181e-07, + "loss": 0.3965, + "step": 9070 + }, + { + "epoch": 1.1649987169617655, + "grad_norm": 42.967670087175755, + "learning_rate": 3.9157735024033235e-07, + "loss": 0.4488, + "step": 9080 + }, + { + "epoch": 1.1662817551963047, + "grad_norm": 22.488410740099827, + "learning_rate": 3.905634178331693e-07, + "loss": 0.4116, + "step": 9090 + }, + { + "epoch": 1.1675647934308442, + "grad_norm": 26.15258177362436, + "learning_rate": 3.8954995787891563e-07, + "loss": 0.4053, + "step": 9100 + }, + { + "epoch": 1.1688478316653836, + "grad_norm": 22.31282161034061, + "learning_rate": 3.885369747528188e-07, + "loss": 0.3973, + "step": 9110 + }, + { + "epoch": 1.170130869899923, + "grad_norm": 31.326125313937926, + "learning_rate": 3.8752447282806755e-07, + "loss": 0.3766, + "step": 9120 + }, + { + "epoch": 1.1714139081344623, + "grad_norm": 17.320456543004315, + "learning_rate": 3.865124564757729e-07, + "loss": 0.4493, + "step": 9130 + }, + { + "epoch": 1.1726969463690018, + "grad_norm": 20.535121829299914, + "learning_rate": 3.8550093006495016e-07, + "loss": 0.4386, + "step": 9140 + }, + { + "epoch": 1.1739799846035412, + "grad_norm": 20.967761699081983, + "learning_rate": 3.844898979624991e-07, + "loss": 0.416, + "step": 9150 + }, + { + "epoch": 1.1752630228380805, + "grad_norm": 19.50881322360687, + "learning_rate": 3.8347936453318555e-07, + "loss": 0.4247, + "step": 9160 + }, + { + "epoch": 1.17654606107262, + "grad_norm": 26.062612955607374, + "learning_rate": 3.8246933413962235e-07, + "loss": 0.4512, + "step": 9170 + }, + { + "epoch": 1.1778290993071594, + "grad_norm": 26.762914722723426, + "learning_rate": 3.814598111422513e-07, + "loss": 0.429, + "step": 9180 + }, + { + "epoch": 1.1791121375416989, + "grad_norm": 18.953473451331607, + "learning_rate": 3.8045079989932287e-07, + "loss": 0.4053, + "step": 9190 + }, + { + "epoch": 1.180395175776238, + "grad_norm": 29.50638465783403, + "learning_rate": 3.7944230476687865e-07, + "loss": 0.4311, + "step": 9200 + }, + { + "epoch": 1.1816782140107776, + "grad_norm": 35.497287682422446, + "learning_rate": 3.7843433009873214e-07, + "loss": 0.408, + "step": 9210 + }, + { + "epoch": 1.1829612522453168, + "grad_norm": 17.645655738077316, + "learning_rate": 3.7742688024645003e-07, + "loss": 0.3896, + "step": 9220 + }, + { + "epoch": 1.1842442904798562, + "grad_norm": 21.080563781300885, + "learning_rate": 3.764199595593328e-07, + "loss": 0.3987, + "step": 9230 + }, + { + "epoch": 1.1855273287143957, + "grad_norm": 17.45617348052372, + "learning_rate": 3.7541357238439677e-07, + "loss": 0.4235, + "step": 9240 + }, + { + "epoch": 1.1868103669489352, + "grad_norm": 26.971330911821262, + "learning_rate": 3.7440772306635527e-07, + "loss": 0.3981, + "step": 9250 + }, + { + "epoch": 1.1880934051834744, + "grad_norm": 26.361646309010673, + "learning_rate": 3.734024159475991e-07, + "loss": 0.4194, + "step": 9260 + }, + { + "epoch": 1.1893764434180139, + "grad_norm": 33.85139094651643, + "learning_rate": 3.723976553681787e-07, + "loss": 0.3783, + "step": 9270 + }, + { + "epoch": 1.1906594816525533, + "grad_norm": 27.408018964690125, + "learning_rate": 3.713934456657851e-07, + "loss": 0.397, + "step": 9280 + }, + { + "epoch": 1.1919425198870925, + "grad_norm": 23.552002508964655, + "learning_rate": 3.7038979117573044e-07, + "loss": 0.4317, + "step": 9290 + }, + { + "epoch": 1.193225558121632, + "grad_norm": 13.89131824355512, + "learning_rate": 3.693866962309308e-07, + "loss": 0.4155, + "step": 9300 + }, + { + "epoch": 1.1945085963561715, + "grad_norm": 31.36631209132862, + "learning_rate": 3.6838416516188625e-07, + "loss": 0.4533, + "step": 9310 + }, + { + "epoch": 1.195791634590711, + "grad_norm": 24.14130801380919, + "learning_rate": 3.6738220229666216e-07, + "loss": 0.406, + "step": 9320 + }, + { + "epoch": 1.1970746728252502, + "grad_norm": 17.077491993036045, + "learning_rate": 3.6638081196087153e-07, + "loss": 0.4117, + "step": 9330 + }, + { + "epoch": 1.1983577110597896, + "grad_norm": 28.507529767977122, + "learning_rate": 3.653799984776555e-07, + "loss": 0.4248, + "step": 9340 + }, + { + "epoch": 1.199640749294329, + "grad_norm": 19.884816715283204, + "learning_rate": 3.6437976616766444e-07, + "loss": 0.4145, + "step": 9350 + }, + { + "epoch": 1.2009237875288683, + "grad_norm": 36.52064822630453, + "learning_rate": 3.6338011934904e-07, + "loss": 0.4058, + "step": 9360 + }, + { + "epoch": 1.2022068257634078, + "grad_norm": 45.624612440057774, + "learning_rate": 3.623810623373964e-07, + "loss": 0.4215, + "step": 9370 + }, + { + "epoch": 1.2034898639979472, + "grad_norm": 18.401398643473303, + "learning_rate": 3.613825994458015e-07, + "loss": 0.4122, + "step": 9380 + }, + { + "epoch": 1.2047729022324865, + "grad_norm": 21.822673350440905, + "learning_rate": 3.6038473498475767e-07, + "loss": 0.4053, + "step": 9390 + }, + { + "epoch": 1.206055940467026, + "grad_norm": 24.22673541233331, + "learning_rate": 3.593874732621847e-07, + "loss": 0.4213, + "step": 9400 + }, + { + "epoch": 1.2073389787015654, + "grad_norm": 24.276274546662954, + "learning_rate": 3.583908185833997e-07, + "loss": 0.3588, + "step": 9410 + }, + { + "epoch": 1.2086220169361046, + "grad_norm": 36.819416500800294, + "learning_rate": 3.5739477525109896e-07, + "loss": 0.4458, + "step": 9420 + }, + { + "epoch": 1.209905055170644, + "grad_norm": 17.644644582988786, + "learning_rate": 3.5639934756533997e-07, + "loss": 0.3794, + "step": 9430 + }, + { + "epoch": 1.2111880934051835, + "grad_norm": 25.644984011921085, + "learning_rate": 3.5540453982352224e-07, + "loss": 0.4373, + "step": 9440 + }, + { + "epoch": 1.212471131639723, + "grad_norm": 15.118888000783269, + "learning_rate": 3.5441035632036864e-07, + "loss": 0.4093, + "step": 9450 + }, + { + "epoch": 1.2137541698742622, + "grad_norm": 26.25537997321305, + "learning_rate": 3.534168013479073e-07, + "loss": 0.3946, + "step": 9460 + }, + { + "epoch": 1.2150372081088017, + "grad_norm": 23.79290847134502, + "learning_rate": 3.5242387919545313e-07, + "loss": 0.4085, + "step": 9470 + }, + { + "epoch": 1.2163202463433411, + "grad_norm": 23.76630324169316, + "learning_rate": 3.514315941495885e-07, + "loss": 0.4, + "step": 9480 + }, + { + "epoch": 1.2176032845778804, + "grad_norm": 29.788159185376664, + "learning_rate": 3.504399504941457e-07, + "loss": 0.4298, + "step": 9490 + }, + { + "epoch": 1.2188863228124198, + "grad_norm": 18.617487202399907, + "learning_rate": 3.494489525101884e-07, + "loss": 0.3843, + "step": 9500 + }, + { + "epoch": 1.2201693610469593, + "grad_norm": 24.379453254645018, + "learning_rate": 3.4845860447599176e-07, + "loss": 0.3973, + "step": 9510 + }, + { + "epoch": 1.2214523992814985, + "grad_norm": 21.49404612859524, + "learning_rate": 3.4746891066702605e-07, + "loss": 0.3647, + "step": 9520 + }, + { + "epoch": 1.222735437516038, + "grad_norm": 25.185348497009976, + "learning_rate": 3.4647987535593657e-07, + "loss": 0.4241, + "step": 9530 + }, + { + "epoch": 1.2240184757505774, + "grad_norm": 26.021286151911948, + "learning_rate": 3.454915028125263e-07, + "loss": 0.3732, + "step": 9540 + }, + { + "epoch": 1.2253015139851167, + "grad_norm": 23.139044161715393, + "learning_rate": 3.4450379730373633e-07, + "loss": 0.3643, + "step": 9550 + }, + { + "epoch": 1.2265845522196561, + "grad_norm": 43.24657274098848, + "learning_rate": 3.4351676309362843e-07, + "loss": 0.4295, + "step": 9560 + }, + { + "epoch": 1.2278675904541956, + "grad_norm": 22.025013982537356, + "learning_rate": 3.4253040444336655e-07, + "loss": 0.426, + "step": 9570 + }, + { + "epoch": 1.229150628688735, + "grad_norm": 20.263647940958066, + "learning_rate": 3.415447256111973e-07, + "loss": 0.4229, + "step": 9580 + }, + { + "epoch": 1.2304336669232743, + "grad_norm": 22.25311251376838, + "learning_rate": 3.4055973085243326e-07, + "loss": 0.4045, + "step": 9590 + }, + { + "epoch": 1.2317167051578137, + "grad_norm": 19.72060930988219, + "learning_rate": 3.395754244194337e-07, + "loss": 0.4008, + "step": 9600 + }, + { + "epoch": 1.2329997433923532, + "grad_norm": 20.434258924790985, + "learning_rate": 3.385918105615856e-07, + "loss": 0.3924, + "step": 9610 + }, + { + "epoch": 1.2342827816268924, + "grad_norm": 18.20250261842057, + "learning_rate": 3.3760889352528677e-07, + "loss": 0.3698, + "step": 9620 + }, + { + "epoch": 1.2355658198614319, + "grad_norm": 20.849347870925946, + "learning_rate": 3.366266775539264e-07, + "loss": 0.4091, + "step": 9630 + }, + { + "epoch": 1.2368488580959713, + "grad_norm": 19.265089381704847, + "learning_rate": 3.3564516688786693e-07, + "loss": 0.4243, + "step": 9640 + }, + { + "epoch": 1.2381318963305106, + "grad_norm": 20.96876473093279, + "learning_rate": 3.3466436576442623e-07, + "loss": 0.3704, + "step": 9650 + }, + { + "epoch": 1.23941493456505, + "grad_norm": 25.542366161422738, + "learning_rate": 3.3368427841785905e-07, + "loss": 0.3884, + "step": 9660 + }, + { + "epoch": 1.2406979727995895, + "grad_norm": 23.67254977794937, + "learning_rate": 3.327049090793381e-07, + "loss": 0.4169, + "step": 9670 + }, + { + "epoch": 1.2419810110341287, + "grad_norm": 40.66323744361764, + "learning_rate": 3.3172626197693673e-07, + "loss": 0.393, + "step": 9680 + }, + { + "epoch": 1.2432640492686682, + "grad_norm": 19.624468970281875, + "learning_rate": 3.307483413356106e-07, + "loss": 0.4106, + "step": 9690 + }, + { + "epoch": 1.2445470875032076, + "grad_norm": 20.46949508682431, + "learning_rate": 3.2977115137717854e-07, + "loss": 0.3953, + "step": 9700 + }, + { + "epoch": 1.245830125737747, + "grad_norm": 22.852388857585513, + "learning_rate": 3.287946963203051e-07, + "loss": 0.3821, + "step": 9710 + }, + { + "epoch": 1.2471131639722863, + "grad_norm": 32.90021890896219, + "learning_rate": 3.2781898038048237e-07, + "loss": 0.3877, + "step": 9720 + }, + { + "epoch": 1.2483962022068258, + "grad_norm": 24.1740623995025, + "learning_rate": 3.268440077700116e-07, + "loss": 0.4055, + "step": 9730 + }, + { + "epoch": 1.2496792404413652, + "grad_norm": 29.57881682666279, + "learning_rate": 3.2586978269798463e-07, + "loss": 0.377, + "step": 9740 + }, + { + "epoch": 1.2509622786759045, + "grad_norm": 15.073810761883488, + "learning_rate": 3.248963093702662e-07, + "loss": 0.3997, + "step": 9750 + }, + { + "epoch": 1.252245316910444, + "grad_norm": 17.42645258511258, + "learning_rate": 3.2392359198947605e-07, + "loss": 0.3891, + "step": 9760 + }, + { + "epoch": 1.2535283551449834, + "grad_norm": 19.220433753618767, + "learning_rate": 3.229516347549698e-07, + "loss": 0.434, + "step": 9770 + }, + { + "epoch": 1.2548113933795226, + "grad_norm": 15.952314679313375, + "learning_rate": 3.219804418628216e-07, + "loss": 0.4013, + "step": 9780 + }, + { + "epoch": 1.256094431614062, + "grad_norm": 33.79619438094675, + "learning_rate": 3.2101001750580636e-07, + "loss": 0.4133, + "step": 9790 + }, + { + "epoch": 1.2573774698486015, + "grad_norm": 26.988813371820967, + "learning_rate": 3.200403658733801e-07, + "loss": 0.4212, + "step": 9800 + }, + { + "epoch": 1.2586605080831408, + "grad_norm": 16.709658430660575, + "learning_rate": 3.1907149115166397e-07, + "loss": 0.4195, + "step": 9810 + }, + { + "epoch": 1.2599435463176802, + "grad_norm": 22.521076672871736, + "learning_rate": 3.181033975234244e-07, + "loss": 0.4397, + "step": 9820 + }, + { + "epoch": 1.2612265845522197, + "grad_norm": 18.598209399972006, + "learning_rate": 3.1713608916805567e-07, + "loss": 0.4007, + "step": 9830 + }, + { + "epoch": 1.2625096227867592, + "grad_norm": 16.183176929182896, + "learning_rate": 3.1616957026156243e-07, + "loss": 0.371, + "step": 9840 + }, + { + "epoch": 1.2637926610212984, + "grad_norm": 25.50356319022905, + "learning_rate": 3.152038449765407e-07, + "loss": 0.4095, + "step": 9850 + }, + { + "epoch": 1.2650756992558378, + "grad_norm": 23.857846812465365, + "learning_rate": 3.1423891748216104e-07, + "loss": 0.3905, + "step": 9860 + }, + { + "epoch": 1.266358737490377, + "grad_norm": 34.448776352226666, + "learning_rate": 3.132747919441486e-07, + "loss": 0.4065, + "step": 9870 + }, + { + "epoch": 1.2676417757249165, + "grad_norm": 26.051871244480505, + "learning_rate": 3.1231147252476763e-07, + "loss": 0.4031, + "step": 9880 + }, + { + "epoch": 1.268924813959456, + "grad_norm": 19.198264277116728, + "learning_rate": 3.113489633828016e-07, + "loss": 0.4052, + "step": 9890 + }, + { + "epoch": 1.2702078521939955, + "grad_norm": 22.955109707023716, + "learning_rate": 3.1038726867353583e-07, + "loss": 0.3869, + "step": 9900 + }, + { + "epoch": 1.2714908904285347, + "grad_norm": 23.210020487200644, + "learning_rate": 3.0942639254873995e-07, + "loss": 0.3729, + "step": 9910 + }, + { + "epoch": 1.2727739286630741, + "grad_norm": 27.389177220187307, + "learning_rate": 3.0846633915664965e-07, + "loss": 0.4096, + "step": 9920 + }, + { + "epoch": 1.2740569668976136, + "grad_norm": 15.246044252925389, + "learning_rate": 3.075071126419483e-07, + "loss": 0.3966, + "step": 9930 + }, + { + "epoch": 1.2753400051321528, + "grad_norm": 28.224774374206618, + "learning_rate": 3.065487171457502e-07, + "loss": 0.4222, + "step": 9940 + }, + { + "epoch": 1.2766230433666923, + "grad_norm": 20.622559605621515, + "learning_rate": 3.0559115680558157e-07, + "loss": 0.3828, + "step": 9950 + }, + { + "epoch": 1.2779060816012318, + "grad_norm": 24.406214227447137, + "learning_rate": 3.0463443575536317e-07, + "loss": 0.4297, + "step": 9960 + }, + { + "epoch": 1.2791891198357712, + "grad_norm": 34.26566697018427, + "learning_rate": 3.0367855812539247e-07, + "loss": 0.3781, + "step": 9970 + }, + { + "epoch": 1.2804721580703105, + "grad_norm": 25.029286527623093, + "learning_rate": 3.0272352804232617e-07, + "loss": 0.3964, + "step": 9980 + }, + { + "epoch": 1.28175519630485, + "grad_norm": 25.297213761677014, + "learning_rate": 3.0176934962916125e-07, + "loss": 0.3999, + "step": 9990 + }, + { + "epoch": 1.2830382345393891, + "grad_norm": 18.941914712669643, + "learning_rate": 3.008160270052184e-07, + "loss": 0.4161, + "step": 10000 + }, + { + "epoch": 1.2843212727739286, + "grad_norm": 20.441866870704384, + "learning_rate": 2.9986356428612386e-07, + "loss": 0.4026, + "step": 10010 + }, + { + "epoch": 1.285604311008468, + "grad_norm": 22.18790274111227, + "learning_rate": 2.9891196558379126e-07, + "loss": 0.3989, + "step": 10020 + }, + { + "epoch": 1.2868873492430075, + "grad_norm": 21.68198809308786, + "learning_rate": 2.979612350064042e-07, + "loss": 0.4137, + "step": 10030 + }, + { + "epoch": 1.2881703874775468, + "grad_norm": 28.260767528121633, + "learning_rate": 2.970113766583983e-07, + "loss": 0.4279, + "step": 10040 + }, + { + "epoch": 1.2894534257120862, + "grad_norm": 29.541300623799426, + "learning_rate": 2.960623946404442e-07, + "loss": 0.4269, + "step": 10050 + }, + { + "epoch": 1.2907364639466257, + "grad_norm": 31.11008953602184, + "learning_rate": 2.951142930494288e-07, + "loss": 0.4274, + "step": 10060 + }, + { + "epoch": 1.292019502181165, + "grad_norm": 32.90890920202704, + "learning_rate": 2.94167075978438e-07, + "loss": 0.4048, + "step": 10070 + }, + { + "epoch": 1.2933025404157044, + "grad_norm": 22.364805434278484, + "learning_rate": 2.9322074751673974e-07, + "loss": 0.4253, + "step": 10080 + }, + { + "epoch": 1.2945855786502438, + "grad_norm": 24.877820704440825, + "learning_rate": 2.922753117497649e-07, + "loss": 0.4238, + "step": 10090 + }, + { + "epoch": 1.2958686168847833, + "grad_norm": 15.522379559775498, + "learning_rate": 2.9133077275909107e-07, + "loss": 0.3711, + "step": 10100 + }, + { + "epoch": 1.2971516551193225, + "grad_norm": 24.3579842760168, + "learning_rate": 2.9038713462242414e-07, + "loss": 0.3737, + "step": 10110 + }, + { + "epoch": 1.298434693353862, + "grad_norm": 15.437075347077805, + "learning_rate": 2.8944440141358073e-07, + "loss": 0.395, + "step": 10120 + }, + { + "epoch": 1.2997177315884012, + "grad_norm": 22.077332961045705, + "learning_rate": 2.885025772024709e-07, + "loss": 0.3999, + "step": 10130 + }, + { + "epoch": 1.3010007698229407, + "grad_norm": 25.355772889681077, + "learning_rate": 2.8756166605508083e-07, + "loss": 0.4031, + "step": 10140 + }, + { + "epoch": 1.3022838080574801, + "grad_norm": 24.663552708456876, + "learning_rate": 2.8662167203345413e-07, + "loss": 0.3998, + "step": 10150 + }, + { + "epoch": 1.3035668462920196, + "grad_norm": 16.63326992121582, + "learning_rate": 2.8568259919567526e-07, + "loss": 0.365, + "step": 10160 + }, + { + "epoch": 1.3048498845265588, + "grad_norm": 17.973255239659736, + "learning_rate": 2.847444515958523e-07, + "loss": 0.3944, + "step": 10170 + }, + { + "epoch": 1.3061329227610983, + "grad_norm": 29.064381855798906, + "learning_rate": 2.838072332840987e-07, + "loss": 0.4003, + "step": 10180 + }, + { + "epoch": 1.3074159609956377, + "grad_norm": 36.69546537161767, + "learning_rate": 2.828709483065157e-07, + "loss": 0.3984, + "step": 10190 + }, + { + "epoch": 1.308698999230177, + "grad_norm": 21.57559563207781, + "learning_rate": 2.819356007051753e-07, + "loss": 0.3925, + "step": 10200 + }, + { + "epoch": 1.3099820374647164, + "grad_norm": 26.37400357388259, + "learning_rate": 2.8100119451810335e-07, + "loss": 0.413, + "step": 10210 + }, + { + "epoch": 1.3112650756992559, + "grad_norm": 25.87251472816358, + "learning_rate": 2.800677337792604e-07, + "loss": 0.381, + "step": 10220 + }, + { + "epoch": 1.3125481139337953, + "grad_norm": 20.686412049445575, + "learning_rate": 2.791352225185266e-07, + "loss": 0.4205, + "step": 10230 + }, + { + "epoch": 1.3138311521683346, + "grad_norm": 22.157549254056942, + "learning_rate": 2.7820366476168225e-07, + "loss": 0.3895, + "step": 10240 + }, + { + "epoch": 1.315114190402874, + "grad_norm": 22.414927276663704, + "learning_rate": 2.77273064530391e-07, + "loss": 0.373, + "step": 10250 + }, + { + "epoch": 1.3163972286374133, + "grad_norm": 35.997650174732435, + "learning_rate": 2.763434258421836e-07, + "loss": 0.416, + "step": 10260 + }, + { + "epoch": 1.3176802668719527, + "grad_norm": 38.2188609004921, + "learning_rate": 2.7541475271043944e-07, + "loss": 0.4103, + "step": 10270 + }, + { + "epoch": 1.3189633051064922, + "grad_norm": 29.699727547746814, + "learning_rate": 2.744870491443687e-07, + "loss": 0.4084, + "step": 10280 + }, + { + "epoch": 1.3202463433410316, + "grad_norm": 25.789561864954347, + "learning_rate": 2.7356031914899703e-07, + "loss": 0.3879, + "step": 10290 + }, + { + "epoch": 1.3215293815755709, + "grad_norm": 28.110691291026626, + "learning_rate": 2.726345667251461e-07, + "loss": 0.4149, + "step": 10300 + }, + { + "epoch": 1.3228124198101103, + "grad_norm": 33.13590019722009, + "learning_rate": 2.717097958694172e-07, + "loss": 0.3875, + "step": 10310 + }, + { + "epoch": 1.3240954580446498, + "grad_norm": 20.63676016977155, + "learning_rate": 2.707860105741749e-07, + "loss": 0.3952, + "step": 10320 + }, + { + "epoch": 1.325378496279189, + "grad_norm": 22.92723165656273, + "learning_rate": 2.698632148275286e-07, + "loss": 0.3691, + "step": 10330 + }, + { + "epoch": 1.3266615345137285, + "grad_norm": 34.112715884701764, + "learning_rate": 2.689414126133154e-07, + "loss": 0.4207, + "step": 10340 + }, + { + "epoch": 1.327944572748268, + "grad_norm": 27.745076162173604, + "learning_rate": 2.68020607911083e-07, + "loss": 0.402, + "step": 10350 + }, + { + "epoch": 1.3292276109828074, + "grad_norm": 19.921433955597458, + "learning_rate": 2.671008046960734e-07, + "loss": 0.3827, + "step": 10360 + }, + { + "epoch": 1.3305106492173466, + "grad_norm": 32.66965912213876, + "learning_rate": 2.661820069392049e-07, + "loss": 0.3932, + "step": 10370 + }, + { + "epoch": 1.331793687451886, + "grad_norm": 28.128951567667077, + "learning_rate": 2.6526421860705473e-07, + "loss": 0.4157, + "step": 10380 + }, + { + "epoch": 1.3330767256864253, + "grad_norm": 23.71103603368344, + "learning_rate": 2.6434744366184216e-07, + "loss": 0.4031, + "step": 10390 + }, + { + "epoch": 1.3343597639209648, + "grad_norm": 18.48914362746193, + "learning_rate": 2.634316860614123e-07, + "loss": 0.3983, + "step": 10400 + }, + { + "epoch": 1.3356428021555042, + "grad_norm": 35.93621826217056, + "learning_rate": 2.6251694975921733e-07, + "loss": 0.383, + "step": 10410 + }, + { + "epoch": 1.3369258403900437, + "grad_norm": 37.701384733209984, + "learning_rate": 2.6160323870430107e-07, + "loss": 0.3973, + "step": 10420 + }, + { + "epoch": 1.338208878624583, + "grad_norm": 24.703671581680478, + "learning_rate": 2.6069055684128106e-07, + "loss": 0.4435, + "step": 10430 + }, + { + "epoch": 1.3394919168591224, + "grad_norm": 22.99771023361343, + "learning_rate": 2.597789081103313e-07, + "loss": 0.3966, + "step": 10440 + }, + { + "epoch": 1.3407749550936618, + "grad_norm": 19.237542715960384, + "learning_rate": 2.588682964471657e-07, + "loss": 0.4429, + "step": 10450 + }, + { + "epoch": 1.342057993328201, + "grad_norm": 17.784894820076364, + "learning_rate": 2.579587257830216e-07, + "loss": 0.3597, + "step": 10460 + }, + { + "epoch": 1.3433410315627405, + "grad_norm": 22.668656129767843, + "learning_rate": 2.5705020004464115e-07, + "loss": 0.4064, + "step": 10470 + }, + { + "epoch": 1.34462406979728, + "grad_norm": 29.312644318878057, + "learning_rate": 2.5614272315425676e-07, + "loss": 0.3895, + "step": 10480 + }, + { + "epoch": 1.3459071080318195, + "grad_norm": 19.166600699257906, + "learning_rate": 2.552362990295714e-07, + "loss": 0.3908, + "step": 10490 + }, + { + "epoch": 1.3471901462663587, + "grad_norm": 28.606668953741412, + "learning_rate": 2.5433093158374437e-07, + "loss": 0.3858, + "step": 10500 + }, + { + "epoch": 1.3484731845008981, + "grad_norm": 20.09889150935428, + "learning_rate": 2.53426624725372e-07, + "loss": 0.4053, + "step": 10510 + }, + { + "epoch": 1.3497562227354374, + "grad_norm": 37.066587349225394, + "learning_rate": 2.52523382358473e-07, + "loss": 0.4283, + "step": 10520 + }, + { + "epoch": 1.3510392609699768, + "grad_norm": 15.274151945929399, + "learning_rate": 2.516212083824697e-07, + "loss": 0.4159, + "step": 10530 + }, + { + "epoch": 1.3523222992045163, + "grad_norm": 18.004359702101908, + "learning_rate": 2.5072010669217216e-07, + "loss": 0.4173, + "step": 10540 + }, + { + "epoch": 1.3536053374390558, + "grad_norm": 40.456507756314736, + "learning_rate": 2.4982008117776154e-07, + "loss": 0.381, + "step": 10550 + }, + { + "epoch": 1.354888375673595, + "grad_norm": 22.804040477335263, + "learning_rate": 2.489211357247732e-07, + "loss": 0.3839, + "step": 10560 + }, + { + "epoch": 1.3561714139081344, + "grad_norm": 21.148667452003657, + "learning_rate": 2.480232742140789e-07, + "loss": 0.3985, + "step": 10570 + }, + { + "epoch": 1.357454452142674, + "grad_norm": 21.629181782104915, + "learning_rate": 2.4712650052187174e-07, + "loss": 0.4181, + "step": 10580 + }, + { + "epoch": 1.3587374903772131, + "grad_norm": 22.411145686874654, + "learning_rate": 2.4623081851964805e-07, + "loss": 0.386, + "step": 10590 + }, + { + "epoch": 1.3600205286117526, + "grad_norm": 40.03783191366064, + "learning_rate": 2.453362320741911e-07, + "loss": 0.3892, + "step": 10600 + }, + { + "epoch": 1.361303566846292, + "grad_norm": 32.904031664393365, + "learning_rate": 2.444427450475548e-07, + "loss": 0.3966, + "step": 10610 + }, + { + "epoch": 1.3625866050808315, + "grad_norm": 16.11725564560703, + "learning_rate": 2.4355036129704696e-07, + "loss": 0.4048, + "step": 10620 + }, + { + "epoch": 1.3638696433153707, + "grad_norm": 23.27595746158021, + "learning_rate": 2.426590846752117e-07, + "loss": 0.354, + "step": 10630 + }, + { + "epoch": 1.3651526815499102, + "grad_norm": 30.305988595177595, + "learning_rate": 2.4176891902981384e-07, + "loss": 0.4081, + "step": 10640 + }, + { + "epoch": 1.3664357197844494, + "grad_norm": 27.816639365147424, + "learning_rate": 2.40879868203822e-07, + "loss": 0.3811, + "step": 10650 + }, + { + "epoch": 1.367718758018989, + "grad_norm": 26.032011368490924, + "learning_rate": 2.399919360353923e-07, + "loss": 0.409, + "step": 10660 + }, + { + "epoch": 1.3690017962535284, + "grad_norm": 26.38851578877664, + "learning_rate": 2.391051263578508e-07, + "loss": 0.367, + "step": 10670 + }, + { + "epoch": 1.3702848344880678, + "grad_norm": 20.15031022885454, + "learning_rate": 2.3821944299967777e-07, + "loss": 0.3739, + "step": 10680 + }, + { + "epoch": 1.371567872722607, + "grad_norm": 20.473361037837392, + "learning_rate": 2.373348897844915e-07, + "loss": 0.3883, + "step": 10690 + }, + { + "epoch": 1.3728509109571465, + "grad_norm": 27.03062192978216, + "learning_rate": 2.364514705310307e-07, + "loss": 0.3888, + "step": 10700 + }, + { + "epoch": 1.374133949191686, + "grad_norm": 26.366317966969373, + "learning_rate": 2.3556918905313894e-07, + "loss": 0.4095, + "step": 10710 + }, + { + "epoch": 1.3754169874262252, + "grad_norm": 20.084024897765946, + "learning_rate": 2.3468804915974793e-07, + "loss": 0.4093, + "step": 10720 + }, + { + "epoch": 1.3767000256607647, + "grad_norm": 22.550595022559712, + "learning_rate": 2.3380805465486082e-07, + "loss": 0.3573, + "step": 10730 + }, + { + "epoch": 1.3779830638953041, + "grad_norm": 20.496421954056295, + "learning_rate": 2.329292093375356e-07, + "loss": 0.3938, + "step": 10740 + }, + { + "epoch": 1.3792661021298436, + "grad_norm": 32.1539912522411, + "learning_rate": 2.3205151700186997e-07, + "loss": 0.4118, + "step": 10750 + }, + { + "epoch": 1.3805491403643828, + "grad_norm": 17.243621747711178, + "learning_rate": 2.3117498143698312e-07, + "loss": 0.382, + "step": 10760 + }, + { + "epoch": 1.3818321785989223, + "grad_norm": 25.62516046456443, + "learning_rate": 2.3029960642700097e-07, + "loss": 0.4266, + "step": 10770 + }, + { + "epoch": 1.3831152168334615, + "grad_norm": 15.563907196288786, + "learning_rate": 2.2942539575103887e-07, + "loss": 0.3897, + "step": 10780 + }, + { + "epoch": 1.384398255068001, + "grad_norm": 30.31507325185448, + "learning_rate": 2.2855235318318533e-07, + "loss": 0.3936, + "step": 10790 + }, + { + "epoch": 1.3856812933025404, + "grad_norm": 19.182559534602817, + "learning_rate": 2.2768048249248644e-07, + "loss": 0.3857, + "step": 10800 + }, + { + "epoch": 1.3869643315370799, + "grad_norm": 20.856364548129047, + "learning_rate": 2.2680978744292912e-07, + "loss": 0.4052, + "step": 10810 + }, + { + "epoch": 1.3882473697716193, + "grad_norm": 32.76925723481884, + "learning_rate": 2.2594027179342458e-07, + "loss": 0.4248, + "step": 10820 + }, + { + "epoch": 1.3895304080061586, + "grad_norm": 27.720128305671253, + "learning_rate": 2.2507193929779223e-07, + "loss": 0.3938, + "step": 10830 + }, + { + "epoch": 1.390813446240698, + "grad_norm": 24.738635079957454, + "learning_rate": 2.242047937047442e-07, + "loss": 0.4418, + "step": 10840 + }, + { + "epoch": 1.3920964844752373, + "grad_norm": 14.894204608906772, + "learning_rate": 2.2333883875786858e-07, + "loss": 0.3924, + "step": 10850 + }, + { + "epoch": 1.3933795227097767, + "grad_norm": 21.451323803194075, + "learning_rate": 2.2247407819561255e-07, + "loss": 0.3844, + "step": 10860 + }, + { + "epoch": 1.3946625609443162, + "grad_norm": 19.604389620846458, + "learning_rate": 2.2161051575126782e-07, + "loss": 0.3826, + "step": 10870 + }, + { + "epoch": 1.3959455991788556, + "grad_norm": 26.527778772120524, + "learning_rate": 2.207481551529531e-07, + "loss": 0.418, + "step": 10880 + }, + { + "epoch": 1.3972286374133949, + "grad_norm": 36.67360797571187, + "learning_rate": 2.198870001235986e-07, + "loss": 0.4423, + "step": 10890 + }, + { + "epoch": 1.3985116756479343, + "grad_norm": 20.26508382561515, + "learning_rate": 2.1902705438093028e-07, + "loss": 0.39, + "step": 10900 + }, + { + "epoch": 1.3997947138824736, + "grad_norm": 17.64944774681594, + "learning_rate": 2.181683216374533e-07, + "loss": 0.3836, + "step": 10910 + }, + { + "epoch": 1.401077752117013, + "grad_norm": 47.619041665351915, + "learning_rate": 2.1731080560043597e-07, + "loss": 0.3936, + "step": 10920 + }, + { + "epoch": 1.4023607903515525, + "grad_norm": 18.146577436885924, + "learning_rate": 2.164545099718938e-07, + "loss": 0.4093, + "step": 10930 + }, + { + "epoch": 1.403643828586092, + "grad_norm": 23.277027267205657, + "learning_rate": 2.155994384485742e-07, + "loss": 0.3724, + "step": 10940 + }, + { + "epoch": 1.4049268668206314, + "grad_norm": 28.987291410687213, + "learning_rate": 2.147455947219392e-07, + "loss": 0.3752, + "step": 10950 + }, + { + "epoch": 1.4062099050551706, + "grad_norm": 23.556433488305483, + "learning_rate": 2.13892982478151e-07, + "loss": 0.3698, + "step": 10960 + }, + { + "epoch": 1.40749294328971, + "grad_norm": 25.008030355840507, + "learning_rate": 2.130416053980546e-07, + "loss": 0.3907, + "step": 10970 + }, + { + "epoch": 1.4087759815242493, + "grad_norm": 26.852653752440386, + "learning_rate": 2.121914671571633e-07, + "loss": 0.4407, + "step": 10980 + }, + { + "epoch": 1.4100590197587888, + "grad_norm": 19.265352729972975, + "learning_rate": 2.1134257142564154e-07, + "loss": 0.3804, + "step": 10990 + }, + { + "epoch": 1.4113420579933282, + "grad_norm": 17.064953977760673, + "learning_rate": 2.104949218682902e-07, + "loss": 0.382, + "step": 11000 + }, + { + "epoch": 1.4126250962278677, + "grad_norm": 24.07692029014715, + "learning_rate": 2.096485221445301e-07, + "loss": 0.4365, + "step": 11010 + }, + { + "epoch": 1.413908134462407, + "grad_norm": 20.160084158704755, + "learning_rate": 2.0880337590838614e-07, + "loss": 0.3743, + "step": 11020 + }, + { + "epoch": 1.4151911726969464, + "grad_norm": 33.28710029110148, + "learning_rate": 2.079594868084718e-07, + "loss": 0.4071, + "step": 11030 + }, + { + "epoch": 1.4164742109314856, + "grad_norm": 30.178207538976444, + "learning_rate": 2.0711685848797362e-07, + "loss": 0.329, + "step": 11040 + }, + { + "epoch": 1.417757249166025, + "grad_norm": 27.26301260370644, + "learning_rate": 2.0627549458463472e-07, + "loss": 0.384, + "step": 11050 + }, + { + "epoch": 1.4190402874005645, + "grad_norm": 25.85708622864269, + "learning_rate": 2.0543539873074017e-07, + "loss": 0.3957, + "step": 11060 + }, + { + "epoch": 1.420323325635104, + "grad_norm": 38.67107266806456, + "learning_rate": 2.0459657455310008e-07, + "loss": 0.3833, + "step": 11070 + }, + { + "epoch": 1.4216063638696435, + "grad_norm": 17.538805880078357, + "learning_rate": 2.037590256730347e-07, + "loss": 0.3864, + "step": 11080 + }, + { + "epoch": 1.4228894021041827, + "grad_norm": 20.36950049745968, + "learning_rate": 2.0292275570635891e-07, + "loss": 0.3701, + "step": 11090 + }, + { + "epoch": 1.4241724403387221, + "grad_norm": 15.480173920375263, + "learning_rate": 2.0208776826336616e-07, + "loss": 0.3853, + "step": 11100 + }, + { + "epoch": 1.4254554785732614, + "grad_norm": 28.742970207428485, + "learning_rate": 2.0125406694881352e-07, + "loss": 0.3933, + "step": 11110 + }, + { + "epoch": 1.4267385168078008, + "grad_norm": 30.951279114205274, + "learning_rate": 2.0042165536190447e-07, + "loss": 0.3507, + "step": 11120 + }, + { + "epoch": 1.4280215550423403, + "grad_norm": 22.683724259664746, + "learning_rate": 1.9959053709627572e-07, + "loss": 0.3736, + "step": 11130 + }, + { + "epoch": 1.4293045932768798, + "grad_norm": 27.03219831114418, + "learning_rate": 1.9876071573998033e-07, + "loss": 0.4199, + "step": 11140 + }, + { + "epoch": 1.430587631511419, + "grad_norm": 21.545236233948497, + "learning_rate": 1.979321948754718e-07, + "loss": 0.3956, + "step": 11150 + }, + { + "epoch": 1.4318706697459584, + "grad_norm": 38.396809633594906, + "learning_rate": 1.971049780795901e-07, + "loss": 0.3735, + "step": 11160 + }, + { + "epoch": 1.4331537079804977, + "grad_norm": 28.21297512365461, + "learning_rate": 1.9627906892354468e-07, + "loss": 0.4316, + "step": 11170 + }, + { + "epoch": 1.4344367462150371, + "grad_norm": 21.106349632440768, + "learning_rate": 1.954544709728998e-07, + "loss": 0.4116, + "step": 11180 + }, + { + "epoch": 1.4357197844495766, + "grad_norm": 24.730760558685557, + "learning_rate": 1.9463118778755944e-07, + "loss": 0.4143, + "step": 11190 + }, + { + "epoch": 1.437002822684116, + "grad_norm": 18.622611519834248, + "learning_rate": 1.938092229217515e-07, + "loss": 0.4001, + "step": 11200 + }, + { + "epoch": 1.4382858609186555, + "grad_norm": 19.488178119830017, + "learning_rate": 1.9298857992401214e-07, + "loss": 0.3749, + "step": 11210 + }, + { + "epoch": 1.4395688991531947, + "grad_norm": 29.15072520606126, + "learning_rate": 1.9216926233717084e-07, + "loss": 0.3799, + "step": 11220 + }, + { + "epoch": 1.4408519373877342, + "grad_norm": 21.728295511701837, + "learning_rate": 1.9135127369833575e-07, + "loss": 0.3628, + "step": 11230 + }, + { + "epoch": 1.4421349756222734, + "grad_norm": 24.03144669042725, + "learning_rate": 1.9053461753887695e-07, + "loss": 0.3945, + "step": 11240 + }, + { + "epoch": 1.443418013856813, + "grad_norm": 22.5922765056965, + "learning_rate": 1.8971929738441272e-07, + "loss": 0.4005, + "step": 11250 + }, + { + "epoch": 1.4447010520913524, + "grad_norm": 29.231783438287007, + "learning_rate": 1.8890531675479293e-07, + "loss": 0.401, + "step": 11260 + }, + { + "epoch": 1.4459840903258918, + "grad_norm": 37.249140960221965, + "learning_rate": 1.8809267916408528e-07, + "loss": 0.4032, + "step": 11270 + }, + { + "epoch": 1.447267128560431, + "grad_norm": 14.423301278042098, + "learning_rate": 1.872813881205586e-07, + "loss": 0.4113, + "step": 11280 + }, + { + "epoch": 1.4485501667949705, + "grad_norm": 22.10035621104487, + "learning_rate": 1.8647144712666908e-07, + "loss": 0.431, + "step": 11290 + }, + { + "epoch": 1.4498332050295097, + "grad_norm": 21.377396017009996, + "learning_rate": 1.856628596790446e-07, + "loss": 0.4066, + "step": 11300 + }, + { + "epoch": 1.4511162432640492, + "grad_norm": 30.618071846215784, + "learning_rate": 1.8485562926846916e-07, + "loss": 0.3886, + "step": 11310 + }, + { + "epoch": 1.4523992814985887, + "grad_norm": 19.295121279601283, + "learning_rate": 1.8404975937986821e-07, + "loss": 0.4015, + "step": 11320 + }, + { + "epoch": 1.4536823197331281, + "grad_norm": 19.988030382612646, + "learning_rate": 1.832452534922943e-07, + "loss": 0.4093, + "step": 11330 + }, + { + "epoch": 1.4549653579676676, + "grad_norm": 24.68872294963879, + "learning_rate": 1.824421150789106e-07, + "loss": 0.4106, + "step": 11340 + }, + { + "epoch": 1.4562483962022068, + "grad_norm": 33.58517414505513, + "learning_rate": 1.8164034760697745e-07, + "loss": 0.4007, + "step": 11350 + }, + { + "epoch": 1.4575314344367463, + "grad_norm": 21.391624505189938, + "learning_rate": 1.8083995453783603e-07, + "loss": 0.4016, + "step": 11360 + }, + { + "epoch": 1.4588144726712855, + "grad_norm": 18.163805404628366, + "learning_rate": 1.8004093932689414e-07, + "loss": 0.4364, + "step": 11370 + }, + { + "epoch": 1.460097510905825, + "grad_norm": 22.30677581576104, + "learning_rate": 1.7924330542361148e-07, + "loss": 0.3856, + "step": 11380 + }, + { + "epoch": 1.4613805491403644, + "grad_norm": 20.01222553294679, + "learning_rate": 1.784470562714845e-07, + "loss": 0.3664, + "step": 11390 + }, + { + "epoch": 1.4626635873749039, + "grad_norm": 19.016297570080354, + "learning_rate": 1.77652195308031e-07, + "loss": 0.3881, + "step": 11400 + }, + { + "epoch": 1.463946625609443, + "grad_norm": 17.127757781158703, + "learning_rate": 1.7685872596477592e-07, + "loss": 0.3901, + "step": 11410 + }, + { + "epoch": 1.4652296638439826, + "grad_norm": 33.12776745376937, + "learning_rate": 1.7606665166723672e-07, + "loss": 0.4244, + "step": 11420 + }, + { + "epoch": 1.4665127020785218, + "grad_norm": 17.73700411532125, + "learning_rate": 1.7527597583490823e-07, + "loss": 0.3769, + "step": 11430 + }, + { + "epoch": 1.4677957403130613, + "grad_norm": 34.628654558439926, + "learning_rate": 1.7448670188124725e-07, + "loss": 0.3817, + "step": 11440 + }, + { + "epoch": 1.4690787785476007, + "grad_norm": 19.340249016694035, + "learning_rate": 1.736988332136594e-07, + "loss": 0.4047, + "step": 11450 + }, + { + "epoch": 1.4703618167821402, + "grad_norm": 21.376179004541353, + "learning_rate": 1.7291237323348284e-07, + "loss": 0.4083, + "step": 11460 + }, + { + "epoch": 1.4716448550166796, + "grad_norm": 25.207506711150664, + "learning_rate": 1.721273253359743e-07, + "loss": 0.4002, + "step": 11470 + }, + { + "epoch": 1.4729278932512189, + "grad_norm": 28.88785265179978, + "learning_rate": 1.7134369291029455e-07, + "loss": 0.3628, + "step": 11480 + }, + { + "epoch": 1.4742109314857583, + "grad_norm": 22.398847194039966, + "learning_rate": 1.7056147933949377e-07, + "loss": 0.3696, + "step": 11490 + }, + { + "epoch": 1.4754939697202976, + "grad_norm": 25.061018634329578, + "learning_rate": 1.697806880004962e-07, + "loss": 0.3926, + "step": 11500 + }, + { + "epoch": 1.476777007954837, + "grad_norm": 22.9045340776861, + "learning_rate": 1.6900132226408637e-07, + "loss": 0.402, + "step": 11510 + }, + { + "epoch": 1.4780600461893765, + "grad_norm": 23.14052088238419, + "learning_rate": 1.6822338549489446e-07, + "loss": 0.3662, + "step": 11520 + }, + { + "epoch": 1.479343084423916, + "grad_norm": 28.36917674545169, + "learning_rate": 1.6744688105138123e-07, + "loss": 0.3724, + "step": 11530 + }, + { + "epoch": 1.4806261226584552, + "grad_norm": 20.630020652722823, + "learning_rate": 1.666718122858244e-07, + "loss": 0.3791, + "step": 11540 + }, + { + "epoch": 1.4819091608929946, + "grad_norm": 27.814738801331465, + "learning_rate": 1.658981825443032e-07, + "loss": 0.445, + "step": 11550 + }, + { + "epoch": 1.483192199127534, + "grad_norm": 18.4063241689024, + "learning_rate": 1.651259951666844e-07, + "loss": 0.4008, + "step": 11560 + }, + { + "epoch": 1.4844752373620733, + "grad_norm": 19.066620384349996, + "learning_rate": 1.6435525348660824e-07, + "loss": 0.3711, + "step": 11570 + }, + { + "epoch": 1.4857582755966128, + "grad_norm": 16.372964510870585, + "learning_rate": 1.635859608314734e-07, + "loss": 0.382, + "step": 11580 + }, + { + "epoch": 1.4870413138311522, + "grad_norm": 24.289927137226588, + "learning_rate": 1.6281812052242333e-07, + "loss": 0.3706, + "step": 11590 + }, + { + "epoch": 1.4883243520656917, + "grad_norm": 29.119547430068643, + "learning_rate": 1.620517358743309e-07, + "loss": 0.3864, + "step": 11600 + }, + { + "epoch": 1.489607390300231, + "grad_norm": 26.854493537815245, + "learning_rate": 1.612868101957849e-07, + "loss": 0.3963, + "step": 11610 + }, + { + "epoch": 1.4908904285347704, + "grad_norm": 21.23695361363263, + "learning_rate": 1.605233467890758e-07, + "loss": 0.4087, + "step": 11620 + }, + { + "epoch": 1.4921734667693096, + "grad_norm": 18.799100837107773, + "learning_rate": 1.597613489501809e-07, + "loss": 0.3768, + "step": 11630 + }, + { + "epoch": 1.493456505003849, + "grad_norm": 18.190005518871427, + "learning_rate": 1.590008199687508e-07, + "loss": 0.3894, + "step": 11640 + }, + { + "epoch": 1.4947395432383885, + "grad_norm": 24.40585257773337, + "learning_rate": 1.582417631280945e-07, + "loss": 0.4001, + "step": 11650 + }, + { + "epoch": 1.496022581472928, + "grad_norm": 23.340935706411244, + "learning_rate": 1.5748418170516557e-07, + "loss": 0.4049, + "step": 11660 + }, + { + "epoch": 1.4973056197074672, + "grad_norm": 22.068974324537418, + "learning_rate": 1.567280789705483e-07, + "loss": 0.3748, + "step": 11670 + }, + { + "epoch": 1.4985886579420067, + "grad_norm": 26.63910503829797, + "learning_rate": 1.5597345818844322e-07, + "loss": 0.3958, + "step": 11680 + }, + { + "epoch": 1.4998716961765461, + "grad_norm": 27.355543398243178, + "learning_rate": 1.552203226166528e-07, + "loss": 0.3766, + "step": 11690 + }, + { + "epoch": 1.5011547344110854, + "grad_norm": 34.418880386646826, + "learning_rate": 1.5446867550656767e-07, + "loss": 0.4015, + "step": 11700 + }, + { + "epoch": 1.5024377726456248, + "grad_norm": 38.15776088728365, + "learning_rate": 1.53718520103153e-07, + "loss": 0.3969, + "step": 11710 + }, + { + "epoch": 1.5037208108801643, + "grad_norm": 24.199225812579275, + "learning_rate": 1.5296985964493342e-07, + "loss": 0.4168, + "step": 11720 + }, + { + "epoch": 1.5050038491147038, + "grad_norm": 20.925974368919082, + "learning_rate": 1.5222269736398013e-07, + "loss": 0.3873, + "step": 11730 + }, + { + "epoch": 1.506286887349243, + "grad_norm": 31.086905725420486, + "learning_rate": 1.514770364858966e-07, + "loss": 0.4118, + "step": 11740 + }, + { + "epoch": 1.5075699255837824, + "grad_norm": 26.522538069074805, + "learning_rate": 1.5073288022980413e-07, + "loss": 0.373, + "step": 11750 + }, + { + "epoch": 1.5088529638183217, + "grad_norm": 32.467771929045135, + "learning_rate": 1.499902318083283e-07, + "loss": 0.3896, + "step": 11760 + }, + { + "epoch": 1.5101360020528611, + "grad_norm": 13.269381189155448, + "learning_rate": 1.4924909442758566e-07, + "loss": 0.3949, + "step": 11770 + }, + { + "epoch": 1.5114190402874006, + "grad_norm": 21.26872685679763, + "learning_rate": 1.4850947128716911e-07, + "loss": 0.396, + "step": 11780 + }, + { + "epoch": 1.51270207852194, + "grad_norm": 17.80120282778252, + "learning_rate": 1.4777136558013443e-07, + "loss": 0.4186, + "step": 11790 + }, + { + "epoch": 1.5139851167564793, + "grad_norm": 15.53669814398538, + "learning_rate": 1.47034780492986e-07, + "loss": 0.4184, + "step": 11800 + }, + { + "epoch": 1.5152681549910187, + "grad_norm": 24.4956799893459, + "learning_rate": 1.4629971920566425e-07, + "loss": 0.3848, + "step": 11810 + }, + { + "epoch": 1.516551193225558, + "grad_norm": 46.64185287967465, + "learning_rate": 1.455661848915305e-07, + "loss": 0.3718, + "step": 11820 + }, + { + "epoch": 1.5178342314600974, + "grad_norm": 18.911055008289814, + "learning_rate": 1.4483418071735432e-07, + "loss": 0.4009, + "step": 11830 + }, + { + "epoch": 1.519117269694637, + "grad_norm": 26.72809885524814, + "learning_rate": 1.4410370984329929e-07, + "loss": 0.3793, + "step": 11840 + }, + { + "epoch": 1.5204003079291764, + "grad_norm": 25.94111818777712, + "learning_rate": 1.4337477542290926e-07, + "loss": 0.4047, + "step": 11850 + }, + { + "epoch": 1.5216833461637158, + "grad_norm": 23.66275292708834, + "learning_rate": 1.426473806030955e-07, + "loss": 0.387, + "step": 11860 + }, + { + "epoch": 1.522966384398255, + "grad_norm": 19.93096312027027, + "learning_rate": 1.4192152852412247e-07, + "loss": 0.4187, + "step": 11870 + }, + { + "epoch": 1.5242494226327945, + "grad_norm": 21.28257627707498, + "learning_rate": 1.4119722231959403e-07, + "loss": 0.3797, + "step": 11880 + }, + { + "epoch": 1.5255324608673337, + "grad_norm": 32.7495390085428, + "learning_rate": 1.4047446511644084e-07, + "loss": 0.3959, + "step": 11890 + }, + { + "epoch": 1.5268154991018732, + "grad_norm": 15.868788441233063, + "learning_rate": 1.397532600349058e-07, + "loss": 0.4149, + "step": 11900 + }, + { + "epoch": 1.5280985373364127, + "grad_norm": 15.436646458287159, + "learning_rate": 1.390336101885315e-07, + "loss": 0.3751, + "step": 11910 + }, + { + "epoch": 1.5293815755709521, + "grad_norm": 14.170084855847644, + "learning_rate": 1.3831551868414597e-07, + "loss": 0.3683, + "step": 11920 + }, + { + "epoch": 1.5306646138054916, + "grad_norm": 23.654070553054723, + "learning_rate": 1.3759898862185016e-07, + "loss": 0.42, + "step": 11930 + }, + { + "epoch": 1.5319476520400308, + "grad_norm": 31.67802720325113, + "learning_rate": 1.368840230950035e-07, + "loss": 0.4158, + "step": 11940 + }, + { + "epoch": 1.53323069027457, + "grad_norm": 25.213581593916828, + "learning_rate": 1.3617062519021144e-07, + "loss": 0.4035, + "step": 11950 + }, + { + "epoch": 1.5345137285091095, + "grad_norm": 22.442263375806007, + "learning_rate": 1.3545879798731164e-07, + "loss": 0.4054, + "step": 11960 + }, + { + "epoch": 1.535796766743649, + "grad_norm": 21.43096147821901, + "learning_rate": 1.3474854455936123e-07, + "loss": 0.4454, + "step": 11970 + }, + { + "epoch": 1.5370798049781884, + "grad_norm": 15.665957167005535, + "learning_rate": 1.3403986797262252e-07, + "loss": 0.4058, + "step": 11980 + }, + { + "epoch": 1.5383628432127279, + "grad_norm": 20.248174699585906, + "learning_rate": 1.3333277128655062e-07, + "loss": 0.3897, + "step": 11990 + }, + { + "epoch": 1.539645881447267, + "grad_norm": 16.599063815417047, + "learning_rate": 1.326272575537803e-07, + "loss": 0.381, + "step": 12000 + }, + { + "epoch": 1.5409289196818066, + "grad_norm": 22.799452836148312, + "learning_rate": 1.319233298201119e-07, + "loss": 0.4013, + "step": 12010 + }, + { + "epoch": 1.5422119579163458, + "grad_norm": 27.135984307134915, + "learning_rate": 1.3122099112449926e-07, + "loss": 0.4131, + "step": 12020 + }, + { + "epoch": 1.5434949961508853, + "grad_norm": 18.924438459912086, + "learning_rate": 1.305202444990362e-07, + "loss": 0.3761, + "step": 12030 + }, + { + "epoch": 1.5447780343854247, + "grad_norm": 33.882971938522346, + "learning_rate": 1.298210929689429e-07, + "loss": 0.4118, + "step": 12040 + }, + { + "epoch": 1.5460610726199642, + "grad_norm": 16.516338961024186, + "learning_rate": 1.2912353955255345e-07, + "loss": 0.3836, + "step": 12050 + }, + { + "epoch": 1.5473441108545036, + "grad_norm": 18.061750914370545, + "learning_rate": 1.284275872613028e-07, + "loss": 0.3706, + "step": 12060 + }, + { + "epoch": 1.5486271490890429, + "grad_norm": 31.107587705790845, + "learning_rate": 1.277332390997138e-07, + "loss": 0.401, + "step": 12070 + }, + { + "epoch": 1.549910187323582, + "grad_norm": 21.79619453260265, + "learning_rate": 1.270404980653836e-07, + "loss": 0.3894, + "step": 12080 + }, + { + "epoch": 1.5511932255581216, + "grad_norm": 19.79607510975861, + "learning_rate": 1.2634936714897115e-07, + "loss": 0.3924, + "step": 12090 + }, + { + "epoch": 1.552476263792661, + "grad_norm": 20.87879257799972, + "learning_rate": 1.2565984933418495e-07, + "loss": 0.361, + "step": 12100 + }, + { + "epoch": 1.5537593020272005, + "grad_norm": 13.661674927229074, + "learning_rate": 1.2497194759776868e-07, + "loss": 0.3939, + "step": 12110 + }, + { + "epoch": 1.55504234026174, + "grad_norm": 23.327388795808364, + "learning_rate": 1.2428566490948988e-07, + "loss": 0.3792, + "step": 12120 + }, + { + "epoch": 1.5563253784962792, + "grad_norm": 12.855711839941009, + "learning_rate": 1.2360100423212605e-07, + "loss": 0.4019, + "step": 12130 + }, + { + "epoch": 1.5576084167308186, + "grad_norm": 29.06764681712454, + "learning_rate": 1.2291796852145215e-07, + "loss": 0.3984, + "step": 12140 + }, + { + "epoch": 1.5588914549653579, + "grad_norm": 26.056539916573293, + "learning_rate": 1.2223656072622824e-07, + "loss": 0.4025, + "step": 12150 + }, + { + "epoch": 1.5601744931998973, + "grad_norm": 27.859575684401346, + "learning_rate": 1.215567837881865e-07, + "loss": 0.404, + "step": 12160 + }, + { + "epoch": 1.5614575314344368, + "grad_norm": 19.196483574827436, + "learning_rate": 1.2087864064201796e-07, + "loss": 0.4094, + "step": 12170 + }, + { + "epoch": 1.5627405696689762, + "grad_norm": 11.570136318397633, + "learning_rate": 1.2020213421536103e-07, + "loss": 0.3852, + "step": 12180 + }, + { + "epoch": 1.5640236079035157, + "grad_norm": 32.52978747413739, + "learning_rate": 1.1952726742878756e-07, + "loss": 0.3988, + "step": 12190 + }, + { + "epoch": 1.565306646138055, + "grad_norm": 19.438208813962703, + "learning_rate": 1.1885404319579107e-07, + "loss": 0.3856, + "step": 12200 + }, + { + "epoch": 1.5665896843725942, + "grad_norm": 32.91442466777494, + "learning_rate": 1.1818246442277407e-07, + "loss": 0.3891, + "step": 12210 + }, + { + "epoch": 1.5678727226071336, + "grad_norm": 23.58194766687123, + "learning_rate": 1.1751253400903549e-07, + "loss": 0.3523, + "step": 12220 + }, + { + "epoch": 1.569155760841673, + "grad_norm": 18.089749592118967, + "learning_rate": 1.168442548467577e-07, + "loss": 0.3809, + "step": 12230 + }, + { + "epoch": 1.5704387990762125, + "grad_norm": 17.820377675396994, + "learning_rate": 1.1617762982099444e-07, + "loss": 0.4348, + "step": 12240 + }, + { + "epoch": 1.571721837310752, + "grad_norm": 34.32024680224867, + "learning_rate": 1.1551266180965864e-07, + "loss": 0.4256, + "step": 12250 + }, + { + "epoch": 1.5730048755452912, + "grad_norm": 17.037915854449793, + "learning_rate": 1.1484935368350946e-07, + "loss": 0.3741, + "step": 12260 + }, + { + "epoch": 1.5742879137798307, + "grad_norm": 20.076618468168302, + "learning_rate": 1.141877083061401e-07, + "loss": 0.3777, + "step": 12270 + }, + { + "epoch": 1.57557095201437, + "grad_norm": 36.04163028660487, + "learning_rate": 1.1352772853396531e-07, + "loss": 0.373, + "step": 12280 + }, + { + "epoch": 1.5768539902489094, + "grad_norm": 29.857689653859538, + "learning_rate": 1.1286941721620952e-07, + "loss": 0.4018, + "step": 12290 + }, + { + "epoch": 1.5781370284834488, + "grad_norm": 18.81281215817597, + "learning_rate": 1.1221277719489386e-07, + "loss": 0.3835, + "step": 12300 + }, + { + "epoch": 1.5794200667179883, + "grad_norm": 24.74468186690184, + "learning_rate": 1.1155781130482455e-07, + "loss": 0.3704, + "step": 12310 + }, + { + "epoch": 1.5807031049525277, + "grad_norm": 27.55050524193802, + "learning_rate": 1.1090452237358028e-07, + "loss": 0.3951, + "step": 12320 + }, + { + "epoch": 1.581986143187067, + "grad_norm": 37.34359373190292, + "learning_rate": 1.1025291322149987e-07, + "loss": 0.378, + "step": 12330 + }, + { + "epoch": 1.5832691814216062, + "grad_norm": 21.945207084811415, + "learning_rate": 1.096029866616704e-07, + "loss": 0.3888, + "step": 12340 + }, + { + "epoch": 1.5845522196561457, + "grad_norm": 22.10261914479958, + "learning_rate": 1.0895474549991518e-07, + "loss": 0.3804, + "step": 12350 + }, + { + "epoch": 1.5858352578906851, + "grad_norm": 14.06206786423721, + "learning_rate": 1.0830819253478102e-07, + "loss": 0.4046, + "step": 12360 + }, + { + "epoch": 1.5871182961252246, + "grad_norm": 22.071262347200882, + "learning_rate": 1.0766333055752702e-07, + "loss": 0.3993, + "step": 12370 + }, + { + "epoch": 1.588401334359764, + "grad_norm": 17.029321080463358, + "learning_rate": 1.0702016235211159e-07, + "loss": 0.359, + "step": 12380 + }, + { + "epoch": 1.5896843725943033, + "grad_norm": 30.34882183633248, + "learning_rate": 1.0637869069518134e-07, + "loss": 0.375, + "step": 12390 + }, + { + "epoch": 1.5909674108288427, + "grad_norm": 19.42975881007921, + "learning_rate": 1.057389183560582e-07, + "loss": 0.4081, + "step": 12400 + }, + { + "epoch": 1.592250449063382, + "grad_norm": 19.398485244589924, + "learning_rate": 1.0510084809672837e-07, + "loss": 0.3848, + "step": 12410 + }, + { + "epoch": 1.5935334872979214, + "grad_norm": 24.38198896497627, + "learning_rate": 1.044644826718295e-07, + "loss": 0.4168, + "step": 12420 + }, + { + "epoch": 1.594816525532461, + "grad_norm": 21.509785268107684, + "learning_rate": 1.0382982482863933e-07, + "loss": 0.3752, + "step": 12430 + }, + { + "epoch": 1.5960995637670004, + "grad_norm": 15.665764334396904, + "learning_rate": 1.03196877307064e-07, + "loss": 0.3952, + "step": 12440 + }, + { + "epoch": 1.5973826020015398, + "grad_norm": 37.726850337431586, + "learning_rate": 1.0256564283962587e-07, + "loss": 0.3692, + "step": 12450 + }, + { + "epoch": 1.598665640236079, + "grad_norm": 35.75549722113709, + "learning_rate": 1.0193612415145154e-07, + "loss": 0.4181, + "step": 12460 + }, + { + "epoch": 1.5999486784706183, + "grad_norm": 30.24881848458098, + "learning_rate": 1.0130832396026062e-07, + "loss": 0.3568, + "step": 12470 + }, + { + "epoch": 1.6012317167051577, + "grad_norm": 19.403188818802963, + "learning_rate": 1.0068224497635369e-07, + "loss": 0.3735, + "step": 12480 + }, + { + "epoch": 1.6025147549396972, + "grad_norm": 21.48947114682834, + "learning_rate": 1.0005788990260034e-07, + "loss": 0.4127, + "step": 12490 + }, + { + "epoch": 1.6037977931742367, + "grad_norm": 19.4085378906999, + "learning_rate": 9.943526143442827e-08, + "loss": 0.3374, + "step": 12500 + }, + { + "epoch": 1.605080831408776, + "grad_norm": 20.050229258332195, + "learning_rate": 9.881436225981104e-08, + "loss": 0.387, + "step": 12510 + }, + { + "epoch": 1.6063638696433153, + "grad_norm": 15.521165085978387, + "learning_rate": 9.819519505925645e-08, + "loss": 0.4206, + "step": 12520 + }, + { + "epoch": 1.6076469078778548, + "grad_norm": 23.36406653556797, + "learning_rate": 9.757776250579503e-08, + "loss": 0.383, + "step": 12530 + }, + { + "epoch": 1.608929946112394, + "grad_norm": 20.80724351410046, + "learning_rate": 9.696206726496892e-08, + "loss": 0.3889, + "step": 12540 + }, + { + "epoch": 1.6102129843469335, + "grad_norm": 21.013436176790016, + "learning_rate": 9.634811199482007e-08, + "loss": 0.4159, + "step": 12550 + }, + { + "epoch": 1.611496022581473, + "grad_norm": 23.60987217204866, + "learning_rate": 9.573589934587845e-08, + "loss": 0.4171, + "step": 12560 + }, + { + "epoch": 1.6127790608160124, + "grad_norm": 30.256858141895254, + "learning_rate": 9.51254319611508e-08, + "loss": 0.4062, + "step": 12570 + }, + { + "epoch": 1.6140620990505519, + "grad_norm": 17.881411197734348, + "learning_rate": 9.451671247610987e-08, + "loss": 0.4119, + "step": 12580 + }, + { + "epoch": 1.615345137285091, + "grad_norm": 23.114712484126233, + "learning_rate": 9.390974351868186e-08, + "loss": 0.3407, + "step": 12590 + }, + { + "epoch": 1.6166281755196303, + "grad_norm": 18.519430872615388, + "learning_rate": 9.330452770923603e-08, + "loss": 0.3948, + "step": 12600 + }, + { + "epoch": 1.6179112137541698, + "grad_norm": 30.205164400885582, + "learning_rate": 9.270106766057322e-08, + "loss": 0.373, + "step": 12610 + }, + { + "epoch": 1.6191942519887093, + "grad_norm": 19.68837824726568, + "learning_rate": 9.209936597791407e-08, + "loss": 0.4289, + "step": 12620 + }, + { + "epoch": 1.6204772902232487, + "grad_norm": 25.36911325242145, + "learning_rate": 9.149942525888798e-08, + "loss": 0.3736, + "step": 12630 + }, + { + "epoch": 1.6217603284577882, + "grad_norm": 27.95313135618856, + "learning_rate": 9.090124809352268e-08, + "loss": 0.3857, + "step": 12640 + }, + { + "epoch": 1.6230433666923274, + "grad_norm": 18.049043604826476, + "learning_rate": 9.030483706423164e-08, + "loss": 0.3702, + "step": 12650 + }, + { + "epoch": 1.6243264049268669, + "grad_norm": 19.478564534546415, + "learning_rate": 8.971019474580427e-08, + "loss": 0.3621, + "step": 12660 + }, + { + "epoch": 1.625609443161406, + "grad_norm": 20.027303131597424, + "learning_rate": 8.911732370539393e-08, + "loss": 0.3995, + "step": 12670 + }, + { + "epoch": 1.6268924813959456, + "grad_norm": 22.72507277944192, + "learning_rate": 8.852622650250696e-08, + "loss": 0.3708, + "step": 12680 + }, + { + "epoch": 1.628175519630485, + "grad_norm": 22.48298169086088, + "learning_rate": 8.793690568899215e-08, + "loss": 0.3651, + "step": 12690 + }, + { + "epoch": 1.6294585578650245, + "grad_norm": 18.99985768633383, + "learning_rate": 8.734936380902935e-08, + "loss": 0.4046, + "step": 12700 + }, + { + "epoch": 1.630741596099564, + "grad_norm": 17.848182499433424, + "learning_rate": 8.676360339911826e-08, + "loss": 0.372, + "step": 12710 + }, + { + "epoch": 1.6320246343341032, + "grad_norm": 15.650207688567319, + "learning_rate": 8.617962698806763e-08, + "loss": 0.4091, + "step": 12720 + }, + { + "epoch": 1.6333076725686424, + "grad_norm": 21.433439325005793, + "learning_rate": 8.559743709698492e-08, + "loss": 0.3319, + "step": 12730 + }, + { + "epoch": 1.6345907108031819, + "grad_norm": 13.73125609722741, + "learning_rate": 8.50170362392647e-08, + "loss": 0.3773, + "step": 12740 + }, + { + "epoch": 1.6358737490377213, + "grad_norm": 25.675713633123852, + "learning_rate": 8.443842692057779e-08, + "loss": 0.4182, + "step": 12750 + }, + { + "epoch": 1.6371567872722608, + "grad_norm": 15.556251198812078, + "learning_rate": 8.386161163886119e-08, + "loss": 0.4013, + "step": 12760 + }, + { + "epoch": 1.6384398255068002, + "grad_norm": 27.909916679865315, + "learning_rate": 8.328659288430623e-08, + "loss": 0.381, + "step": 12770 + }, + { + "epoch": 1.6397228637413395, + "grad_norm": 30.576642911447774, + "learning_rate": 8.271337313934867e-08, + "loss": 0.4026, + "step": 12780 + }, + { + "epoch": 1.641005901975879, + "grad_norm": 18.3011326054354, + "learning_rate": 8.214195487865782e-08, + "loss": 0.4248, + "step": 12790 + }, + { + "epoch": 1.6422889402104182, + "grad_norm": 21.204154717444684, + "learning_rate": 8.157234056912559e-08, + "loss": 0.4021, + "step": 12800 + }, + { + "epoch": 1.6435719784449576, + "grad_norm": 19.69541833157778, + "learning_rate": 8.100453266985602e-08, + "loss": 0.3815, + "step": 12810 + }, + { + "epoch": 1.644855016679497, + "grad_norm": 21.19692748995577, + "learning_rate": 8.043853363215436e-08, + "loss": 0.4068, + "step": 12820 + }, + { + "epoch": 1.6461380549140365, + "grad_norm": 19.123477430983886, + "learning_rate": 7.987434589951725e-08, + "loss": 0.3903, + "step": 12830 + }, + { + "epoch": 1.647421093148576, + "grad_norm": 22.530758610301987, + "learning_rate": 7.931197190762118e-08, + "loss": 0.4071, + "step": 12840 + }, + { + "epoch": 1.6487041313831152, + "grad_norm": 23.756999477838438, + "learning_rate": 7.875141408431302e-08, + "loss": 0.3771, + "step": 12850 + }, + { + "epoch": 1.6499871696176545, + "grad_norm": 18.588105864913423, + "learning_rate": 7.819267484959829e-08, + "loss": 0.394, + "step": 12860 + }, + { + "epoch": 1.651270207852194, + "grad_norm": 24.16778751565798, + "learning_rate": 7.763575661563209e-08, + "loss": 0.3802, + "step": 12870 + }, + { + "epoch": 1.6525532460867334, + "grad_norm": 16.45557334890205, + "learning_rate": 7.708066178670758e-08, + "loss": 0.3785, + "step": 12880 + }, + { + "epoch": 1.6538362843212728, + "grad_norm": 24.572777995201207, + "learning_rate": 7.652739275924608e-08, + "loss": 0.3807, + "step": 12890 + }, + { + "epoch": 1.6551193225558123, + "grad_norm": 22.086576320485708, + "learning_rate": 7.597595192178702e-08, + "loss": 0.3821, + "step": 12900 + }, + { + "epoch": 1.6564023607903515, + "grad_norm": 27.075592469946407, + "learning_rate": 7.54263416549768e-08, + "loss": 0.4054, + "step": 12910 + }, + { + "epoch": 1.657685399024891, + "grad_norm": 19.103069129548917, + "learning_rate": 7.487856433155915e-08, + "loss": 0.4048, + "step": 12920 + }, + { + "epoch": 1.6589684372594302, + "grad_norm": 27.420686199648543, + "learning_rate": 7.433262231636494e-08, + "loss": 0.3957, + "step": 12930 + }, + { + "epoch": 1.6602514754939697, + "grad_norm": 19.86861424039294, + "learning_rate": 7.378851796630143e-08, + "loss": 0.3594, + "step": 12940 + }, + { + "epoch": 1.6615345137285091, + "grad_norm": 29.679177663330613, + "learning_rate": 7.324625363034275e-08, + "loss": 0.3667, + "step": 12950 + }, + { + "epoch": 1.6628175519630486, + "grad_norm": 25.724684368947436, + "learning_rate": 7.270583164951926e-08, + "loss": 0.4175, + "step": 12960 + }, + { + "epoch": 1.664100590197588, + "grad_norm": 13.471290533669226, + "learning_rate": 7.216725435690752e-08, + "loss": 0.3865, + "step": 12970 + }, + { + "epoch": 1.6653836284321273, + "grad_norm": 17.070088271986698, + "learning_rate": 7.163052407762044e-08, + "loss": 0.3716, + "step": 12980 + }, + { + "epoch": 1.6666666666666665, + "grad_norm": 42.06243310744608, + "learning_rate": 7.109564312879712e-08, + "loss": 0.3623, + "step": 12990 + }, + { + "epoch": 1.667949704901206, + "grad_norm": 24.976355106849358, + "learning_rate": 7.056261381959316e-08, + "loss": 0.4242, + "step": 13000 + }, + { + "epoch": 1.6692327431357454, + "grad_norm": 31.560333720839612, + "learning_rate": 7.003143845116955e-08, + "loss": 0.3594, + "step": 13010 + }, + { + "epoch": 1.670515781370285, + "grad_norm": 35.124273936093566, + "learning_rate": 6.950211931668421e-08, + "loss": 0.3976, + "step": 13020 + }, + { + "epoch": 1.6717988196048243, + "grad_norm": 25.073966517707493, + "learning_rate": 6.89746587012815e-08, + "loss": 0.3919, + "step": 13030 + }, + { + "epoch": 1.6730818578393636, + "grad_norm": 15.58846424892136, + "learning_rate": 6.84490588820818e-08, + "loss": 0.3973, + "step": 13040 + }, + { + "epoch": 1.674364896073903, + "grad_norm": 18.38775907562159, + "learning_rate": 6.79253221281727e-08, + "loss": 0.3527, + "step": 13050 + }, + { + "epoch": 1.6756479343084423, + "grad_norm": 28.862030815866305, + "learning_rate": 6.740345070059828e-08, + "loss": 0.3875, + "step": 13060 + }, + { + "epoch": 1.6769309725429817, + "grad_norm": 21.82223826391644, + "learning_rate": 6.688344685234987e-08, + "loss": 0.4058, + "step": 13070 + }, + { + "epoch": 1.6782140107775212, + "grad_norm": 21.534525974819644, + "learning_rate": 6.636531282835627e-08, + "loss": 0.4055, + "step": 13080 + }, + { + "epoch": 1.6794970490120606, + "grad_norm": 23.689007110988406, + "learning_rate": 6.58490508654741e-08, + "loss": 0.3946, + "step": 13090 + }, + { + "epoch": 1.6807800872466, + "grad_norm": 22.90574972242346, + "learning_rate": 6.533466319247783e-08, + "loss": 0.3785, + "step": 13100 + }, + { + "epoch": 1.6820631254811393, + "grad_norm": 26.996819205820575, + "learning_rate": 6.482215203005015e-08, + "loss": 0.4062, + "step": 13110 + }, + { + "epoch": 1.6833461637156786, + "grad_norm": 20.568807488368346, + "learning_rate": 6.431151959077324e-08, + "loss": 0.4189, + "step": 13120 + }, + { + "epoch": 1.684629201950218, + "grad_norm": 17.074825266466622, + "learning_rate": 6.380276807911789e-08, + "loss": 0.3759, + "step": 13130 + }, + { + "epoch": 1.6859122401847575, + "grad_norm": 22.263144263940017, + "learning_rate": 6.329589969143517e-08, + "loss": 0.4039, + "step": 13140 + }, + { + "epoch": 1.687195278419297, + "grad_norm": 30.05896886429508, + "learning_rate": 6.27909166159461e-08, + "loss": 0.4023, + "step": 13150 + }, + { + "epoch": 1.6884783166538364, + "grad_norm": 27.153954400989967, + "learning_rate": 6.228782103273283e-08, + "loss": 0.3753, + "step": 13160 + }, + { + "epoch": 1.6897613548883756, + "grad_norm": 25.800737655964067, + "learning_rate": 6.178661511372858e-08, + "loss": 0.3931, + "step": 13170 + }, + { + "epoch": 1.691044393122915, + "grad_norm": 20.15915692954668, + "learning_rate": 6.128730102270896e-08, + "loss": 0.3793, + "step": 13180 + }, + { + "epoch": 1.6923274313574543, + "grad_norm": 26.55776604499149, + "learning_rate": 6.078988091528237e-08, + "loss": 0.3557, + "step": 13190 + }, + { + "epoch": 1.6936104695919938, + "grad_norm": 20.5380046954262, + "learning_rate": 6.029435693888019e-08, + "loss": 0.3791, + "step": 13200 + }, + { + "epoch": 1.6948935078265333, + "grad_norm": 21.436801612487468, + "learning_rate": 5.980073123274815e-08, + "loss": 0.3563, + "step": 13210 + }, + { + "epoch": 1.6961765460610727, + "grad_norm": 26.520744201501632, + "learning_rate": 5.930900592793714e-08, + "loss": 0.3823, + "step": 13220 + }, + { + "epoch": 1.6974595842956122, + "grad_norm": 18.921064791138175, + "learning_rate": 5.881918314729328e-08, + "loss": 0.3742, + "step": 13230 + }, + { + "epoch": 1.6987426225301514, + "grad_norm": 33.1421425182308, + "learning_rate": 5.833126500544966e-08, + "loss": 0.384, + "step": 13240 + }, + { + "epoch": 1.7000256607646906, + "grad_norm": 15.859687030630768, + "learning_rate": 5.784525360881659e-08, + "loss": 0.3515, + "step": 13250 + }, + { + "epoch": 1.70130869899923, + "grad_norm": 20.162977600637078, + "learning_rate": 5.736115105557249e-08, + "loss": 0.4097, + "step": 13260 + }, + { + "epoch": 1.7025917372337696, + "grad_norm": 19.114615450499855, + "learning_rate": 5.687895943565546e-08, + "loss": 0.3852, + "step": 13270 + }, + { + "epoch": 1.703874775468309, + "grad_norm": 26.970136638946535, + "learning_rate": 5.639868083075361e-08, + "loss": 0.3452, + "step": 13280 + }, + { + "epoch": 1.7051578137028485, + "grad_norm": 20.32152521465425, + "learning_rate": 5.5920317314296305e-08, + "loss": 0.4026, + "step": 13290 + }, + { + "epoch": 1.7064408519373877, + "grad_norm": 20.974641374826913, + "learning_rate": 5.544387095144509e-08, + "loss": 0.3923, + "step": 13300 + }, + { + "epoch": 1.7077238901719272, + "grad_norm": 30.357139970146342, + "learning_rate": 5.496934379908513e-08, + "loss": 0.3923, + "step": 13310 + }, + { + "epoch": 1.7090069284064664, + "grad_norm": 21.93831707706908, + "learning_rate": 5.44967379058161e-08, + "loss": 0.379, + "step": 13320 + }, + { + "epoch": 1.7102899666410059, + "grad_norm": 21.049775272418376, + "learning_rate": 5.402605531194293e-08, + "loss": 0.3519, + "step": 13330 + }, + { + "epoch": 1.7115730048755453, + "grad_norm": 30.051815826011666, + "learning_rate": 5.355729804946801e-08, + "loss": 0.3852, + "step": 13340 + }, + { + "epoch": 1.7128560431100848, + "grad_norm": 17.315877345298475, + "learning_rate": 5.309046814208129e-08, + "loss": 0.3996, + "step": 13350 + }, + { + "epoch": 1.7141390813446242, + "grad_norm": 26.9894162258069, + "learning_rate": 5.262556760515213e-08, + "loss": 0.3837, + "step": 13360 + }, + { + "epoch": 1.7154221195791635, + "grad_norm": 19.68778988481601, + "learning_rate": 5.216259844572085e-08, + "loss": 0.3912, + "step": 13370 + }, + { + "epoch": 1.7167051578137027, + "grad_norm": 19.94140524196602, + "learning_rate": 5.1701562662489596e-08, + "loss": 0.4006, + "step": 13380 + }, + { + "epoch": 1.7179881960482422, + "grad_norm": 23.53864620633266, + "learning_rate": 5.1242462245813744e-08, + "loss": 0.3818, + "step": 13390 + }, + { + "epoch": 1.7192712342827816, + "grad_norm": 19.860792542436766, + "learning_rate": 5.0785299177693305e-08, + "loss": 0.4167, + "step": 13400 + }, + { + "epoch": 1.720554272517321, + "grad_norm": 19.230443731412567, + "learning_rate": 5.033007543176498e-08, + "loss": 0.3492, + "step": 13410 + }, + { + "epoch": 1.7218373107518605, + "grad_norm": 16.710240391143717, + "learning_rate": 4.9876792973292615e-08, + "loss": 0.4004, + "step": 13420 + }, + { + "epoch": 1.7231203489863998, + "grad_norm": 26.955704167922033, + "learning_rate": 4.942545375915963e-08, + "loss": 0.3833, + "step": 13430 + }, + { + "epoch": 1.7244033872209392, + "grad_norm": 23.88235162745621, + "learning_rate": 4.897605973785995e-08, + "loss": 0.3757, + "step": 13440 + }, + { + "epoch": 1.7256864254554785, + "grad_norm": 22.757264438900187, + "learning_rate": 4.852861284948984e-08, + "loss": 0.3668, + "step": 13450 + }, + { + "epoch": 1.726969463690018, + "grad_norm": 21.8866943322943, + "learning_rate": 4.808311502573975e-08, + "loss": 0.4023, + "step": 13460 + }, + { + "epoch": 1.7282525019245574, + "grad_norm": 16.62296456875243, + "learning_rate": 4.763956818988546e-08, + "loss": 0.3553, + "step": 13470 + }, + { + "epoch": 1.7295355401590968, + "grad_norm": 27.991416399509436, + "learning_rate": 4.719797425678046e-08, + "loss": 0.4052, + "step": 13480 + }, + { + "epoch": 1.7308185783936363, + "grad_norm": 30.999253089792703, + "learning_rate": 4.6758335132846826e-08, + "loss": 0.4006, + "step": 13490 + }, + { + "epoch": 1.7321016166281755, + "grad_norm": 29.04780741979244, + "learning_rate": 4.6320652716067555e-08, + "loss": 0.399, + "step": 13500 + }, + { + "epoch": 1.7333846548627148, + "grad_norm": 17.761116109640845, + "learning_rate": 4.5884928895978614e-08, + "loss": 0.4125, + "step": 13510 + }, + { + "epoch": 1.7346676930972542, + "grad_norm": 38.82992312588126, + "learning_rate": 4.545116555366002e-08, + "loss": 0.3985, + "step": 13520 + }, + { + "epoch": 1.7359507313317937, + "grad_norm": 17.574256170597366, + "learning_rate": 4.5019364561728446e-08, + "loss": 0.4163, + "step": 13530 + }, + { + "epoch": 1.7372337695663331, + "grad_norm": 18.57188344560859, + "learning_rate": 4.458952778432856e-08, + "loss": 0.3744, + "step": 13540 + }, + { + "epoch": 1.7385168078008726, + "grad_norm": 16.622471952134774, + "learning_rate": 4.416165707712532e-08, + "loss": 0.3896, + "step": 13550 + }, + { + "epoch": 1.7397998460354118, + "grad_norm": 18.453559151212502, + "learning_rate": 4.373575428729609e-08, + "loss": 0.3912, + "step": 13560 + }, + { + "epoch": 1.7410828842699513, + "grad_norm": 29.114394275823795, + "learning_rate": 4.3311821253522285e-08, + "loss": 0.382, + "step": 13570 + }, + { + "epoch": 1.7423659225044905, + "grad_norm": 27.00276488625539, + "learning_rate": 4.288985980598164e-08, + "loss": 0.4028, + "step": 13580 + }, + { + "epoch": 1.74364896073903, + "grad_norm": 23.722489576635798, + "learning_rate": 4.246987176634009e-08, + "loss": 0.4207, + "step": 13590 + }, + { + "epoch": 1.7449319989735694, + "grad_norm": 22.904041592068445, + "learning_rate": 4.205185894774454e-08, + "loss": 0.3739, + "step": 13600 + }, + { + "epoch": 1.7462150372081089, + "grad_norm": 19.130006723192313, + "learning_rate": 4.163582315481407e-08, + "loss": 0.4146, + "step": 13610 + }, + { + "epoch": 1.7474980754426483, + "grad_norm": 25.455194854816483, + "learning_rate": 4.1221766183633045e-08, + "loss": 0.3814, + "step": 13620 + }, + { + "epoch": 1.7487811136771876, + "grad_norm": 14.132070039286369, + "learning_rate": 4.080968982174299e-08, + "loss": 0.3378, + "step": 13630 + }, + { + "epoch": 1.7500641519117268, + "grad_norm": 17.9771018438934, + "learning_rate": 4.0399595848134624e-08, + "loss": 0.3534, + "step": 13640 + }, + { + "epoch": 1.7513471901462663, + "grad_norm": 25.234417255800746, + "learning_rate": 3.999148603324037e-08, + "loss": 0.3801, + "step": 13650 + }, + { + "epoch": 1.7526302283808057, + "grad_norm": 19.427526142913013, + "learning_rate": 3.9585362138927104e-08, + "loss": 0.3865, + "step": 13660 + }, + { + "epoch": 1.7539132666153452, + "grad_norm": 26.56571553134844, + "learning_rate": 3.9181225918488105e-08, + "loss": 0.3846, + "step": 13670 + }, + { + "epoch": 1.7551963048498846, + "grad_norm": 19.30510291577036, + "learning_rate": 3.877907911663542e-08, + "loss": 0.3998, + "step": 13680 + }, + { + "epoch": 1.7564793430844239, + "grad_norm": 15.019580693998911, + "learning_rate": 3.83789234694924e-08, + "loss": 0.3649, + "step": 13690 + }, + { + "epoch": 1.7577623813189633, + "grad_norm": 17.245708190240308, + "learning_rate": 3.798076070458683e-08, + "loss": 0.4091, + "step": 13700 + }, + { + "epoch": 1.7590454195535026, + "grad_norm": 18.811426515122903, + "learning_rate": 3.758459254084234e-08, + "loss": 0.399, + "step": 13710 + }, + { + "epoch": 1.760328457788042, + "grad_norm": 24.90795154440674, + "learning_rate": 3.7190420688572034e-08, + "loss": 0.3904, + "step": 13720 + }, + { + "epoch": 1.7616114960225815, + "grad_norm": 23.45767119249471, + "learning_rate": 3.679824684947042e-08, + "loss": 0.4106, + "step": 13730 + }, + { + "epoch": 1.762894534257121, + "grad_norm": 31.795924811515228, + "learning_rate": 3.6408072716606345e-08, + "loss": 0.3779, + "step": 13740 + }, + { + "epoch": 1.7641775724916604, + "grad_norm": 16.10809912711329, + "learning_rate": 3.6019899974415676e-08, + "loss": 0.4049, + "step": 13750 + }, + { + "epoch": 1.7654606107261996, + "grad_norm": 29.266461515456925, + "learning_rate": 3.563373029869415e-08, + "loss": 0.4087, + "step": 13760 + }, + { + "epoch": 1.7667436489607389, + "grad_norm": 18.122290754398886, + "learning_rate": 3.5249565356589626e-08, + "loss": 0.3665, + "step": 13770 + }, + { + "epoch": 1.7680266871952783, + "grad_norm": 28.05563461148085, + "learning_rate": 3.486740680659561e-08, + "loss": 0.385, + "step": 13780 + }, + { + "epoch": 1.7693097254298178, + "grad_norm": 21.200768985835403, + "learning_rate": 3.448725629854349e-08, + "loss": 0.3685, + "step": 13790 + }, + { + "epoch": 1.7705927636643572, + "grad_norm": 25.568963482243078, + "learning_rate": 3.410911547359585e-08, + "loss": 0.4075, + "step": 13800 + }, + { + "epoch": 1.7718758018988967, + "grad_norm": 37.38946968817979, + "learning_rate": 3.3732985964239014e-08, + "loss": 0.371, + "step": 13810 + }, + { + "epoch": 1.773158840133436, + "grad_norm": 26.496611879554415, + "learning_rate": 3.33588693942764e-08, + "loss": 0.3565, + "step": 13820 + }, + { + "epoch": 1.7744418783679754, + "grad_norm": 22.51670837523052, + "learning_rate": 3.2986767378821e-08, + "loss": 0.3871, + "step": 13830 + }, + { + "epoch": 1.7757249166025146, + "grad_norm": 25.351358390908626, + "learning_rate": 3.261668152428881e-08, + "loss": 0.3682, + "step": 13840 + }, + { + "epoch": 1.777007954837054, + "grad_norm": 21.33895266673386, + "learning_rate": 3.2248613428391915e-08, + "loss": 0.3603, + "step": 13850 + }, + { + "epoch": 1.7782909930715936, + "grad_norm": 35.941323657336255, + "learning_rate": 3.188256468013139e-08, + "loss": 0.3841, + "step": 13860 + }, + { + "epoch": 1.779574031306133, + "grad_norm": 19.219535853466123, + "learning_rate": 3.1518536859790445e-08, + "loss": 0.3397, + "step": 13870 + }, + { + "epoch": 1.7808570695406725, + "grad_norm": 36.057704077682025, + "learning_rate": 3.115653153892761e-08, + "loss": 0.4093, + "step": 13880 + }, + { + "epoch": 1.7821401077752117, + "grad_norm": 20.661050522254033, + "learning_rate": 3.0796550280370145e-08, + "loss": 0.3488, + "step": 13890 + }, + { + "epoch": 1.783423146009751, + "grad_norm": 19.122344169518094, + "learning_rate": 3.043859463820703e-08, + "loss": 0.4124, + "step": 13900 + }, + { + "epoch": 1.7847061842442904, + "grad_norm": 25.617357132738753, + "learning_rate": 3.008266615778249e-08, + "loss": 0.3776, + "step": 13910 + }, + { + "epoch": 1.7859892224788299, + "grad_norm": 19.161865511164212, + "learning_rate": 2.9728766375689217e-08, + "loss": 0.3905, + "step": 13920 + }, + { + "epoch": 1.7872722607133693, + "grad_norm": 19.440177151475012, + "learning_rate": 2.9376896819761632e-08, + "loss": 0.4044, + "step": 13930 + }, + { + "epoch": 1.7885552989479088, + "grad_norm": 19.160390849801182, + "learning_rate": 2.9027059009069322e-08, + "loss": 0.3637, + "step": 13940 + }, + { + "epoch": 1.789838337182448, + "grad_norm": 24.62840585297446, + "learning_rate": 2.8679254453910785e-08, + "loss": 0.3842, + "step": 13950 + }, + { + "epoch": 1.7911213754169875, + "grad_norm": 21.0782216250499, + "learning_rate": 2.833348465580654e-08, + "loss": 0.403, + "step": 13960 + }, + { + "epoch": 1.7924044136515267, + "grad_norm": 24.31605785524227, + "learning_rate": 2.7989751107492744e-08, + "loss": 0.3711, + "step": 13970 + }, + { + "epoch": 1.7936874518860662, + "grad_norm": 23.917076493029672, + "learning_rate": 2.764805529291475e-08, + "loss": 0.343, + "step": 13980 + }, + { + "epoch": 1.7949704901206056, + "grad_norm": 33.162122184402655, + "learning_rate": 2.7308398687220958e-08, + "loss": 0.3791, + "step": 13990 + }, + { + "epoch": 1.796253528355145, + "grad_norm": 18.590982783780543, + "learning_rate": 2.6970782756755905e-08, + "loss": 0.4252, + "step": 14000 + }, + { + "epoch": 1.7975365665896845, + "grad_norm": 31.75443570758698, + "learning_rate": 2.6635208959054524e-08, + "loss": 0.3968, + "step": 14010 + }, + { + "epoch": 1.7988196048242238, + "grad_norm": 27.908412827328124, + "learning_rate": 2.6301678742835397e-08, + "loss": 0.4005, + "step": 14020 + }, + { + "epoch": 1.800102643058763, + "grad_norm": 25.907893556182387, + "learning_rate": 2.5970193547994734e-08, + "loss": 0.381, + "step": 14030 + }, + { + "epoch": 1.8013856812933025, + "grad_norm": 22.830346111282267, + "learning_rate": 2.5640754805600128e-08, + "loss": 0.4124, + "step": 14040 + }, + { + "epoch": 1.802668719527842, + "grad_norm": 18.960737482985728, + "learning_rate": 2.5313363937884414e-08, + "loss": 0.3907, + "step": 14050 + }, + { + "epoch": 1.8039517577623814, + "grad_norm": 33.832694949395865, + "learning_rate": 2.4988022358239213e-08, + "loss": 0.4204, + "step": 14060 + }, + { + "epoch": 1.8052347959969208, + "grad_norm": 34.82428207514107, + "learning_rate": 2.466473147120951e-08, + "loss": 0.3982, + "step": 14070 + }, + { + "epoch": 1.80651783423146, + "grad_norm": 26.452865516278813, + "learning_rate": 2.434349267248681e-08, + "loss": 0.3905, + "step": 14080 + }, + { + "epoch": 1.8078008724659995, + "grad_norm": 14.487714145449283, + "learning_rate": 2.4024307348903428e-08, + "loss": 0.4007, + "step": 14090 + }, + { + "epoch": 1.8090839107005388, + "grad_norm": 31.17593619546588, + "learning_rate": 2.3707176878426882e-08, + "loss": 0.3688, + "step": 14100 + }, + { + "epoch": 1.8103669489350782, + "grad_norm": 35.16115860117904, + "learning_rate": 2.3392102630153455e-08, + "loss": 0.369, + "step": 14110 + }, + { + "epoch": 1.8116499871696177, + "grad_norm": 25.709921209637656, + "learning_rate": 2.307908596430225e-08, + "loss": 0.4033, + "step": 14120 + }, + { + "epoch": 1.8129330254041571, + "grad_norm": 23.794968197522202, + "learning_rate": 2.276812823220964e-08, + "loss": 0.3608, + "step": 14130 + }, + { + "epoch": 1.8142160636386966, + "grad_norm": 23.596862930880487, + "learning_rate": 2.2459230776323335e-08, + "loss": 0.3655, + "step": 14140 + }, + { + "epoch": 1.8154991018732358, + "grad_norm": 20.17345925697029, + "learning_rate": 2.2152394930196606e-08, + "loss": 0.3783, + "step": 14150 + }, + { + "epoch": 1.816782140107775, + "grad_norm": 20.286745886656462, + "learning_rate": 2.1847622018482282e-08, + "loss": 0.3769, + "step": 14160 + }, + { + "epoch": 1.8180651783423145, + "grad_norm": 20.8488075561613, + "learning_rate": 2.154491335692732e-08, + "loss": 0.375, + "step": 14170 + }, + { + "epoch": 1.819348216576854, + "grad_norm": 14.755946664568791, + "learning_rate": 2.1244270252367026e-08, + "loss": 0.3975, + "step": 14180 + }, + { + "epoch": 1.8206312548113934, + "grad_norm": 23.130417823171896, + "learning_rate": 2.094569400271934e-08, + "loss": 0.3671, + "step": 14190 + }, + { + "epoch": 1.8219142930459329, + "grad_norm": 19.347281379217204, + "learning_rate": 2.0649185896979447e-08, + "loss": 0.3752, + "step": 14200 + }, + { + "epoch": 1.8231973312804721, + "grad_norm": 17.167058448300754, + "learning_rate": 2.0354747215213964e-08, + "loss": 0.3554, + "step": 14210 + }, + { + "epoch": 1.8244803695150116, + "grad_norm": 21.229515583965437, + "learning_rate": 2.0062379228555525e-08, + "loss": 0.367, + "step": 14220 + }, + { + "epoch": 1.8257634077495508, + "grad_norm": 21.377988107197385, + "learning_rate": 1.9772083199197208e-08, + "loss": 0.3696, + "step": 14230 + }, + { + "epoch": 1.8270464459840903, + "grad_norm": 20.480340410164526, + "learning_rate": 1.9483860380387408e-08, + "loss": 0.3617, + "step": 14240 + }, + { + "epoch": 1.8283294842186297, + "grad_norm": 40.69331508164844, + "learning_rate": 1.9197712016423838e-08, + "loss": 0.4117, + "step": 14250 + }, + { + "epoch": 1.8296125224531692, + "grad_norm": 18.298096902227368, + "learning_rate": 1.8913639342648892e-08, + "loss": 0.3623, + "step": 14260 + }, + { + "epoch": 1.8308955606877086, + "grad_norm": 25.942411964363973, + "learning_rate": 1.8631643585443557e-08, + "loss": 0.381, + "step": 14270 + }, + { + "epoch": 1.8321785989222479, + "grad_norm": 23.683514906578814, + "learning_rate": 1.835172596222273e-08, + "loss": 0.4151, + "step": 14280 + }, + { + "epoch": 1.8334616371567871, + "grad_norm": 16.01452670466323, + "learning_rate": 1.8073887681429533e-08, + "loss": 0.3825, + "step": 14290 + }, + { + "epoch": 1.8347446753913266, + "grad_norm": 23.09428236849842, + "learning_rate": 1.7798129942530548e-08, + "loss": 0.3907, + "step": 14300 + }, + { + "epoch": 1.836027713625866, + "grad_norm": 17.14210714323239, + "learning_rate": 1.7524453936010096e-08, + "loss": 0.3644, + "step": 14310 + }, + { + "epoch": 1.8373107518604055, + "grad_norm": 28.102542593143554, + "learning_rate": 1.7252860843365358e-08, + "loss": 0.4077, + "step": 14320 + }, + { + "epoch": 1.838593790094945, + "grad_norm": 24.326480345680842, + "learning_rate": 1.6983351837101532e-08, + "loss": 0.4085, + "step": 14330 + }, + { + "epoch": 1.8398768283294842, + "grad_norm": 17.302205884490593, + "learning_rate": 1.6715928080726415e-08, + "loss": 0.4235, + "step": 14340 + }, + { + "epoch": 1.8411598665640236, + "grad_norm": 20.075398310312515, + "learning_rate": 1.6450590728745383e-08, + "loss": 0.3924, + "step": 14350 + }, + { + "epoch": 1.8424429047985629, + "grad_norm": 26.918528999585888, + "learning_rate": 1.6187340926656636e-08, + "loss": 0.3575, + "step": 14360 + }, + { + "epoch": 1.8437259430331023, + "grad_norm": 49.10947443273659, + "learning_rate": 1.5926179810946184e-08, + "loss": 0.3918, + "step": 14370 + }, + { + "epoch": 1.8450089812676418, + "grad_norm": 16.443859917762573, + "learning_rate": 1.5667108509082592e-08, + "loss": 0.3592, + "step": 14380 + }, + { + "epoch": 1.8462920195021812, + "grad_norm": 21.443633415657327, + "learning_rate": 1.5410128139512757e-08, + "loss": 0.372, + "step": 14390 + }, + { + "epoch": 1.8475750577367207, + "grad_norm": 19.653234367776253, + "learning_rate": 1.5155239811656562e-08, + "loss": 0.3838, + "step": 14400 + }, + { + "epoch": 1.84885809597126, + "grad_norm": 21.173774500666056, + "learning_rate": 1.4902444625902343e-08, + "loss": 0.371, + "step": 14410 + }, + { + "epoch": 1.8501411342057992, + "grad_norm": 32.502041513027, + "learning_rate": 1.4651743673601891e-08, + "loss": 0.3691, + "step": 14420 + }, + { + "epoch": 1.8514241724403386, + "grad_norm": 29.403565380569287, + "learning_rate": 1.4403138037066054e-08, + "loss": 0.402, + "step": 14430 + }, + { + "epoch": 1.852707210674878, + "grad_norm": 17.78022701346854, + "learning_rate": 1.4156628789559922e-08, + "loss": 0.3665, + "step": 14440 + }, + { + "epoch": 1.8539902489094175, + "grad_norm": 31.77736908440453, + "learning_rate": 1.3912216995297987e-08, + "loss": 0.4185, + "step": 14450 + }, + { + "epoch": 1.855273287143957, + "grad_norm": 36.523978560958206, + "learning_rate": 1.3669903709439934e-08, + "loss": 0.3885, + "step": 14460 + }, + { + "epoch": 1.8565563253784962, + "grad_norm": 19.351683471049125, + "learning_rate": 1.3429689978085912e-08, + "loss": 0.3878, + "step": 14470 + }, + { + "epoch": 1.8578393636130357, + "grad_norm": 18.809203628995434, + "learning_rate": 1.3191576838271767e-08, + "loss": 0.3627, + "step": 14480 + }, + { + "epoch": 1.859122401847575, + "grad_norm": 22.81258313196216, + "learning_rate": 1.29555653179651e-08, + "loss": 0.3682, + "step": 14490 + }, + { + "epoch": 1.8604054400821144, + "grad_norm": 18.649130550775848, + "learning_rate": 1.2721656436060379e-08, + "loss": 0.3874, + "step": 14500 + }, + { + "epoch": 1.8616884783166538, + "grad_norm": 16.058343997256063, + "learning_rate": 1.2489851202374723e-08, + "loss": 0.3733, + "step": 14510 + }, + { + "epoch": 1.8629715165511933, + "grad_norm": 23.613955007972443, + "learning_rate": 1.226015061764335e-08, + "loss": 0.387, + "step": 14520 + }, + { + "epoch": 1.8642545547857328, + "grad_norm": 21.082054187508554, + "learning_rate": 1.2032555673515687e-08, + "loss": 0.3884, + "step": 14530 + }, + { + "epoch": 1.865537593020272, + "grad_norm": 17.202115265252825, + "learning_rate": 1.1807067352550603e-08, + "loss": 0.3811, + "step": 14540 + }, + { + "epoch": 1.8668206312548112, + "grad_norm": 20.7122592658224, + "learning_rate": 1.1583686628212574e-08, + "loss": 0.3764, + "step": 14550 + }, + { + "epoch": 1.8681036694893507, + "grad_norm": 24.028459031839105, + "learning_rate": 1.1362414464867076e-08, + "loss": 0.417, + "step": 14560 + }, + { + "epoch": 1.8693867077238902, + "grad_norm": 23.639304189503946, + "learning_rate": 1.1143251817776755e-08, + "loss": 0.4064, + "step": 14570 + }, + { + "epoch": 1.8706697459584296, + "grad_norm": 29.13602379741378, + "learning_rate": 1.0926199633097154e-08, + "loss": 0.3654, + "step": 14580 + }, + { + "epoch": 1.871952784192969, + "grad_norm": 22.29682937167472, + "learning_rate": 1.0711258847872829e-08, + "loss": 0.3727, + "step": 14590 + }, + { + "epoch": 1.8732358224275083, + "grad_norm": 16.936559821203367, + "learning_rate": 1.0498430390032787e-08, + "loss": 0.3653, + "step": 14600 + }, + { + "epoch": 1.8745188606620478, + "grad_norm": 20.352829864913318, + "learning_rate": 1.0287715178387057e-08, + "loss": 0.3931, + "step": 14610 + }, + { + "epoch": 1.875801898896587, + "grad_norm": 15.598024434125769, + "learning_rate": 1.0079114122622412e-08, + "loss": 0.4065, + "step": 14620 + }, + { + "epoch": 1.8770849371311265, + "grad_norm": 20.617354655496662, + "learning_rate": 9.872628123298699e-09, + "loss": 0.3771, + "step": 14630 + }, + { + "epoch": 1.878367975365666, + "grad_norm": 27.55277946660392, + "learning_rate": 9.66825807184446e-09, + "loss": 0.3834, + "step": 14640 + }, + { + "epoch": 1.8796510136002054, + "grad_norm": 16.648677133890747, + "learning_rate": 9.46600485055371e-09, + "loss": 0.3875, + "step": 14650 + }, + { + "epoch": 1.8809340518347448, + "grad_norm": 21.2399896534851, + "learning_rate": 9.265869332581556e-09, + "loss": 0.3823, + "step": 14660 + }, + { + "epoch": 1.882217090069284, + "grad_norm": 20.178290843226172, + "learning_rate": 9.067852381940799e-09, + "loss": 0.391, + "step": 14670 + }, + { + "epoch": 1.8835001283038233, + "grad_norm": 17.476233484402368, + "learning_rate": 8.87195485349812e-09, + "loss": 0.3829, + "step": 14680 + }, + { + "epoch": 1.8847831665383628, + "grad_norm": 15.469209189311465, + "learning_rate": 8.678177592970404e-09, + "loss": 0.3864, + "step": 14690 + }, + { + "epoch": 1.8860662047729022, + "grad_norm": 18.5098097618441, + "learning_rate": 8.486521436920912e-09, + "loss": 0.3867, + "step": 14700 + }, + { + "epoch": 1.8873492430074417, + "grad_norm": 23.415942020527353, + "learning_rate": 8.296987212755735e-09, + "loss": 0.3878, + "step": 14710 + }, + { + "epoch": 1.8886322812419811, + "grad_norm": 20.780236567113327, + "learning_rate": 8.10957573872062e-09, + "loss": 0.3977, + "step": 14720 + }, + { + "epoch": 1.8899153194765204, + "grad_norm": 16.895301014593258, + "learning_rate": 7.924287823896814e-09, + "loss": 0.3693, + "step": 14730 + }, + { + "epoch": 1.8911983577110598, + "grad_norm": 18.13505114493306, + "learning_rate": 7.741124268197952e-09, + "loss": 0.3911, + "step": 14740 + }, + { + "epoch": 1.892481395945599, + "grad_norm": 16.07608059047086, + "learning_rate": 7.560085862366505e-09, + "loss": 0.3702, + "step": 14750 + }, + { + "epoch": 1.8937644341801385, + "grad_norm": 32.84513977182166, + "learning_rate": 7.381173387970397e-09, + "loss": 0.3919, + "step": 14760 + }, + { + "epoch": 1.895047472414678, + "grad_norm": 17.984131801622738, + "learning_rate": 7.20438761739961e-09, + "loss": 0.3647, + "step": 14770 + }, + { + "epoch": 1.8963305106492174, + "grad_norm": 19.146374790268403, + "learning_rate": 7.029729313862864e-09, + "loss": 0.3902, + "step": 14780 + }, + { + "epoch": 1.8976135488837569, + "grad_norm": 16.865100984509766, + "learning_rate": 6.857199231384281e-09, + "loss": 0.4088, + "step": 14790 + }, + { + "epoch": 1.8988965871182961, + "grad_norm": 23.17593149089719, + "learning_rate": 6.686798114800218e-09, + "loss": 0.3778, + "step": 14800 + }, + { + "epoch": 1.9001796253528354, + "grad_norm": 19.58662165872448, + "learning_rate": 6.5185266997557774e-09, + "loss": 0.3775, + "step": 14810 + }, + { + "epoch": 1.9014626635873748, + "grad_norm": 14.433975515990209, + "learning_rate": 6.3523857127021905e-09, + "loss": 0.3802, + "step": 14820 + }, + { + "epoch": 1.9027457018219143, + "grad_norm": 33.86582475059043, + "learning_rate": 6.188375870892992e-09, + "loss": 0.4132, + "step": 14830 + }, + { + "epoch": 1.9040287400564537, + "grad_norm": 25.82726696578014, + "learning_rate": 6.026497882381521e-09, + "loss": 0.3796, + "step": 14840 + }, + { + "epoch": 1.9053117782909932, + "grad_norm": 17.807378312719816, + "learning_rate": 5.866752446017531e-09, + "loss": 0.3607, + "step": 14850 + }, + { + "epoch": 1.9065948165255324, + "grad_norm": 16.30869097625096, + "learning_rate": 5.7091402514442e-09, + "loss": 0.3537, + "step": 14860 + }, + { + "epoch": 1.9078778547600719, + "grad_norm": 17.326439541826645, + "learning_rate": 5.553661979095181e-09, + "loss": 0.3539, + "step": 14870 + }, + { + "epoch": 1.9091608929946111, + "grad_norm": 27.117597137893636, + "learning_rate": 5.400318300191831e-09, + "loss": 0.397, + "step": 14880 + }, + { + "epoch": 1.9104439312291506, + "grad_norm": 20.129599923041937, + "learning_rate": 5.249109876740099e-09, + "loss": 0.3829, + "step": 14890 + }, + { + "epoch": 1.91172696946369, + "grad_norm": 15.651632830625392, + "learning_rate": 5.100037361527698e-09, + "loss": 0.3803, + "step": 14900 + }, + { + "epoch": 1.9130100076982295, + "grad_norm": 21.153913253239494, + "learning_rate": 4.953101398121273e-09, + "loss": 0.3785, + "step": 14910 + }, + { + "epoch": 1.914293045932769, + "grad_norm": 15.57791470250422, + "learning_rate": 4.808302620863958e-09, + "loss": 0.3357, + "step": 14920 + }, + { + "epoch": 1.9155760841673082, + "grad_norm": 26.992058234265397, + "learning_rate": 4.665641654871988e-09, + "loss": 0.411, + "step": 14930 + }, + { + "epoch": 1.9168591224018474, + "grad_norm": 15.226011512345641, + "learning_rate": 4.5251191160326495e-09, + "loss": 0.4087, + "step": 14940 + }, + { + "epoch": 1.9181421606363869, + "grad_norm": 22.291917286999457, + "learning_rate": 4.386735611001169e-09, + "loss": 0.364, + "step": 14950 + }, + { + "epoch": 1.9194251988709263, + "grad_norm": 27.79320469880846, + "learning_rate": 4.25049173719838e-09, + "loss": 0.3863, + "step": 14960 + }, + { + "epoch": 1.9207082371054658, + "grad_norm": 20.84433160635907, + "learning_rate": 4.116388082808009e-09, + "loss": 0.4086, + "step": 14970 + }, + { + "epoch": 1.9219912753400052, + "grad_norm": 18.59203523437589, + "learning_rate": 3.984425226774113e-09, + "loss": 0.3637, + "step": 14980 + }, + { + "epoch": 1.9232743135745445, + "grad_norm": 24.212637605084907, + "learning_rate": 3.854603738798645e-09, + "loss": 0.4081, + "step": 14990 + }, + { + "epoch": 1.924557351809084, + "grad_norm": 20.083683977608395, + "learning_rate": 3.7269241793390084e-09, + "loss": 0.4041, + "step": 15000 + }, + { + "epoch": 1.9258403900436232, + "grad_norm": 15.382975504010382, + "learning_rate": 3.6013870996055573e-09, + "loss": 0.3883, + "step": 15010 + }, + { + "epoch": 1.9271234282781626, + "grad_norm": 21.532073883869487, + "learning_rate": 3.477993041559213e-09, + "loss": 0.3988, + "step": 15020 + }, + { + "epoch": 1.928406466512702, + "grad_norm": 33.62173546075581, + "learning_rate": 3.356742537909407e-09, + "loss": 0.3785, + "step": 15030 + }, + { + "epoch": 1.9296895047472415, + "grad_norm": 25.37389792480037, + "learning_rate": 3.2376361121112526e-09, + "loss": 0.3884, + "step": 15040 + }, + { + "epoch": 1.930972542981781, + "grad_norm": 26.984759585607772, + "learning_rate": 3.1206742783637662e-09, + "loss": 0.3963, + "step": 15050 + }, + { + "epoch": 1.9322555812163202, + "grad_norm": 32.48680676075061, + "learning_rate": 3.0058575416073707e-09, + "loss": 0.4212, + "step": 15060 + }, + { + "epoch": 1.9335386194508595, + "grad_norm": 20.738477426188325, + "learning_rate": 2.893186397521896e-09, + "loss": 0.3693, + "step": 15070 + }, + { + "epoch": 1.934821657685399, + "grad_norm": 13.818452322258569, + "learning_rate": 2.7826613325243608e-09, + "loss": 0.3731, + "step": 15080 + }, + { + "epoch": 1.9361046959199384, + "grad_norm": 36.323370959513106, + "learning_rate": 2.6742828237666936e-09, + "loss": 0.4075, + "step": 15090 + }, + { + "epoch": 1.9373877341544778, + "grad_norm": 24.63803986584088, + "learning_rate": 2.568051339134014e-09, + "loss": 0.3689, + "step": 15100 + }, + { + "epoch": 1.9386707723890173, + "grad_norm": 20.370241132290193, + "learning_rate": 2.4639673372423563e-09, + "loss": 0.3891, + "step": 15110 + }, + { + "epoch": 1.9399538106235565, + "grad_norm": 18.430147514058884, + "learning_rate": 2.3620312674367816e-09, + "loss": 0.4009, + "step": 15120 + }, + { + "epoch": 1.941236848858096, + "grad_norm": 18.236479644825593, + "learning_rate": 2.26224356978949e-09, + "loss": 0.3785, + "step": 15130 + }, + { + "epoch": 1.9425198870926352, + "grad_norm": 18.471820251246047, + "learning_rate": 2.1646046750978253e-09, + "loss": 0.3813, + "step": 15140 + }, + { + "epoch": 1.9438029253271747, + "grad_norm": 28.21151859853527, + "learning_rate": 2.0691150048823823e-09, + "loss": 0.3949, + "step": 15150 + }, + { + "epoch": 1.9450859635617141, + "grad_norm": 21.40228893359066, + "learning_rate": 1.9757749713853465e-09, + "loss": 0.3417, + "step": 15160 + }, + { + "epoch": 1.9463690017962536, + "grad_norm": 22.09142827652811, + "learning_rate": 1.884584977568604e-09, + "loss": 0.3649, + "step": 15170 + }, + { + "epoch": 1.947652040030793, + "grad_norm": 21.53497191351553, + "learning_rate": 1.7955454171120765e-09, + "loss": 0.375, + "step": 15180 + }, + { + "epoch": 1.9489350782653323, + "grad_norm": 24.63967423533786, + "learning_rate": 1.7086566744117238e-09, + "loss": 0.3833, + "step": 15190 + }, + { + "epoch": 1.9502181164998715, + "grad_norm": 15.864729823311729, + "learning_rate": 1.6239191245784878e-09, + "loss": 0.3619, + "step": 15200 + }, + { + "epoch": 1.951501154734411, + "grad_norm": 31.06751571764041, + "learning_rate": 1.541333133436018e-09, + "loss": 0.3972, + "step": 15210 + }, + { + "epoch": 1.9527841929689504, + "grad_norm": 27.765648004136047, + "learning_rate": 1.4608990575195047e-09, + "loss": 0.3869, + "step": 15220 + }, + { + "epoch": 1.95406723120349, + "grad_norm": 21.691984204612208, + "learning_rate": 1.3826172440741247e-09, + "loss": 0.3876, + "step": 15230 + }, + { + "epoch": 1.9553502694380294, + "grad_norm": 22.106450930006027, + "learning_rate": 1.3064880310531546e-09, + "loss": 0.3635, + "step": 15240 + }, + { + "epoch": 1.9566333076725686, + "grad_norm": 32.682811288591516, + "learning_rate": 1.2325117471171375e-09, + "loss": 0.3883, + "step": 15250 + }, + { + "epoch": 1.957916345907108, + "grad_norm": 21.951542456195014, + "learning_rate": 1.1606887116317743e-09, + "loss": 0.3659, + "step": 15260 + }, + { + "epoch": 1.9591993841416473, + "grad_norm": 15.634192285873588, + "learning_rate": 1.0910192346672565e-09, + "loss": 0.3828, + "step": 15270 + }, + { + "epoch": 1.9604824223761868, + "grad_norm": 25.833025943322415, + "learning_rate": 1.023503616996324e-09, + "loss": 0.3868, + "step": 15280 + }, + { + "epoch": 1.9617654606107262, + "grad_norm": 26.889668226660223, + "learning_rate": 9.581421500931552e-10, + "loss": 0.3699, + "step": 15290 + }, + { + "epoch": 1.9630484988452657, + "grad_norm": 26.347578045782612, + "learning_rate": 8.949351161324225e-10, + "loss": 0.3944, + "step": 15300 + }, + { + "epoch": 1.9643315370798051, + "grad_norm": 28.929706825242125, + "learning_rate": 8.338827879875721e-10, + "loss": 0.3627, + "step": 15310 + }, + { + "epoch": 1.9656145753143444, + "grad_norm": 27.721311972266314, + "learning_rate": 7.749854292300462e-10, + "loss": 0.3792, + "step": 15320 + }, + { + "epoch": 1.9668976135488836, + "grad_norm": 27.326838591421556, + "learning_rate": 7.182432941278405e-10, + "loss": 0.3881, + "step": 15330 + }, + { + "epoch": 1.968180651783423, + "grad_norm": 18.82634222284022, + "learning_rate": 6.636566276446709e-10, + "loss": 0.3833, + "step": 15340 + }, + { + "epoch": 1.9694636900179625, + "grad_norm": 23.882625079454797, + "learning_rate": 6.112256654386971e-10, + "loss": 0.3932, + "step": 15350 + }, + { + "epoch": 1.970746728252502, + "grad_norm": 28.41432072979343, + "learning_rate": 5.609506338617453e-10, + "loss": 0.3665, + "step": 15360 + }, + { + "epoch": 1.9720297664870414, + "grad_norm": 28.6701015479653, + "learning_rate": 5.128317499580315e-10, + "loss": 0.4166, + "step": 15370 + }, + { + "epoch": 1.9733128047215807, + "grad_norm": 20.398846368676065, + "learning_rate": 4.668692214634951e-10, + "loss": 0.3771, + "step": 15380 + }, + { + "epoch": 1.9745958429561201, + "grad_norm": 21.942154811753483, + "learning_rate": 4.2306324680468906e-10, + "loss": 0.3852, + "step": 15390 + }, + { + "epoch": 1.9758788811906594, + "grad_norm": 31.35931840379162, + "learning_rate": 3.814140150981693e-10, + "loss": 0.3694, + "step": 15400 + }, + { + "epoch": 1.9771619194251988, + "grad_norm": 16.46146185964616, + "learning_rate": 3.41921706149384e-10, + "loss": 0.4182, + "step": 15410 + }, + { + "epoch": 1.9784449576597383, + "grad_norm": 23.007646845653685, + "learning_rate": 3.0458649045211894e-10, + "loss": 0.3754, + "step": 15420 + }, + { + "epoch": 1.9797279958942777, + "grad_norm": 29.207251304057124, + "learning_rate": 2.694085291877202e-10, + "loss": 0.4025, + "step": 15430 + }, + { + "epoch": 1.9810110341288172, + "grad_norm": 40.225641240749354, + "learning_rate": 2.363879742243169e-10, + "loss": 0.3967, + "step": 15440 + }, + { + "epoch": 1.9822940723633564, + "grad_norm": 26.370731923423367, + "learning_rate": 2.0552496811615526e-10, + "loss": 0.3537, + "step": 15450 + }, + { + "epoch": 1.9835771105978957, + "grad_norm": 23.297865499847934, + "learning_rate": 1.7681964410320993e-10, + "loss": 0.3625, + "step": 15460 + }, + { + "epoch": 1.9848601488324351, + "grad_norm": 24.457477258438264, + "learning_rate": 1.502721261102957e-10, + "loss": 0.3744, + "step": 15470 + }, + { + "epoch": 1.9861431870669746, + "grad_norm": 20.426639970908088, + "learning_rate": 1.2588252874673466e-10, + "loss": 0.3871, + "step": 15480 + }, + { + "epoch": 1.987426225301514, + "grad_norm": 25.474012639551983, + "learning_rate": 1.0365095730580087e-10, + "loss": 0.3966, + "step": 15490 + }, + { + "epoch": 1.9887092635360535, + "grad_norm": 16.83860087388366, + "learning_rate": 8.357750776427641e-11, + "loss": 0.3445, + "step": 15500 + }, + { + "epoch": 1.9899923017705927, + "grad_norm": 20.53891136914264, + "learning_rate": 6.566226678206277e-11, + "loss": 0.3842, + "step": 15510 + }, + { + "epoch": 1.9912753400051322, + "grad_norm": 23.070185297574486, + "learning_rate": 4.9905311701681216e-11, + "loss": 0.4082, + "step": 15520 + }, + { + "epoch": 1.9925583782396714, + "grad_norm": 23.713986552455705, + "learning_rate": 3.630671054816181e-11, + "loss": 0.3734, + "step": 15530 + }, + { + "epoch": 1.9938414164742109, + "grad_norm": 37.625105856393006, + "learning_rate": 2.4866522028488268e-11, + "loss": 0.3705, + "step": 15540 + }, + { + "epoch": 1.9951244547087503, + "grad_norm": 26.851314689884465, + "learning_rate": 1.558479553154246e-11, + "loss": 0.4131, + "step": 15550 + }, + { + "epoch": 1.9964074929432898, + "grad_norm": 31.00346544997302, + "learning_rate": 8.461571127882373e-12, + "loss": 0.4169, + "step": 15560 + }, + { + "epoch": 1.9976905311778292, + "grad_norm": 18.298413776221132, + "learning_rate": 3.4968795694645393e-12, + "loss": 0.3661, + "step": 15570 + }, + { + "epoch": 1.9989735694123685, + "grad_norm": 15.338907881969591, + "learning_rate": 6.907422894775195e-13, + "loss": 0.3812, + "step": 15580 + }, + { + "epoch": 2.0, + "step": 15588, + "total_flos": 1369483242635264.0, + "train_loss": 0.5269709559382614, + "train_runtime": 109635.7262, + "train_samples_per_second": 1.137, + "train_steps_per_second": 0.142 + } + ], + "logging_steps": 10, + "max_steps": 15588, + "num_input_tokens_seen": 0, + "num_train_epochs": 2, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 1369483242635264.0, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +}