diff --git "a/last-checkpoint/trainer_state.json" "b/last-checkpoint/trainer_state.json" new file mode 100644--- /dev/null +++ "b/last-checkpoint/trainer_state.json" @@ -0,0 +1,15232 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.27506654835847383, + "eval_steps": 2170, + "global_step": 2170, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.00012675877804537965, + "grad_norm": 2.328125, + "learning_rate": 0.0, + "loss": 2.9836, + "step": 1 + }, + { + "epoch": 0.0002535175560907593, + "grad_norm": 2.734375, + "learning_rate": 2.7649769585253456e-07, + "loss": 3.1814, + "step": 2 + }, + { + "epoch": 0.0003802763341361389, + "grad_norm": 3.21875, + "learning_rate": 5.529953917050691e-07, + "loss": 3.3287, + "step": 3 + }, + { + "epoch": 0.0005070351121815186, + "grad_norm": 2.8125, + "learning_rate": 8.294930875576038e-07, + "loss": 3.162, + "step": 4 + }, + { + "epoch": 0.0006337938902268982, + "grad_norm": 2.359375, + "learning_rate": 1.1059907834101382e-06, + "loss": 3.1274, + "step": 5 + }, + { + "epoch": 0.0007605526682722779, + "grad_norm": 2.28125, + "learning_rate": 1.3824884792626729e-06, + "loss": 3.4527, + "step": 6 + }, + { + "epoch": 0.0008873114463176575, + "grad_norm": 3.25, + "learning_rate": 1.6589861751152075e-06, + "loss": 3.134, + "step": 7 + }, + { + "epoch": 0.0010140702243630372, + "grad_norm": 2.25, + "learning_rate": 1.935483870967742e-06, + "loss": 3.1991, + "step": 8 + }, + { + "epoch": 0.0011408290024084169, + "grad_norm": 2.890625, + "learning_rate": 2.2119815668202764e-06, + "loss": 3.4574, + "step": 9 + }, + { + "epoch": 0.0012675877804537963, + "grad_norm": 2.921875, + "learning_rate": 2.4884792626728113e-06, + "loss": 2.6411, + "step": 10 + }, + { + "epoch": 0.001394346558499176, + "grad_norm": 2.75, + "learning_rate": 2.7649769585253458e-06, + "loss": 3.4507, + "step": 11 + }, + { + "epoch": 0.0015211053365445557, + "grad_norm": 2.609375, + "learning_rate": 3.0414746543778802e-06, + "loss": 3.0045, + "step": 12 + }, + { + "epoch": 0.0016478641145899354, + "grad_norm": 2.375, + "learning_rate": 3.317972350230415e-06, + "loss": 3.2921, + "step": 13 + }, + { + "epoch": 0.001774622892635315, + "grad_norm": 2.875, + "learning_rate": 3.594470046082949e-06, + "loss": 2.8938, + "step": 14 + }, + { + "epoch": 0.0019013816706806947, + "grad_norm": 2.15625, + "learning_rate": 3.870967741935484e-06, + "loss": 3.0096, + "step": 15 + }, + { + "epoch": 0.0020281404487260744, + "grad_norm": 2.65625, + "learning_rate": 4.147465437788019e-06, + "loss": 3.0827, + "step": 16 + }, + { + "epoch": 0.002154899226771454, + "grad_norm": 2.28125, + "learning_rate": 4.423963133640553e-06, + "loss": 2.9214, + "step": 17 + }, + { + "epoch": 0.0022816580048168338, + "grad_norm": 2.640625, + "learning_rate": 4.700460829493087e-06, + "loss": 3.0053, + "step": 18 + }, + { + "epoch": 0.002408416782862213, + "grad_norm": 1.984375, + "learning_rate": 4.976958525345623e-06, + "loss": 2.8434, + "step": 19 + }, + { + "epoch": 0.0025351755609075927, + "grad_norm": 2.53125, + "learning_rate": 5.253456221198157e-06, + "loss": 3.1882, + "step": 20 + }, + { + "epoch": 0.0026619343389529724, + "grad_norm": 2.171875, + "learning_rate": 5.5299539170506915e-06, + "loss": 3.3649, + "step": 21 + }, + { + "epoch": 0.002788693116998352, + "grad_norm": 2.703125, + "learning_rate": 5.8064516129032256e-06, + "loss": 4.069, + "step": 22 + }, + { + "epoch": 0.0029154518950437317, + "grad_norm": 2.28125, + "learning_rate": 6.0829493087557604e-06, + "loss": 3.3636, + "step": 23 + }, + { + "epoch": 0.0030422106730891114, + "grad_norm": 2.625, + "learning_rate": 6.359447004608295e-06, + "loss": 3.2612, + "step": 24 + }, + { + "epoch": 0.003168969451134491, + "grad_norm": 2.515625, + "learning_rate": 6.63594470046083e-06, + "loss": 3.2926, + "step": 25 + }, + { + "epoch": 0.0032957282291798708, + "grad_norm": 2.375, + "learning_rate": 6.912442396313364e-06, + "loss": 3.5472, + "step": 26 + }, + { + "epoch": 0.0034224870072252504, + "grad_norm": 2.609375, + "learning_rate": 7.188940092165898e-06, + "loss": 3.345, + "step": 27 + }, + { + "epoch": 0.00354924578527063, + "grad_norm": 2.578125, + "learning_rate": 7.465437788018433e-06, + "loss": 2.7514, + "step": 28 + }, + { + "epoch": 0.00367600456331601, + "grad_norm": 1.8828125, + "learning_rate": 7.741935483870968e-06, + "loss": 2.7626, + "step": 29 + }, + { + "epoch": 0.0038027633413613895, + "grad_norm": 2.421875, + "learning_rate": 8.018433179723503e-06, + "loss": 2.8706, + "step": 30 + }, + { + "epoch": 0.003929522119406769, + "grad_norm": 2.546875, + "learning_rate": 8.294930875576038e-06, + "loss": 3.2619, + "step": 31 + }, + { + "epoch": 0.004056280897452149, + "grad_norm": 2.78125, + "learning_rate": 8.571428571428571e-06, + "loss": 3.3856, + "step": 32 + }, + { + "epoch": 0.004183039675497528, + "grad_norm": 2.1875, + "learning_rate": 8.847926267281106e-06, + "loss": 3.0388, + "step": 33 + }, + { + "epoch": 0.004309798453542908, + "grad_norm": 1.9296875, + "learning_rate": 9.12442396313364e-06, + "loss": 2.8448, + "step": 34 + }, + { + "epoch": 0.0044365572315882874, + "grad_norm": 2.390625, + "learning_rate": 9.400921658986174e-06, + "loss": 3.1973, + "step": 35 + }, + { + "epoch": 0.0045633160096336675, + "grad_norm": 1.671875, + "learning_rate": 9.67741935483871e-06, + "loss": 2.9847, + "step": 36 + }, + { + "epoch": 0.004690074787679047, + "grad_norm": 2.59375, + "learning_rate": 9.953917050691245e-06, + "loss": 3.7099, + "step": 37 + }, + { + "epoch": 0.004816833565724426, + "grad_norm": 2.3125, + "learning_rate": 1.023041474654378e-05, + "loss": 3.0622, + "step": 38 + }, + { + "epoch": 0.004943592343769806, + "grad_norm": 2.15625, + "learning_rate": 1.0506912442396313e-05, + "loss": 2.7991, + "step": 39 + }, + { + "epoch": 0.005070351121815185, + "grad_norm": 2.28125, + "learning_rate": 1.0783410138248848e-05, + "loss": 2.8201, + "step": 40 + }, + { + "epoch": 0.0051971098998605655, + "grad_norm": 1.9921875, + "learning_rate": 1.1059907834101383e-05, + "loss": 3.2719, + "step": 41 + }, + { + "epoch": 0.005323868677905945, + "grad_norm": 1.734375, + "learning_rate": 1.1336405529953916e-05, + "loss": 3.2083, + "step": 42 + }, + { + "epoch": 0.005450627455951325, + "grad_norm": 2.359375, + "learning_rate": 1.1612903225806451e-05, + "loss": 2.871, + "step": 43 + }, + { + "epoch": 0.005577386233996704, + "grad_norm": 1.96875, + "learning_rate": 1.1889400921658986e-05, + "loss": 2.6081, + "step": 44 + }, + { + "epoch": 0.005704145012042084, + "grad_norm": 2.375, + "learning_rate": 1.2165898617511521e-05, + "loss": 3.0757, + "step": 45 + }, + { + "epoch": 0.0058309037900874635, + "grad_norm": 2.609375, + "learning_rate": 1.2442396313364056e-05, + "loss": 2.806, + "step": 46 + }, + { + "epoch": 0.005957662568132844, + "grad_norm": 2.484375, + "learning_rate": 1.271889400921659e-05, + "loss": 3.236, + "step": 47 + }, + { + "epoch": 0.006084421346178223, + "grad_norm": 2.328125, + "learning_rate": 1.2995391705069126e-05, + "loss": 3.3716, + "step": 48 + }, + { + "epoch": 0.006211180124223602, + "grad_norm": 1.6953125, + "learning_rate": 1.327188940092166e-05, + "loss": 2.9337, + "step": 49 + }, + { + "epoch": 0.006337938902268982, + "grad_norm": 2.40625, + "learning_rate": 1.3548387096774194e-05, + "loss": 3.1984, + "step": 50 + }, + { + "epoch": 0.006464697680314361, + "grad_norm": 1.71875, + "learning_rate": 1.3824884792626728e-05, + "loss": 2.7284, + "step": 51 + }, + { + "epoch": 0.0065914564583597415, + "grad_norm": 1.8046875, + "learning_rate": 1.4101382488479263e-05, + "loss": 2.902, + "step": 52 + }, + { + "epoch": 0.006718215236405121, + "grad_norm": 2.3125, + "learning_rate": 1.4377880184331796e-05, + "loss": 3.1462, + "step": 53 + }, + { + "epoch": 0.006844974014450501, + "grad_norm": 2.359375, + "learning_rate": 1.4654377880184331e-05, + "loss": 2.921, + "step": 54 + }, + { + "epoch": 0.00697173279249588, + "grad_norm": 2.5625, + "learning_rate": 1.4930875576036866e-05, + "loss": 2.8419, + "step": 55 + }, + { + "epoch": 0.00709849157054126, + "grad_norm": 1.875, + "learning_rate": 1.5207373271889403e-05, + "loss": 2.9957, + "step": 56 + }, + { + "epoch": 0.0072252503485866395, + "grad_norm": 2.0625, + "learning_rate": 1.5483870967741936e-05, + "loss": 2.7474, + "step": 57 + }, + { + "epoch": 0.00735200912663202, + "grad_norm": 2.59375, + "learning_rate": 1.576036866359447e-05, + "loss": 3.5225, + "step": 58 + }, + { + "epoch": 0.007478767904677399, + "grad_norm": 2.4375, + "learning_rate": 1.6036866359447006e-05, + "loss": 2.9143, + "step": 59 + }, + { + "epoch": 0.007605526682722779, + "grad_norm": 1.859375, + "learning_rate": 1.631336405529954e-05, + "loss": 2.8144, + "step": 60 + }, + { + "epoch": 0.007732285460768158, + "grad_norm": 2.03125, + "learning_rate": 1.6589861751152075e-05, + "loss": 2.9237, + "step": 61 + }, + { + "epoch": 0.007859044238813537, + "grad_norm": 2.390625, + "learning_rate": 1.686635944700461e-05, + "loss": 3.0369, + "step": 62 + }, + { + "epoch": 0.007985803016858917, + "grad_norm": 2.375, + "learning_rate": 1.7142857142857142e-05, + "loss": 2.514, + "step": 63 + }, + { + "epoch": 0.008112561794904298, + "grad_norm": 2.734375, + "learning_rate": 1.741935483870968e-05, + "loss": 2.7245, + "step": 64 + }, + { + "epoch": 0.008239320572949677, + "grad_norm": 2.359375, + "learning_rate": 1.769585253456221e-05, + "loss": 2.7239, + "step": 65 + }, + { + "epoch": 0.008366079350995056, + "grad_norm": 1.9375, + "learning_rate": 1.7972350230414745e-05, + "loss": 2.9843, + "step": 66 + }, + { + "epoch": 0.008492838129040435, + "grad_norm": 2.34375, + "learning_rate": 1.824884792626728e-05, + "loss": 2.9487, + "step": 67 + }, + { + "epoch": 0.008619596907085816, + "grad_norm": 2.0625, + "learning_rate": 1.8525345622119815e-05, + "loss": 3.063, + "step": 68 + }, + { + "epoch": 0.008746355685131196, + "grad_norm": 2.0, + "learning_rate": 1.8801843317972348e-05, + "loss": 2.6854, + "step": 69 + }, + { + "epoch": 0.008873114463176575, + "grad_norm": 2.203125, + "learning_rate": 1.9078341013824884e-05, + "loss": 2.663, + "step": 70 + }, + { + "epoch": 0.008999873241221954, + "grad_norm": 1.90625, + "learning_rate": 1.935483870967742e-05, + "loss": 2.9695, + "step": 71 + }, + { + "epoch": 0.009126632019267335, + "grad_norm": 1.8359375, + "learning_rate": 1.9631336405529957e-05, + "loss": 2.4734, + "step": 72 + }, + { + "epoch": 0.009253390797312714, + "grad_norm": 3.015625, + "learning_rate": 1.990783410138249e-05, + "loss": 2.9538, + "step": 73 + }, + { + "epoch": 0.009380149575358094, + "grad_norm": 2.015625, + "learning_rate": 2.0184331797235024e-05, + "loss": 2.6884, + "step": 74 + }, + { + "epoch": 0.009506908353403473, + "grad_norm": 3.109375, + "learning_rate": 2.046082949308756e-05, + "loss": 2.6867, + "step": 75 + }, + { + "epoch": 0.009633667131448852, + "grad_norm": 1.5390625, + "learning_rate": 2.0737327188940094e-05, + "loss": 2.5821, + "step": 76 + }, + { + "epoch": 0.009760425909494233, + "grad_norm": 1.890625, + "learning_rate": 2.1013824884792627e-05, + "loss": 2.3303, + "step": 77 + }, + { + "epoch": 0.009887184687539612, + "grad_norm": 1.3515625, + "learning_rate": 2.1290322580645163e-05, + "loss": 2.6871, + "step": 78 + }, + { + "epoch": 0.010013943465584992, + "grad_norm": 2.390625, + "learning_rate": 2.1566820276497696e-05, + "loss": 2.8288, + "step": 79 + }, + { + "epoch": 0.01014070224363037, + "grad_norm": 1.4296875, + "learning_rate": 2.184331797235023e-05, + "loss": 2.5753, + "step": 80 + }, + { + "epoch": 0.010267461021675752, + "grad_norm": 1.3671875, + "learning_rate": 2.2119815668202766e-05, + "loss": 2.9023, + "step": 81 + }, + { + "epoch": 0.010394219799721131, + "grad_norm": 1.453125, + "learning_rate": 2.23963133640553e-05, + "loss": 2.7125, + "step": 82 + }, + { + "epoch": 0.01052097857776651, + "grad_norm": 1.4765625, + "learning_rate": 2.2672811059907833e-05, + "loss": 2.9662, + "step": 83 + }, + { + "epoch": 0.01064773735581189, + "grad_norm": 1.625, + "learning_rate": 2.294930875576037e-05, + "loss": 2.606, + "step": 84 + }, + { + "epoch": 0.01077449613385727, + "grad_norm": 1.8671875, + "learning_rate": 2.3225806451612902e-05, + "loss": 2.6132, + "step": 85 + }, + { + "epoch": 0.01090125491190265, + "grad_norm": 1.640625, + "learning_rate": 2.350230414746544e-05, + "loss": 2.3759, + "step": 86 + }, + { + "epoch": 0.011028013689948029, + "grad_norm": 2.40625, + "learning_rate": 2.3778801843317972e-05, + "loss": 2.4852, + "step": 87 + }, + { + "epoch": 0.011154772467993408, + "grad_norm": 1.375, + "learning_rate": 2.4055299539170505e-05, + "loss": 2.3836, + "step": 88 + }, + { + "epoch": 0.011281531246038787, + "grad_norm": 1.6796875, + "learning_rate": 2.4331797235023042e-05, + "loss": 2.8551, + "step": 89 + }, + { + "epoch": 0.011408290024084168, + "grad_norm": 1.265625, + "learning_rate": 2.460829493087558e-05, + "loss": 2.3646, + "step": 90 + }, + { + "epoch": 0.011535048802129548, + "grad_norm": 1.546875, + "learning_rate": 2.488479262672811e-05, + "loss": 2.247, + "step": 91 + }, + { + "epoch": 0.011661807580174927, + "grad_norm": 1.28125, + "learning_rate": 2.5161290322580648e-05, + "loss": 2.5951, + "step": 92 + }, + { + "epoch": 0.011788566358220306, + "grad_norm": 1.625, + "learning_rate": 2.543778801843318e-05, + "loss": 2.4882, + "step": 93 + }, + { + "epoch": 0.011915325136265687, + "grad_norm": 1.8125, + "learning_rate": 2.5714285714285714e-05, + "loss": 2.5159, + "step": 94 + }, + { + "epoch": 0.012042083914311066, + "grad_norm": 1.3125, + "learning_rate": 2.599078341013825e-05, + "loss": 2.5952, + "step": 95 + }, + { + "epoch": 0.012168842692356446, + "grad_norm": 1.6875, + "learning_rate": 2.6267281105990784e-05, + "loss": 2.3814, + "step": 96 + }, + { + "epoch": 0.012295601470401825, + "grad_norm": 1.109375, + "learning_rate": 2.654377880184332e-05, + "loss": 2.6268, + "step": 97 + }, + { + "epoch": 0.012422360248447204, + "grad_norm": 1.71875, + "learning_rate": 2.6820276497695854e-05, + "loss": 2.7113, + "step": 98 + }, + { + "epoch": 0.012549119026492585, + "grad_norm": 1.2890625, + "learning_rate": 2.7096774193548387e-05, + "loss": 2.0994, + "step": 99 + }, + { + "epoch": 0.012675877804537964, + "grad_norm": 1.65625, + "learning_rate": 2.7373271889400924e-05, + "loss": 2.1403, + "step": 100 + }, + { + "epoch": 0.012802636582583344, + "grad_norm": 1.3671875, + "learning_rate": 2.7649769585253457e-05, + "loss": 2.1037, + "step": 101 + }, + { + "epoch": 0.012929395360628723, + "grad_norm": 1.5078125, + "learning_rate": 2.792626728110599e-05, + "loss": 2.3441, + "step": 102 + }, + { + "epoch": 0.013056154138674104, + "grad_norm": 1.5703125, + "learning_rate": 2.8202764976958527e-05, + "loss": 2.4583, + "step": 103 + }, + { + "epoch": 0.013182912916719483, + "grad_norm": 1.3671875, + "learning_rate": 2.847926267281106e-05, + "loss": 2.278, + "step": 104 + }, + { + "epoch": 0.013309671694764862, + "grad_norm": 1.296875, + "learning_rate": 2.8755760368663593e-05, + "loss": 2.212, + "step": 105 + }, + { + "epoch": 0.013436430472810242, + "grad_norm": 1.390625, + "learning_rate": 2.903225806451613e-05, + "loss": 2.0267, + "step": 106 + }, + { + "epoch": 0.013563189250855623, + "grad_norm": 1.3671875, + "learning_rate": 2.9308755760368663e-05, + "loss": 2.1103, + "step": 107 + }, + { + "epoch": 0.013689948028901002, + "grad_norm": 1.3671875, + "learning_rate": 2.9585253456221196e-05, + "loss": 2.2625, + "step": 108 + }, + { + "epoch": 0.013816706806946381, + "grad_norm": 2.296875, + "learning_rate": 2.9861751152073732e-05, + "loss": 2.0816, + "step": 109 + }, + { + "epoch": 0.01394346558499176, + "grad_norm": 1.3125, + "learning_rate": 3.0138248847926272e-05, + "loss": 2.191, + "step": 110 + }, + { + "epoch": 0.01407022436303714, + "grad_norm": 1.265625, + "learning_rate": 3.0414746543778806e-05, + "loss": 2.4229, + "step": 111 + }, + { + "epoch": 0.01419698314108252, + "grad_norm": 1.3828125, + "learning_rate": 3.0691244239631335e-05, + "loss": 2.3382, + "step": 112 + }, + { + "epoch": 0.0143237419191279, + "grad_norm": 1.234375, + "learning_rate": 3.096774193548387e-05, + "loss": 2.3096, + "step": 113 + }, + { + "epoch": 0.014450500697173279, + "grad_norm": 1.265625, + "learning_rate": 3.12442396313364e-05, + "loss": 2.5878, + "step": 114 + }, + { + "epoch": 0.014577259475218658, + "grad_norm": 1.28125, + "learning_rate": 3.152073732718894e-05, + "loss": 2.0876, + "step": 115 + }, + { + "epoch": 0.01470401825326404, + "grad_norm": 1.2109375, + "learning_rate": 3.1797235023041475e-05, + "loss": 2.1557, + "step": 116 + }, + { + "epoch": 0.014830777031309418, + "grad_norm": 1.0546875, + "learning_rate": 3.207373271889401e-05, + "loss": 2.1464, + "step": 117 + }, + { + "epoch": 0.014957535809354798, + "grad_norm": 1.171875, + "learning_rate": 3.235023041474654e-05, + "loss": 2.7055, + "step": 118 + }, + { + "epoch": 0.015084294587400177, + "grad_norm": 1.25, + "learning_rate": 3.262672811059908e-05, + "loss": 2.5028, + "step": 119 + }, + { + "epoch": 0.015211053365445558, + "grad_norm": 1.5, + "learning_rate": 3.2903225806451614e-05, + "loss": 2.3636, + "step": 120 + }, + { + "epoch": 0.015337812143490937, + "grad_norm": 1.265625, + "learning_rate": 3.317972350230415e-05, + "loss": 2.5675, + "step": 121 + }, + { + "epoch": 0.015464570921536316, + "grad_norm": 1.4140625, + "learning_rate": 3.345622119815669e-05, + "loss": 2.2406, + "step": 122 + }, + { + "epoch": 0.015591329699581696, + "grad_norm": 1.3125, + "learning_rate": 3.373271889400922e-05, + "loss": 2.3789, + "step": 123 + }, + { + "epoch": 0.015718088477627075, + "grad_norm": 1.234375, + "learning_rate": 3.4009216589861754e-05, + "loss": 2.4767, + "step": 124 + }, + { + "epoch": 0.015844847255672454, + "grad_norm": 1.265625, + "learning_rate": 3.4285714285714284e-05, + "loss": 2.3279, + "step": 125 + }, + { + "epoch": 0.015971606033717833, + "grad_norm": 1.265625, + "learning_rate": 3.456221198156682e-05, + "loss": 2.2047, + "step": 126 + }, + { + "epoch": 0.016098364811763216, + "grad_norm": 1.0390625, + "learning_rate": 3.483870967741936e-05, + "loss": 2.8244, + "step": 127 + }, + { + "epoch": 0.016225123589808595, + "grad_norm": 1.4140625, + "learning_rate": 3.511520737327189e-05, + "loss": 1.9411, + "step": 128 + }, + { + "epoch": 0.016351882367853975, + "grad_norm": 1.328125, + "learning_rate": 3.539170506912442e-05, + "loss": 2.1226, + "step": 129 + }, + { + "epoch": 0.016478641145899354, + "grad_norm": 1.2265625, + "learning_rate": 3.566820276497696e-05, + "loss": 2.353, + "step": 130 + }, + { + "epoch": 0.016605399923944733, + "grad_norm": 1.3046875, + "learning_rate": 3.594470046082949e-05, + "loss": 2.2297, + "step": 131 + }, + { + "epoch": 0.016732158701990112, + "grad_norm": 1.1953125, + "learning_rate": 3.622119815668203e-05, + "loss": 2.6692, + "step": 132 + }, + { + "epoch": 0.01685891748003549, + "grad_norm": 1.1015625, + "learning_rate": 3.649769585253456e-05, + "loss": 2.0721, + "step": 133 + }, + { + "epoch": 0.01698567625808087, + "grad_norm": 1.2890625, + "learning_rate": 3.67741935483871e-05, + "loss": 2.0208, + "step": 134 + }, + { + "epoch": 0.01711243503612625, + "grad_norm": 1.1484375, + "learning_rate": 3.705069124423963e-05, + "loss": 2.6096, + "step": 135 + }, + { + "epoch": 0.017239193814171633, + "grad_norm": 1.2421875, + "learning_rate": 3.7327188940092166e-05, + "loss": 2.4566, + "step": 136 + }, + { + "epoch": 0.017365952592217012, + "grad_norm": 1.234375, + "learning_rate": 3.7603686635944695e-05, + "loss": 2.1071, + "step": 137 + }, + { + "epoch": 0.01749271137026239, + "grad_norm": 1.140625, + "learning_rate": 3.788018433179724e-05, + "loss": 2.3634, + "step": 138 + }, + { + "epoch": 0.01761947014830777, + "grad_norm": 1.2109375, + "learning_rate": 3.815668202764977e-05, + "loss": 1.7798, + "step": 139 + }, + { + "epoch": 0.01774622892635315, + "grad_norm": 1.171875, + "learning_rate": 3.8433179723502305e-05, + "loss": 2.5508, + "step": 140 + }, + { + "epoch": 0.01787298770439853, + "grad_norm": 1.1328125, + "learning_rate": 3.870967741935484e-05, + "loss": 2.1198, + "step": 141 + }, + { + "epoch": 0.017999746482443908, + "grad_norm": 1.171875, + "learning_rate": 3.898617511520737e-05, + "loss": 2.2581, + "step": 142 + }, + { + "epoch": 0.018126505260489287, + "grad_norm": 1.15625, + "learning_rate": 3.9262672811059915e-05, + "loss": 2.1564, + "step": 143 + }, + { + "epoch": 0.01825326403853467, + "grad_norm": 1.125, + "learning_rate": 3.9539170506912445e-05, + "loss": 2.2281, + "step": 144 + }, + { + "epoch": 0.01838002281658005, + "grad_norm": 1.1015625, + "learning_rate": 3.981566820276498e-05, + "loss": 2.4633, + "step": 145 + }, + { + "epoch": 0.01850678159462543, + "grad_norm": 1.1484375, + "learning_rate": 4.009216589861751e-05, + "loss": 2.0364, + "step": 146 + }, + { + "epoch": 0.018633540372670808, + "grad_norm": 1.0859375, + "learning_rate": 4.036866359447005e-05, + "loss": 1.9947, + "step": 147 + }, + { + "epoch": 0.018760299150716187, + "grad_norm": 1.2734375, + "learning_rate": 4.064516129032258e-05, + "loss": 2.0986, + "step": 148 + }, + { + "epoch": 0.018887057928761566, + "grad_norm": 1.2421875, + "learning_rate": 4.092165898617512e-05, + "loss": 2.432, + "step": 149 + }, + { + "epoch": 0.019013816706806946, + "grad_norm": 1.1640625, + "learning_rate": 4.119815668202765e-05, + "loss": 2.6983, + "step": 150 + }, + { + "epoch": 0.019140575484852325, + "grad_norm": 1.1796875, + "learning_rate": 4.147465437788019e-05, + "loss": 1.8463, + "step": 151 + }, + { + "epoch": 0.019267334262897704, + "grad_norm": 0.98046875, + "learning_rate": 4.175115207373272e-05, + "loss": 2.1697, + "step": 152 + }, + { + "epoch": 0.019394093040943087, + "grad_norm": 1.28125, + "learning_rate": 4.202764976958525e-05, + "loss": 2.4353, + "step": 153 + }, + { + "epoch": 0.019520851818988466, + "grad_norm": 1.234375, + "learning_rate": 4.230414746543778e-05, + "loss": 2.7946, + "step": 154 + }, + { + "epoch": 0.019647610597033845, + "grad_norm": 1.0859375, + "learning_rate": 4.2580645161290327e-05, + "loss": 1.9568, + "step": 155 + }, + { + "epoch": 0.019774369375079225, + "grad_norm": 1.015625, + "learning_rate": 4.2857142857142856e-05, + "loss": 2.6783, + "step": 156 + }, + { + "epoch": 0.019901128153124604, + "grad_norm": 1.484375, + "learning_rate": 4.313364055299539e-05, + "loss": 2.4218, + "step": 157 + }, + { + "epoch": 0.020027886931169983, + "grad_norm": 1.1171875, + "learning_rate": 4.341013824884792e-05, + "loss": 2.4173, + "step": 158 + }, + { + "epoch": 0.020154645709215362, + "grad_norm": 1.1328125, + "learning_rate": 4.368663594470046e-05, + "loss": 2.422, + "step": 159 + }, + { + "epoch": 0.02028140448726074, + "grad_norm": 1.5859375, + "learning_rate": 4.3963133640553e-05, + "loss": 2.5844, + "step": 160 + }, + { + "epoch": 0.02040816326530612, + "grad_norm": 1.15625, + "learning_rate": 4.423963133640553e-05, + "loss": 2.7378, + "step": 161 + }, + { + "epoch": 0.020534922043351504, + "grad_norm": 0.9921875, + "learning_rate": 4.451612903225807e-05, + "loss": 2.1257, + "step": 162 + }, + { + "epoch": 0.020661680821396883, + "grad_norm": 1.203125, + "learning_rate": 4.47926267281106e-05, + "loss": 2.3838, + "step": 163 + }, + { + "epoch": 0.020788439599442262, + "grad_norm": 1.34375, + "learning_rate": 4.5069124423963135e-05, + "loss": 1.8546, + "step": 164 + }, + { + "epoch": 0.02091519837748764, + "grad_norm": 1.1953125, + "learning_rate": 4.5345622119815665e-05, + "loss": 2.0101, + "step": 165 + }, + { + "epoch": 0.02104195715553302, + "grad_norm": 1.1484375, + "learning_rate": 4.562211981566821e-05, + "loss": 2.4907, + "step": 166 + }, + { + "epoch": 0.0211687159335784, + "grad_norm": 1.25, + "learning_rate": 4.589861751152074e-05, + "loss": 2.028, + "step": 167 + }, + { + "epoch": 0.02129547471162378, + "grad_norm": 1.3515625, + "learning_rate": 4.6175115207373275e-05, + "loss": 2.1113, + "step": 168 + }, + { + "epoch": 0.021422233489669158, + "grad_norm": 1.4921875, + "learning_rate": 4.6451612903225805e-05, + "loss": 1.9932, + "step": 169 + }, + { + "epoch": 0.02154899226771454, + "grad_norm": 1.203125, + "learning_rate": 4.672811059907834e-05, + "loss": 2.5246, + "step": 170 + }, + { + "epoch": 0.02167575104575992, + "grad_norm": 1.2421875, + "learning_rate": 4.700460829493088e-05, + "loss": 2.5694, + "step": 171 + }, + { + "epoch": 0.0218025098238053, + "grad_norm": 1.15625, + "learning_rate": 4.7281105990783414e-05, + "loss": 1.9679, + "step": 172 + }, + { + "epoch": 0.02192926860185068, + "grad_norm": 1.1875, + "learning_rate": 4.7557603686635944e-05, + "loss": 2.281, + "step": 173 + }, + { + "epoch": 0.022056027379896058, + "grad_norm": 1.28125, + "learning_rate": 4.783410138248848e-05, + "loss": 2.2681, + "step": 174 + }, + { + "epoch": 0.022182786157941437, + "grad_norm": 1.1875, + "learning_rate": 4.811059907834101e-05, + "loss": 2.0401, + "step": 175 + }, + { + "epoch": 0.022309544935986816, + "grad_norm": 1.5859375, + "learning_rate": 4.838709677419355e-05, + "loss": 2.3099, + "step": 176 + }, + { + "epoch": 0.022436303714032196, + "grad_norm": 1.125, + "learning_rate": 4.8663594470046084e-05, + "loss": 3.075, + "step": 177 + }, + { + "epoch": 0.022563062492077575, + "grad_norm": 1.3203125, + "learning_rate": 4.894009216589862e-05, + "loss": 2.2041, + "step": 178 + }, + { + "epoch": 0.022689821270122958, + "grad_norm": 1.265625, + "learning_rate": 4.921658986175116e-05, + "loss": 2.7096, + "step": 179 + }, + { + "epoch": 0.022816580048168337, + "grad_norm": 1.2734375, + "learning_rate": 4.9493087557603686e-05, + "loss": 2.4346, + "step": 180 + }, + { + "epoch": 0.022943338826213716, + "grad_norm": 1.1171875, + "learning_rate": 4.976958525345622e-05, + "loss": 2.1963, + "step": 181 + }, + { + "epoch": 0.023070097604259095, + "grad_norm": 1.4453125, + "learning_rate": 5.004608294930876e-05, + "loss": 2.5156, + "step": 182 + }, + { + "epoch": 0.023196856382304475, + "grad_norm": 1.1796875, + "learning_rate": 5.0322580645161296e-05, + "loss": 2.0192, + "step": 183 + }, + { + "epoch": 0.023323615160349854, + "grad_norm": 1.1484375, + "learning_rate": 5.0599078341013826e-05, + "loss": 2.6039, + "step": 184 + }, + { + "epoch": 0.023450373938395233, + "grad_norm": 1.1171875, + "learning_rate": 5.087557603686636e-05, + "loss": 1.9497, + "step": 185 + }, + { + "epoch": 0.023577132716440612, + "grad_norm": 1.1953125, + "learning_rate": 5.115207373271889e-05, + "loss": 2.2239, + "step": 186 + }, + { + "epoch": 0.02370389149448599, + "grad_norm": 1.0, + "learning_rate": 5.142857142857143e-05, + "loss": 2.3346, + "step": 187 + }, + { + "epoch": 0.023830650272531374, + "grad_norm": 1.1015625, + "learning_rate": 5.1705069124423965e-05, + "loss": 2.5889, + "step": 188 + }, + { + "epoch": 0.023957409050576754, + "grad_norm": 1.046875, + "learning_rate": 5.19815668202765e-05, + "loss": 2.2847, + "step": 189 + }, + { + "epoch": 0.024084167828622133, + "grad_norm": 0.9375, + "learning_rate": 5.225806451612903e-05, + "loss": 2.385, + "step": 190 + }, + { + "epoch": 0.024210926606667512, + "grad_norm": 1.03125, + "learning_rate": 5.253456221198157e-05, + "loss": 2.0346, + "step": 191 + }, + { + "epoch": 0.02433768538471289, + "grad_norm": 1.0390625, + "learning_rate": 5.28110599078341e-05, + "loss": 2.5936, + "step": 192 + }, + { + "epoch": 0.02446444416275827, + "grad_norm": 1.1484375, + "learning_rate": 5.308755760368664e-05, + "loss": 2.4177, + "step": 193 + }, + { + "epoch": 0.02459120294080365, + "grad_norm": 0.96484375, + "learning_rate": 5.336405529953917e-05, + "loss": 2.28, + "step": 194 + }, + { + "epoch": 0.02471796171884903, + "grad_norm": 1.2109375, + "learning_rate": 5.364055299539171e-05, + "loss": 2.0162, + "step": 195 + }, + { + "epoch": 0.024844720496894408, + "grad_norm": 1.1796875, + "learning_rate": 5.391705069124424e-05, + "loss": 2.3761, + "step": 196 + }, + { + "epoch": 0.02497147927493979, + "grad_norm": 1.125, + "learning_rate": 5.4193548387096774e-05, + "loss": 2.2806, + "step": 197 + }, + { + "epoch": 0.02509823805298517, + "grad_norm": 1.1796875, + "learning_rate": 5.4470046082949304e-05, + "loss": 2.5046, + "step": 198 + }, + { + "epoch": 0.02522499683103055, + "grad_norm": 1.234375, + "learning_rate": 5.474654377880185e-05, + "loss": 2.7865, + "step": 199 + }, + { + "epoch": 0.02535175560907593, + "grad_norm": 1.078125, + "learning_rate": 5.5023041474654384e-05, + "loss": 2.6493, + "step": 200 + }, + { + "epoch": 0.025478514387121308, + "grad_norm": 1.203125, + "learning_rate": 5.5299539170506914e-05, + "loss": 2.0132, + "step": 201 + }, + { + "epoch": 0.025605273165166687, + "grad_norm": 1.203125, + "learning_rate": 5.557603686635945e-05, + "loss": 2.4885, + "step": 202 + }, + { + "epoch": 0.025732031943212066, + "grad_norm": 1.171875, + "learning_rate": 5.585253456221198e-05, + "loss": 2.2706, + "step": 203 + }, + { + "epoch": 0.025858790721257446, + "grad_norm": 1.203125, + "learning_rate": 5.612903225806452e-05, + "loss": 2.5947, + "step": 204 + }, + { + "epoch": 0.02598554949930283, + "grad_norm": 1.140625, + "learning_rate": 5.640552995391705e-05, + "loss": 2.4087, + "step": 205 + }, + { + "epoch": 0.026112308277348208, + "grad_norm": 1.0546875, + "learning_rate": 5.668202764976959e-05, + "loss": 2.2352, + "step": 206 + }, + { + "epoch": 0.026239067055393587, + "grad_norm": 1.2578125, + "learning_rate": 5.695852534562212e-05, + "loss": 2.4017, + "step": 207 + }, + { + "epoch": 0.026365825833438966, + "grad_norm": 1.265625, + "learning_rate": 5.7235023041474656e-05, + "loss": 2.1867, + "step": 208 + }, + { + "epoch": 0.026492584611484345, + "grad_norm": 1.0703125, + "learning_rate": 5.7511520737327186e-05, + "loss": 2.9177, + "step": 209 + }, + { + "epoch": 0.026619343389529725, + "grad_norm": 1.1484375, + "learning_rate": 5.778801843317973e-05, + "loss": 2.3404, + "step": 210 + }, + { + "epoch": 0.026746102167575104, + "grad_norm": 1.015625, + "learning_rate": 5.806451612903226e-05, + "loss": 2.7923, + "step": 211 + }, + { + "epoch": 0.026872860945620483, + "grad_norm": 1.046875, + "learning_rate": 5.8341013824884796e-05, + "loss": 2.4418, + "step": 212 + }, + { + "epoch": 0.026999619723665862, + "grad_norm": 1.1171875, + "learning_rate": 5.8617511520737325e-05, + "loss": 2.441, + "step": 213 + }, + { + "epoch": 0.027126378501711245, + "grad_norm": 1.0859375, + "learning_rate": 5.889400921658986e-05, + "loss": 2.1844, + "step": 214 + }, + { + "epoch": 0.027253137279756624, + "grad_norm": 1.34375, + "learning_rate": 5.917050691244239e-05, + "loss": 2.1207, + "step": 215 + }, + { + "epoch": 0.027379896057802004, + "grad_norm": 0.96875, + "learning_rate": 5.9447004608294935e-05, + "loss": 2.3711, + "step": 216 + }, + { + "epoch": 0.027506654835847383, + "grad_norm": 1.03125, + "learning_rate": 5.9723502304147465e-05, + "loss": 2.2324, + "step": 217 + }, + { + "epoch": 0.027633413613892762, + "grad_norm": 1.1328125, + "learning_rate": 6e-05, + "loss": 2.9252, + "step": 218 + }, + { + "epoch": 0.02776017239193814, + "grad_norm": 1.0546875, + "learning_rate": 6.0276497695852545e-05, + "loss": 2.1201, + "step": 219 + }, + { + "epoch": 0.02788693116998352, + "grad_norm": 1.0234375, + "learning_rate": 6.055299539170507e-05, + "loss": 2.0246, + "step": 220 + }, + { + "epoch": 0.0280136899480289, + "grad_norm": 0.96875, + "learning_rate": 6.082949308755761e-05, + "loss": 2.0742, + "step": 221 + }, + { + "epoch": 0.02814044872607428, + "grad_norm": 0.9296875, + "learning_rate": 6.110599078341014e-05, + "loss": 2.2892, + "step": 222 + }, + { + "epoch": 0.028267207504119662, + "grad_norm": 0.9453125, + "learning_rate": 6.138248847926267e-05, + "loss": 1.9504, + "step": 223 + }, + { + "epoch": 0.02839396628216504, + "grad_norm": 0.8984375, + "learning_rate": 6.165898617511521e-05, + "loss": 2.3808, + "step": 224 + }, + { + "epoch": 0.02852072506021042, + "grad_norm": 1.2109375, + "learning_rate": 6.193548387096774e-05, + "loss": 2.1929, + "step": 225 + }, + { + "epoch": 0.0286474838382558, + "grad_norm": 1.1484375, + "learning_rate": 6.221198156682029e-05, + "loss": 1.7926, + "step": 226 + }, + { + "epoch": 0.02877424261630118, + "grad_norm": 0.9453125, + "learning_rate": 6.24884792626728e-05, + "loss": 1.8952, + "step": 227 + }, + { + "epoch": 0.028901001394346558, + "grad_norm": 1.0859375, + "learning_rate": 6.276497695852535e-05, + "loss": 2.6975, + "step": 228 + }, + { + "epoch": 0.029027760172391937, + "grad_norm": 1.265625, + "learning_rate": 6.304147465437788e-05, + "loss": 2.7235, + "step": 229 + }, + { + "epoch": 0.029154518950437316, + "grad_norm": 1.2265625, + "learning_rate": 6.331797235023042e-05, + "loss": 2.0742, + "step": 230 + }, + { + "epoch": 0.029281277728482696, + "grad_norm": 1.0625, + "learning_rate": 6.359447004608295e-05, + "loss": 2.2087, + "step": 231 + }, + { + "epoch": 0.02940803650652808, + "grad_norm": 1.359375, + "learning_rate": 6.387096774193548e-05, + "loss": 2.2738, + "step": 232 + }, + { + "epoch": 0.029534795284573458, + "grad_norm": 1.09375, + "learning_rate": 6.414746543778802e-05, + "loss": 2.1704, + "step": 233 + }, + { + "epoch": 0.029661554062618837, + "grad_norm": 0.98046875, + "learning_rate": 6.442396313364055e-05, + "loss": 2.0741, + "step": 234 + }, + { + "epoch": 0.029788312840664216, + "grad_norm": 1.03125, + "learning_rate": 6.470046082949308e-05, + "loss": 2.2153, + "step": 235 + }, + { + "epoch": 0.029915071618709595, + "grad_norm": 1.0546875, + "learning_rate": 6.497695852534563e-05, + "loss": 2.4314, + "step": 236 + }, + { + "epoch": 0.030041830396754975, + "grad_norm": 1.171875, + "learning_rate": 6.525345622119816e-05, + "loss": 2.597, + "step": 237 + }, + { + "epoch": 0.030168589174800354, + "grad_norm": 1.0859375, + "learning_rate": 6.55299539170507e-05, + "loss": 2.2135, + "step": 238 + }, + { + "epoch": 0.030295347952845733, + "grad_norm": 1.109375, + "learning_rate": 6.580645161290323e-05, + "loss": 2.0169, + "step": 239 + }, + { + "epoch": 0.030422106730891116, + "grad_norm": 1.265625, + "learning_rate": 6.608294930875576e-05, + "loss": 1.8607, + "step": 240 + }, + { + "epoch": 0.030548865508936495, + "grad_norm": 1.296875, + "learning_rate": 6.63594470046083e-05, + "loss": 2.0917, + "step": 241 + }, + { + "epoch": 0.030675624286981874, + "grad_norm": 1.0703125, + "learning_rate": 6.663594470046083e-05, + "loss": 2.047, + "step": 242 + }, + { + "epoch": 0.030802383065027254, + "grad_norm": 1.625, + "learning_rate": 6.691244239631338e-05, + "loss": 2.3032, + "step": 243 + }, + { + "epoch": 0.030929141843072633, + "grad_norm": 1.0546875, + "learning_rate": 6.718894009216589e-05, + "loss": 2.1386, + "step": 244 + }, + { + "epoch": 0.031055900621118012, + "grad_norm": 1.28125, + "learning_rate": 6.746543778801843e-05, + "loss": 1.9799, + "step": 245 + }, + { + "epoch": 0.03118265939916339, + "grad_norm": 1.015625, + "learning_rate": 6.774193548387098e-05, + "loss": 2.4034, + "step": 246 + }, + { + "epoch": 0.031309418177208774, + "grad_norm": 1.125, + "learning_rate": 6.801843317972351e-05, + "loss": 2.103, + "step": 247 + }, + { + "epoch": 0.03143617695525415, + "grad_norm": 0.93359375, + "learning_rate": 6.829493087557604e-05, + "loss": 2.2768, + "step": 248 + }, + { + "epoch": 0.03156293573329953, + "grad_norm": 1.09375, + "learning_rate": 6.857142857142857e-05, + "loss": 2.4442, + "step": 249 + }, + { + "epoch": 0.03168969451134491, + "grad_norm": 1.125, + "learning_rate": 6.884792626728111e-05, + "loss": 2.193, + "step": 250 + }, + { + "epoch": 0.03181645328939029, + "grad_norm": 0.9609375, + "learning_rate": 6.912442396313364e-05, + "loss": 2.1956, + "step": 251 + }, + { + "epoch": 0.03194321206743567, + "grad_norm": 1.1640625, + "learning_rate": 6.940092165898617e-05, + "loss": 2.3793, + "step": 252 + }, + { + "epoch": 0.03206997084548105, + "grad_norm": 1.25, + "learning_rate": 6.967741935483871e-05, + "loss": 2.2532, + "step": 253 + }, + { + "epoch": 0.03219672962352643, + "grad_norm": 1.3046875, + "learning_rate": 6.995391705069124e-05, + "loss": 2.3969, + "step": 254 + }, + { + "epoch": 0.03232348840157181, + "grad_norm": 1.25, + "learning_rate": 7.023041474654379e-05, + "loss": 2.408, + "step": 255 + }, + { + "epoch": 0.03245024717961719, + "grad_norm": 1.0546875, + "learning_rate": 7.05069124423963e-05, + "loss": 2.004, + "step": 256 + }, + { + "epoch": 0.032577005957662566, + "grad_norm": 1.1875, + "learning_rate": 7.078341013824885e-05, + "loss": 2.2098, + "step": 257 + }, + { + "epoch": 0.03270376473570795, + "grad_norm": 1.015625, + "learning_rate": 7.105990783410139e-05, + "loss": 2.2987, + "step": 258 + }, + { + "epoch": 0.032830523513753325, + "grad_norm": 1.2109375, + "learning_rate": 7.133640552995392e-05, + "loss": 2.3733, + "step": 259 + }, + { + "epoch": 0.03295728229179871, + "grad_norm": 1.0859375, + "learning_rate": 7.161290322580646e-05, + "loss": 2.2915, + "step": 260 + }, + { + "epoch": 0.03308404106984408, + "grad_norm": 1.3125, + "learning_rate": 7.188940092165898e-05, + "loss": 2.0945, + "step": 261 + }, + { + "epoch": 0.033210799847889466, + "grad_norm": 1.0859375, + "learning_rate": 7.216589861751152e-05, + "loss": 2.2548, + "step": 262 + }, + { + "epoch": 0.03333755862593485, + "grad_norm": 0.96875, + "learning_rate": 7.244239631336407e-05, + "loss": 2.9858, + "step": 263 + }, + { + "epoch": 0.033464317403980225, + "grad_norm": 1.0546875, + "learning_rate": 7.27188940092166e-05, + "loss": 2.3404, + "step": 264 + }, + { + "epoch": 0.03359107618202561, + "grad_norm": 1.796875, + "learning_rate": 7.299539170506913e-05, + "loss": 1.5766, + "step": 265 + }, + { + "epoch": 0.03371783496007098, + "grad_norm": 1.0859375, + "learning_rate": 7.327188940092166e-05, + "loss": 1.6851, + "step": 266 + }, + { + "epoch": 0.033844593738116366, + "grad_norm": 1.390625, + "learning_rate": 7.35483870967742e-05, + "loss": 2.1749, + "step": 267 + }, + { + "epoch": 0.03397135251616174, + "grad_norm": 1.078125, + "learning_rate": 7.382488479262673e-05, + "loss": 1.9289, + "step": 268 + }, + { + "epoch": 0.034098111294207124, + "grad_norm": 1.234375, + "learning_rate": 7.410138248847926e-05, + "loss": 2.2994, + "step": 269 + }, + { + "epoch": 0.0342248700722525, + "grad_norm": 0.98046875, + "learning_rate": 7.43778801843318e-05, + "loss": 2.4787, + "step": 270 + }, + { + "epoch": 0.03435162885029788, + "grad_norm": 0.98828125, + "learning_rate": 7.465437788018433e-05, + "loss": 2.7086, + "step": 271 + }, + { + "epoch": 0.034478387628343266, + "grad_norm": 1.25, + "learning_rate": 7.493087557603687e-05, + "loss": 2.8374, + "step": 272 + }, + { + "epoch": 0.03460514640638864, + "grad_norm": 1.265625, + "learning_rate": 7.520737327188939e-05, + "loss": 2.8127, + "step": 273 + }, + { + "epoch": 0.034731905184434024, + "grad_norm": 1.0, + "learning_rate": 7.548387096774193e-05, + "loss": 2.3578, + "step": 274 + }, + { + "epoch": 0.0348586639624794, + "grad_norm": 1.0859375, + "learning_rate": 7.576036866359448e-05, + "loss": 2.3934, + "step": 275 + }, + { + "epoch": 0.03498542274052478, + "grad_norm": 0.96875, + "learning_rate": 7.603686635944701e-05, + "loss": 2.0839, + "step": 276 + }, + { + "epoch": 0.03511218151857016, + "grad_norm": 1.0625, + "learning_rate": 7.631336405529954e-05, + "loss": 2.184, + "step": 277 + }, + { + "epoch": 0.03523894029661554, + "grad_norm": 1.0, + "learning_rate": 7.658986175115207e-05, + "loss": 2.2954, + "step": 278 + }, + { + "epoch": 0.035365699074660924, + "grad_norm": 1.0859375, + "learning_rate": 7.686635944700461e-05, + "loss": 2.5797, + "step": 279 + }, + { + "epoch": 0.0354924578527063, + "grad_norm": 1.1015625, + "learning_rate": 7.714285714285715e-05, + "loss": 2.3425, + "step": 280 + }, + { + "epoch": 0.03561921663075168, + "grad_norm": 1.078125, + "learning_rate": 7.741935483870968e-05, + "loss": 1.8804, + "step": 281 + }, + { + "epoch": 0.03574597540879706, + "grad_norm": 1.0859375, + "learning_rate": 7.769585253456221e-05, + "loss": 2.3378, + "step": 282 + }, + { + "epoch": 0.03587273418684244, + "grad_norm": 1.0625, + "learning_rate": 7.797235023041474e-05, + "loss": 2.2018, + "step": 283 + }, + { + "epoch": 0.035999492964887816, + "grad_norm": 1.0546875, + "learning_rate": 7.824884792626729e-05, + "loss": 2.1572, + "step": 284 + }, + { + "epoch": 0.0361262517429332, + "grad_norm": 1.1484375, + "learning_rate": 7.852534562211983e-05, + "loss": 2.2567, + "step": 285 + }, + { + "epoch": 0.036253010520978575, + "grad_norm": 0.96875, + "learning_rate": 7.880184331797235e-05, + "loss": 2.1081, + "step": 286 + }, + { + "epoch": 0.03637976929902396, + "grad_norm": 1.1484375, + "learning_rate": 7.907834101382489e-05, + "loss": 2.7091, + "step": 287 + }, + { + "epoch": 0.03650652807706934, + "grad_norm": 1.0546875, + "learning_rate": 7.935483870967742e-05, + "loss": 2.6408, + "step": 288 + }, + { + "epoch": 0.036633286855114716, + "grad_norm": 0.96484375, + "learning_rate": 7.963133640552996e-05, + "loss": 2.6255, + "step": 289 + }, + { + "epoch": 0.0367600456331601, + "grad_norm": 0.95703125, + "learning_rate": 7.990783410138248e-05, + "loss": 2.3187, + "step": 290 + }, + { + "epoch": 0.036886804411205475, + "grad_norm": 1.09375, + "learning_rate": 8.018433179723502e-05, + "loss": 2.16, + "step": 291 + }, + { + "epoch": 0.03701356318925086, + "grad_norm": 1.0546875, + "learning_rate": 8.046082949308757e-05, + "loss": 2.4117, + "step": 292 + }, + { + "epoch": 0.03714032196729623, + "grad_norm": 1.1484375, + "learning_rate": 8.07373271889401e-05, + "loss": 1.9142, + "step": 293 + }, + { + "epoch": 0.037267080745341616, + "grad_norm": 1.1796875, + "learning_rate": 8.101382488479262e-05, + "loss": 1.9936, + "step": 294 + }, + { + "epoch": 0.03739383952338699, + "grad_norm": 1.0390625, + "learning_rate": 8.129032258064515e-05, + "loss": 2.1341, + "step": 295 + }, + { + "epoch": 0.037520598301432374, + "grad_norm": 0.95703125, + "learning_rate": 8.15668202764977e-05, + "loss": 2.312, + "step": 296 + }, + { + "epoch": 0.03764735707947776, + "grad_norm": 1.078125, + "learning_rate": 8.184331797235024e-05, + "loss": 2.1323, + "step": 297 + }, + { + "epoch": 0.03777411585752313, + "grad_norm": 0.9921875, + "learning_rate": 8.211981566820277e-05, + "loss": 2.478, + "step": 298 + }, + { + "epoch": 0.037900874635568516, + "grad_norm": 1.046875, + "learning_rate": 8.23963133640553e-05, + "loss": 2.0635, + "step": 299 + }, + { + "epoch": 0.03802763341361389, + "grad_norm": 1.046875, + "learning_rate": 8.267281105990783e-05, + "loss": 2.217, + "step": 300 + }, + { + "epoch": 0.038154392191659274, + "grad_norm": 1.078125, + "learning_rate": 8.294930875576037e-05, + "loss": 2.4008, + "step": 301 + }, + { + "epoch": 0.03828115096970465, + "grad_norm": 0.9375, + "learning_rate": 8.322580645161292e-05, + "loss": 2.3014, + "step": 302 + }, + { + "epoch": 0.03840790974775003, + "grad_norm": 0.9765625, + "learning_rate": 8.350230414746543e-05, + "loss": 2.3554, + "step": 303 + }, + { + "epoch": 0.03853466852579541, + "grad_norm": 1.109375, + "learning_rate": 8.377880184331798e-05, + "loss": 2.1672, + "step": 304 + }, + { + "epoch": 0.03866142730384079, + "grad_norm": 1.03125, + "learning_rate": 8.40552995391705e-05, + "loss": 1.9915, + "step": 305 + }, + { + "epoch": 0.038788186081886174, + "grad_norm": 1.0078125, + "learning_rate": 8.433179723502305e-05, + "loss": 2.3689, + "step": 306 + }, + { + "epoch": 0.03891494485993155, + "grad_norm": 1.0703125, + "learning_rate": 8.460829493087557e-05, + "loss": 2.736, + "step": 307 + }, + { + "epoch": 0.03904170363797693, + "grad_norm": 1.0078125, + "learning_rate": 8.488479262672811e-05, + "loss": 2.3187, + "step": 308 + }, + { + "epoch": 0.03916846241602231, + "grad_norm": 1.515625, + "learning_rate": 8.516129032258065e-05, + "loss": 1.9598, + "step": 309 + }, + { + "epoch": 0.03929522119406769, + "grad_norm": 0.90625, + "learning_rate": 8.543778801843318e-05, + "loss": 2.3678, + "step": 310 + }, + { + "epoch": 0.039421979972113066, + "grad_norm": 1.265625, + "learning_rate": 8.571428571428571e-05, + "loss": 1.8228, + "step": 311 + }, + { + "epoch": 0.03954873875015845, + "grad_norm": 1.03125, + "learning_rate": 8.599078341013824e-05, + "loss": 2.2008, + "step": 312 + }, + { + "epoch": 0.039675497528203825, + "grad_norm": 1.03125, + "learning_rate": 8.626728110599079e-05, + "loss": 2.1278, + "step": 313 + }, + { + "epoch": 0.03980225630624921, + "grad_norm": 1.078125, + "learning_rate": 8.654377880184333e-05, + "loss": 1.8647, + "step": 314 + }, + { + "epoch": 0.03992901508429459, + "grad_norm": 0.91796875, + "learning_rate": 8.682027649769585e-05, + "loss": 2.0758, + "step": 315 + }, + { + "epoch": 0.040055773862339966, + "grad_norm": 1.1484375, + "learning_rate": 8.709677419354839e-05, + "loss": 2.0726, + "step": 316 + }, + { + "epoch": 0.04018253264038535, + "grad_norm": 1.1484375, + "learning_rate": 8.737327188940092e-05, + "loss": 2.6807, + "step": 317 + }, + { + "epoch": 0.040309291418430725, + "grad_norm": 0.98828125, + "learning_rate": 8.764976958525346e-05, + "loss": 2.0432, + "step": 318 + }, + { + "epoch": 0.04043605019647611, + "grad_norm": 0.94921875, + "learning_rate": 8.7926267281106e-05, + "loss": 2.4002, + "step": 319 + }, + { + "epoch": 0.04056280897452148, + "grad_norm": 1.1171875, + "learning_rate": 8.820276497695852e-05, + "loss": 2.0059, + "step": 320 + }, + { + "epoch": 0.040689567752566866, + "grad_norm": 1.078125, + "learning_rate": 8.847926267281106e-05, + "loss": 2.0576, + "step": 321 + }, + { + "epoch": 0.04081632653061224, + "grad_norm": 1.2265625, + "learning_rate": 8.87557603686636e-05, + "loss": 2.389, + "step": 322 + }, + { + "epoch": 0.040943085308657624, + "grad_norm": 1.0625, + "learning_rate": 8.903225806451614e-05, + "loss": 2.0016, + "step": 323 + }, + { + "epoch": 0.04106984408670301, + "grad_norm": 1.015625, + "learning_rate": 8.930875576036867e-05, + "loss": 1.918, + "step": 324 + }, + { + "epoch": 0.04119660286474838, + "grad_norm": 1.40625, + "learning_rate": 8.95852534562212e-05, + "loss": 1.5321, + "step": 325 + }, + { + "epoch": 0.041323361642793766, + "grad_norm": 0.93359375, + "learning_rate": 8.986175115207374e-05, + "loss": 1.9709, + "step": 326 + }, + { + "epoch": 0.04145012042083914, + "grad_norm": 1.0625, + "learning_rate": 9.013824884792627e-05, + "loss": 1.9838, + "step": 327 + }, + { + "epoch": 0.041576879198884524, + "grad_norm": 1.1015625, + "learning_rate": 9.04147465437788e-05, + "loss": 2.0751, + "step": 328 + }, + { + "epoch": 0.0417036379769299, + "grad_norm": 1.25, + "learning_rate": 9.069124423963133e-05, + "loss": 2.2254, + "step": 329 + }, + { + "epoch": 0.04183039675497528, + "grad_norm": 0.9453125, + "learning_rate": 9.096774193548387e-05, + "loss": 1.9971, + "step": 330 + }, + { + "epoch": 0.04195715553302066, + "grad_norm": 1.828125, + "learning_rate": 9.124423963133642e-05, + "loss": 2.3412, + "step": 331 + }, + { + "epoch": 0.04208391431106604, + "grad_norm": 0.9140625, + "learning_rate": 9.152073732718893e-05, + "loss": 2.3622, + "step": 332 + }, + { + "epoch": 0.042210673089111424, + "grad_norm": 1.1015625, + "learning_rate": 9.179723502304148e-05, + "loss": 2.0821, + "step": 333 + }, + { + "epoch": 0.0423374318671568, + "grad_norm": 0.87890625, + "learning_rate": 9.2073732718894e-05, + "loss": 2.4345, + "step": 334 + }, + { + "epoch": 0.04246419064520218, + "grad_norm": 0.9609375, + "learning_rate": 9.235023041474655e-05, + "loss": 2.0549, + "step": 335 + }, + { + "epoch": 0.04259094942324756, + "grad_norm": 1.0390625, + "learning_rate": 9.262672811059908e-05, + "loss": 2.4581, + "step": 336 + }, + { + "epoch": 0.04271770820129294, + "grad_norm": 1.0859375, + "learning_rate": 9.290322580645161e-05, + "loss": 2.3993, + "step": 337 + }, + { + "epoch": 0.042844466979338316, + "grad_norm": 1.2890625, + "learning_rate": 9.317972350230415e-05, + "loss": 2.5258, + "step": 338 + }, + { + "epoch": 0.0429712257573837, + "grad_norm": 0.98828125, + "learning_rate": 9.345622119815668e-05, + "loss": 2.0652, + "step": 339 + }, + { + "epoch": 0.04309798453542908, + "grad_norm": 1.0234375, + "learning_rate": 9.373271889400923e-05, + "loss": 2.0846, + "step": 340 + }, + { + "epoch": 0.04322474331347446, + "grad_norm": 0.9375, + "learning_rate": 9.400921658986176e-05, + "loss": 2.193, + "step": 341 + }, + { + "epoch": 0.04335150209151984, + "grad_norm": 0.9765625, + "learning_rate": 9.428571428571429e-05, + "loss": 2.1681, + "step": 342 + }, + { + "epoch": 0.043478260869565216, + "grad_norm": 1.046875, + "learning_rate": 9.456221198156683e-05, + "loss": 2.4051, + "step": 343 + }, + { + "epoch": 0.0436050196476106, + "grad_norm": 1.03125, + "learning_rate": 9.483870967741936e-05, + "loss": 2.7981, + "step": 344 + }, + { + "epoch": 0.043731778425655975, + "grad_norm": 1.171875, + "learning_rate": 9.511520737327189e-05, + "loss": 2.3485, + "step": 345 + }, + { + "epoch": 0.04385853720370136, + "grad_norm": 1.203125, + "learning_rate": 9.539170506912443e-05, + "loss": 2.6022, + "step": 346 + }, + { + "epoch": 0.04398529598174673, + "grad_norm": 1.0078125, + "learning_rate": 9.566820276497696e-05, + "loss": 2.3691, + "step": 347 + }, + { + "epoch": 0.044112054759792116, + "grad_norm": 0.98828125, + "learning_rate": 9.59447004608295e-05, + "loss": 2.1588, + "step": 348 + }, + { + "epoch": 0.0442388135378375, + "grad_norm": 1.125, + "learning_rate": 9.622119815668202e-05, + "loss": 2.2599, + "step": 349 + }, + { + "epoch": 0.044365572315882874, + "grad_norm": 0.98046875, + "learning_rate": 9.649769585253456e-05, + "loss": 2.129, + "step": 350 + }, + { + "epoch": 0.04449233109392826, + "grad_norm": 1.140625, + "learning_rate": 9.67741935483871e-05, + "loss": 2.103, + "step": 351 + }, + { + "epoch": 0.04461908987197363, + "grad_norm": 0.9921875, + "learning_rate": 9.705069124423964e-05, + "loss": 2.165, + "step": 352 + }, + { + "epoch": 0.044745848650019016, + "grad_norm": 0.96484375, + "learning_rate": 9.732718894009217e-05, + "loss": 2.0007, + "step": 353 + }, + { + "epoch": 0.04487260742806439, + "grad_norm": 0.9296875, + "learning_rate": 9.76036866359447e-05, + "loss": 1.9785, + "step": 354 + }, + { + "epoch": 0.044999366206109774, + "grad_norm": 1.2109375, + "learning_rate": 9.788018433179724e-05, + "loss": 2.2277, + "step": 355 + }, + { + "epoch": 0.04512612498415515, + "grad_norm": 1.0703125, + "learning_rate": 9.815668202764977e-05, + "loss": 2.4022, + "step": 356 + }, + { + "epoch": 0.04525288376220053, + "grad_norm": 0.9609375, + "learning_rate": 9.843317972350231e-05, + "loss": 2.4135, + "step": 357 + }, + { + "epoch": 0.045379642540245915, + "grad_norm": 1.109375, + "learning_rate": 9.870967741935484e-05, + "loss": 2.6693, + "step": 358 + }, + { + "epoch": 0.04550640131829129, + "grad_norm": 1.3515625, + "learning_rate": 9.898617511520737e-05, + "loss": 2.6541, + "step": 359 + }, + { + "epoch": 0.045633160096336674, + "grad_norm": 1.2421875, + "learning_rate": 9.926267281105992e-05, + "loss": 2.2509, + "step": 360 + }, + { + "epoch": 0.04575991887438205, + "grad_norm": 1.171875, + "learning_rate": 9.953917050691245e-05, + "loss": 2.212, + "step": 361 + }, + { + "epoch": 0.04588667765242743, + "grad_norm": 1.15625, + "learning_rate": 9.981566820276498e-05, + "loss": 2.0584, + "step": 362 + }, + { + "epoch": 0.04601343643047281, + "grad_norm": 1.0078125, + "learning_rate": 0.00010009216589861752, + "loss": 2.0944, + "step": 363 + }, + { + "epoch": 0.04614019520851819, + "grad_norm": 0.96875, + "learning_rate": 0.00010036866359447005, + "loss": 1.799, + "step": 364 + }, + { + "epoch": 0.046266953986563567, + "grad_norm": 0.97265625, + "learning_rate": 0.00010064516129032259, + "loss": 2.5278, + "step": 365 + }, + { + "epoch": 0.04639371276460895, + "grad_norm": 0.90234375, + "learning_rate": 0.00010092165898617511, + "loss": 2.1668, + "step": 366 + }, + { + "epoch": 0.04652047154265433, + "grad_norm": 1.0234375, + "learning_rate": 0.00010119815668202765, + "loss": 1.8589, + "step": 367 + }, + { + "epoch": 0.04664723032069971, + "grad_norm": 1.1484375, + "learning_rate": 0.00010147465437788018, + "loss": 2.1096, + "step": 368 + }, + { + "epoch": 0.04677398909874509, + "grad_norm": 0.984375, + "learning_rate": 0.00010175115207373273, + "loss": 2.3124, + "step": 369 + }, + { + "epoch": 0.046900747876790466, + "grad_norm": 0.94140625, + "learning_rate": 0.00010202764976958525, + "loss": 2.3002, + "step": 370 + }, + { + "epoch": 0.04702750665483585, + "grad_norm": 0.89453125, + "learning_rate": 0.00010230414746543778, + "loss": 2.5014, + "step": 371 + }, + { + "epoch": 0.047154265432881225, + "grad_norm": 1.0546875, + "learning_rate": 0.00010258064516129033, + "loss": 2.0533, + "step": 372 + }, + { + "epoch": 0.04728102421092661, + "grad_norm": 3.296875, + "learning_rate": 0.00010285714285714286, + "loss": 2.2184, + "step": 373 + }, + { + "epoch": 0.04740778298897198, + "grad_norm": 0.93359375, + "learning_rate": 0.00010313364055299539, + "loss": 2.3654, + "step": 374 + }, + { + "epoch": 0.047534541767017366, + "grad_norm": 0.9765625, + "learning_rate": 0.00010341013824884793, + "loss": 2.5627, + "step": 375 + }, + { + "epoch": 0.04766130054506275, + "grad_norm": 1.0390625, + "learning_rate": 0.00010368663594470046, + "loss": 2.3615, + "step": 376 + }, + { + "epoch": 0.047788059323108124, + "grad_norm": 1.015625, + "learning_rate": 0.000103963133640553, + "loss": 2.1589, + "step": 377 + }, + { + "epoch": 0.04791481810115351, + "grad_norm": 1.1328125, + "learning_rate": 0.00010423963133640553, + "loss": 1.9195, + "step": 378 + }, + { + "epoch": 0.04804157687919888, + "grad_norm": 1.0, + "learning_rate": 0.00010451612903225806, + "loss": 1.9344, + "step": 379 + }, + { + "epoch": 0.048168335657244266, + "grad_norm": 0.953125, + "learning_rate": 0.00010479262672811061, + "loss": 1.9311, + "step": 380 + }, + { + "epoch": 0.04829509443528964, + "grad_norm": 1.015625, + "learning_rate": 0.00010506912442396314, + "loss": 2.2372, + "step": 381 + }, + { + "epoch": 0.048421853213335024, + "grad_norm": 1.1171875, + "learning_rate": 0.00010534562211981568, + "loss": 1.9217, + "step": 382 + }, + { + "epoch": 0.0485486119913804, + "grad_norm": 1.015625, + "learning_rate": 0.0001056221198156682, + "loss": 2.1078, + "step": 383 + }, + { + "epoch": 0.04867537076942578, + "grad_norm": 1.0703125, + "learning_rate": 0.00010589861751152074, + "loss": 2.2819, + "step": 384 + }, + { + "epoch": 0.048802129547471165, + "grad_norm": 0.9765625, + "learning_rate": 0.00010617511520737328, + "loss": 2.2347, + "step": 385 + }, + { + "epoch": 0.04892888832551654, + "grad_norm": 1.1328125, + "learning_rate": 0.00010645161290322581, + "loss": 2.1271, + "step": 386 + }, + { + "epoch": 0.049055647103561924, + "grad_norm": 1.046875, + "learning_rate": 0.00010672811059907834, + "loss": 1.9699, + "step": 387 + }, + { + "epoch": 0.0491824058816073, + "grad_norm": 0.97265625, + "learning_rate": 0.00010700460829493087, + "loss": 2.63, + "step": 388 + }, + { + "epoch": 0.04930916465965268, + "grad_norm": 1.125, + "learning_rate": 0.00010728110599078342, + "loss": 2.3463, + "step": 389 + }, + { + "epoch": 0.04943592343769806, + "grad_norm": 1.046875, + "learning_rate": 0.00010755760368663595, + "loss": 2.006, + "step": 390 + }, + { + "epoch": 0.04956268221574344, + "grad_norm": 0.95703125, + "learning_rate": 0.00010783410138248848, + "loss": 1.9709, + "step": 391 + }, + { + "epoch": 0.049689440993788817, + "grad_norm": 0.98828125, + "learning_rate": 0.00010811059907834102, + "loss": 2.47, + "step": 392 + }, + { + "epoch": 0.0498161997718342, + "grad_norm": 0.96484375, + "learning_rate": 0.00010838709677419355, + "loss": 2.589, + "step": 393 + }, + { + "epoch": 0.04994295854987958, + "grad_norm": 1.078125, + "learning_rate": 0.00010866359447004609, + "loss": 2.0223, + "step": 394 + }, + { + "epoch": 0.05006971732792496, + "grad_norm": 1.1796875, + "learning_rate": 0.00010894009216589861, + "loss": 2.3123, + "step": 395 + }, + { + "epoch": 0.05019647610597034, + "grad_norm": 1.3046875, + "learning_rate": 0.00010921658986175115, + "loss": 2.0089, + "step": 396 + }, + { + "epoch": 0.050323234884015716, + "grad_norm": 1.0546875, + "learning_rate": 0.0001094930875576037, + "loss": 2.6241, + "step": 397 + }, + { + "epoch": 0.0504499936620611, + "grad_norm": 0.90625, + "learning_rate": 0.00010976958525345622, + "loss": 2.1172, + "step": 398 + }, + { + "epoch": 0.050576752440106475, + "grad_norm": 1.046875, + "learning_rate": 0.00011004608294930877, + "loss": 2.4678, + "step": 399 + }, + { + "epoch": 0.05070351121815186, + "grad_norm": 0.890625, + "learning_rate": 0.00011032258064516128, + "loss": 2.5936, + "step": 400 + }, + { + "epoch": 0.05083026999619724, + "grad_norm": 1.046875, + "learning_rate": 0.00011059907834101383, + "loss": 1.7543, + "step": 401 + }, + { + "epoch": 0.050957028774242616, + "grad_norm": 1.2109375, + "learning_rate": 0.00011087557603686637, + "loss": 2.1328, + "step": 402 + }, + { + "epoch": 0.051083787552288, + "grad_norm": 1.140625, + "learning_rate": 0.0001111520737327189, + "loss": 2.5187, + "step": 403 + }, + { + "epoch": 0.051210546330333374, + "grad_norm": 1.0078125, + "learning_rate": 0.00011142857142857143, + "loss": 2.5182, + "step": 404 + }, + { + "epoch": 0.05133730510837876, + "grad_norm": 0.91796875, + "learning_rate": 0.00011170506912442396, + "loss": 2.1758, + "step": 405 + }, + { + "epoch": 0.05146406388642413, + "grad_norm": 1.015625, + "learning_rate": 0.0001119815668202765, + "loss": 2.5022, + "step": 406 + }, + { + "epoch": 0.051590822664469516, + "grad_norm": 1.1171875, + "learning_rate": 0.00011225806451612903, + "loss": 2.5626, + "step": 407 + }, + { + "epoch": 0.05171758144251489, + "grad_norm": 1.1328125, + "learning_rate": 0.00011253456221198156, + "loss": 2.8027, + "step": 408 + }, + { + "epoch": 0.051844340220560274, + "grad_norm": 1.0234375, + "learning_rate": 0.0001128110599078341, + "loss": 2.2778, + "step": 409 + }, + { + "epoch": 0.05197109899860566, + "grad_norm": 1.0625, + "learning_rate": 0.00011308755760368664, + "loss": 1.9029, + "step": 410 + }, + { + "epoch": 0.05209785777665103, + "grad_norm": 0.98046875, + "learning_rate": 0.00011336405529953918, + "loss": 1.8463, + "step": 411 + }, + { + "epoch": 0.052224616554696415, + "grad_norm": 0.90625, + "learning_rate": 0.0001136405529953917, + "loss": 2.2628, + "step": 412 + }, + { + "epoch": 0.05235137533274179, + "grad_norm": 1.5546875, + "learning_rate": 0.00011391705069124424, + "loss": 2.696, + "step": 413 + }, + { + "epoch": 0.052478134110787174, + "grad_norm": 1.1640625, + "learning_rate": 0.00011419354838709678, + "loss": 2.8536, + "step": 414 + }, + { + "epoch": 0.05260489288883255, + "grad_norm": 0.95703125, + "learning_rate": 0.00011447004608294931, + "loss": 2.5778, + "step": 415 + }, + { + "epoch": 0.05273165166687793, + "grad_norm": 1.1328125, + "learning_rate": 0.00011474654377880186, + "loss": 2.0624, + "step": 416 + }, + { + "epoch": 0.05285841044492331, + "grad_norm": 1.0390625, + "learning_rate": 0.00011502304147465437, + "loss": 1.6379, + "step": 417 + }, + { + "epoch": 0.05298516922296869, + "grad_norm": 0.99609375, + "learning_rate": 0.00011529953917050692, + "loss": 2.1152, + "step": 418 + }, + { + "epoch": 0.05311192800101407, + "grad_norm": 2.453125, + "learning_rate": 0.00011557603686635946, + "loss": 2.7291, + "step": 419 + }, + { + "epoch": 0.05323868677905945, + "grad_norm": 0.97265625, + "learning_rate": 0.00011585253456221199, + "loss": 2.3185, + "step": 420 + }, + { + "epoch": 0.05336544555710483, + "grad_norm": 1.0390625, + "learning_rate": 0.00011612903225806452, + "loss": 1.657, + "step": 421 + }, + { + "epoch": 0.05349220433515021, + "grad_norm": 0.9609375, + "learning_rate": 0.00011640552995391705, + "loss": 2.2992, + "step": 422 + }, + { + "epoch": 0.05361896311319559, + "grad_norm": 0.97265625, + "learning_rate": 0.00011668202764976959, + "loss": 2.1632, + "step": 423 + }, + { + "epoch": 0.053745721891240966, + "grad_norm": 1.171875, + "learning_rate": 0.00011695852534562213, + "loss": 2.1941, + "step": 424 + }, + { + "epoch": 0.05387248066928635, + "grad_norm": 1.0703125, + "learning_rate": 0.00011723502304147465, + "loss": 2.4234, + "step": 425 + }, + { + "epoch": 0.053999239447331725, + "grad_norm": 0.98828125, + "learning_rate": 0.0001175115207373272, + "loss": 2.2055, + "step": 426 + }, + { + "epoch": 0.05412599822537711, + "grad_norm": 1.1328125, + "learning_rate": 0.00011778801843317972, + "loss": 2.3199, + "step": 427 + }, + { + "epoch": 0.05425275700342249, + "grad_norm": 0.96484375, + "learning_rate": 0.00011806451612903227, + "loss": 2.2118, + "step": 428 + }, + { + "epoch": 0.054379515781467866, + "grad_norm": 1.1796875, + "learning_rate": 0.00011834101382488478, + "loss": 2.4926, + "step": 429 + }, + { + "epoch": 0.05450627455951325, + "grad_norm": 0.89453125, + "learning_rate": 0.00011861751152073733, + "loss": 2.5658, + "step": 430 + }, + { + "epoch": 0.054633033337558624, + "grad_norm": 0.99609375, + "learning_rate": 0.00011889400921658987, + "loss": 1.9025, + "step": 431 + }, + { + "epoch": 0.05475979211560401, + "grad_norm": 0.8984375, + "learning_rate": 0.0001191705069124424, + "loss": 1.9359, + "step": 432 + }, + { + "epoch": 0.05488655089364938, + "grad_norm": 0.90625, + "learning_rate": 0.00011944700460829493, + "loss": 2.6443, + "step": 433 + }, + { + "epoch": 0.055013309671694766, + "grad_norm": 1.0859375, + "learning_rate": 0.00011972350230414746, + "loss": 2.2787, + "step": 434 + }, + { + "epoch": 0.05514006844974014, + "grad_norm": 1.0, + "learning_rate": 0.00012, + "loss": 1.9707, + "step": 435 + }, + { + "epoch": 0.055266827227785524, + "grad_norm": 0.8984375, + "learning_rate": 0.00011999999564342667, + "loss": 2.3204, + "step": 436 + }, + { + "epoch": 0.05539358600583091, + "grad_norm": 1.09375, + "learning_rate": 0.0001199999825737073, + "loss": 2.1407, + "step": 437 + }, + { + "epoch": 0.05552034478387628, + "grad_norm": 0.90234375, + "learning_rate": 0.00011999996079084382, + "loss": 2.2962, + "step": 438 + }, + { + "epoch": 0.055647103561921665, + "grad_norm": 1.0703125, + "learning_rate": 0.00011999993029483935, + "loss": 2.325, + "step": 439 + }, + { + "epoch": 0.05577386233996704, + "grad_norm": 1.078125, + "learning_rate": 0.00011999989108569834, + "loss": 2.3317, + "step": 440 + }, + { + "epoch": 0.055900621118012424, + "grad_norm": 2.609375, + "learning_rate": 0.0001199998431634265, + "loss": 2.0142, + "step": 441 + }, + { + "epoch": 0.0560273798960578, + "grad_norm": 1.1015625, + "learning_rate": 0.00011999978652803075, + "loss": 2.8675, + "step": 442 + }, + { + "epoch": 0.05615413867410318, + "grad_norm": 1.1171875, + "learning_rate": 0.00011999972117951936, + "loss": 2.1372, + "step": 443 + }, + { + "epoch": 0.05628089745214856, + "grad_norm": 1.0234375, + "learning_rate": 0.0001199996471179018, + "loss": 2.1987, + "step": 444 + }, + { + "epoch": 0.05640765623019394, + "grad_norm": 0.9140625, + "learning_rate": 0.00011999956434318879, + "loss": 2.0152, + "step": 445 + }, + { + "epoch": 0.056534415008239323, + "grad_norm": 1.0234375, + "learning_rate": 0.00011999947285539242, + "loss": 2.7334, + "step": 446 + }, + { + "epoch": 0.0566611737862847, + "grad_norm": 0.953125, + "learning_rate": 0.00011999937265452592, + "loss": 2.0989, + "step": 447 + }, + { + "epoch": 0.05678793256433008, + "grad_norm": 0.98046875, + "learning_rate": 0.00011999926374060386, + "loss": 1.8519, + "step": 448 + }, + { + "epoch": 0.05691469134237546, + "grad_norm": 0.93359375, + "learning_rate": 0.00011999914611364205, + "loss": 1.8668, + "step": 449 + }, + { + "epoch": 0.05704145012042084, + "grad_norm": 0.94921875, + "learning_rate": 0.00011999901977365759, + "loss": 2.182, + "step": 450 + }, + { + "epoch": 0.057168208898466216, + "grad_norm": 1.0078125, + "learning_rate": 0.0001199988847206688, + "loss": 2.5253, + "step": 451 + }, + { + "epoch": 0.0572949676765116, + "grad_norm": 0.98828125, + "learning_rate": 0.00011999874095469532, + "loss": 1.9986, + "step": 452 + }, + { + "epoch": 0.057421726454556975, + "grad_norm": 1.046875, + "learning_rate": 0.000119998588475758, + "loss": 2.2526, + "step": 453 + }, + { + "epoch": 0.05754848523260236, + "grad_norm": 1.015625, + "learning_rate": 0.00011999842728387901, + "loss": 2.3086, + "step": 454 + }, + { + "epoch": 0.05767524401064774, + "grad_norm": 1.0390625, + "learning_rate": 0.00011999825737908175, + "loss": 2.2628, + "step": 455 + }, + { + "epoch": 0.057802002788693116, + "grad_norm": 1.0390625, + "learning_rate": 0.00011999807876139088, + "loss": 2.4269, + "step": 456 + }, + { + "epoch": 0.0579287615667385, + "grad_norm": 1.0078125, + "learning_rate": 0.00011999789143083236, + "loss": 2.2286, + "step": 457 + }, + { + "epoch": 0.058055520344783874, + "grad_norm": 1.0, + "learning_rate": 0.00011999769538743336, + "loss": 1.9248, + "step": 458 + }, + { + "epoch": 0.05818227912282926, + "grad_norm": 0.9921875, + "learning_rate": 0.00011999749063122237, + "loss": 2.3115, + "step": 459 + }, + { + "epoch": 0.05830903790087463, + "grad_norm": 1.03125, + "learning_rate": 0.00011999727716222914, + "loss": 2.4456, + "step": 460 + }, + { + "epoch": 0.058435796678920016, + "grad_norm": 0.96875, + "learning_rate": 0.00011999705498048465, + "loss": 2.441, + "step": 461 + }, + { + "epoch": 0.05856255545696539, + "grad_norm": 1.453125, + "learning_rate": 0.00011999682408602119, + "loss": 2.6528, + "step": 462 + }, + { + "epoch": 0.058689314235010774, + "grad_norm": 1.015625, + "learning_rate": 0.00011999658447887225, + "loss": 1.8711, + "step": 463 + }, + { + "epoch": 0.05881607301305616, + "grad_norm": 1.09375, + "learning_rate": 0.00011999633615907265, + "loss": 2.5684, + "step": 464 + }, + { + "epoch": 0.05894283179110153, + "grad_norm": 1.09375, + "learning_rate": 0.00011999607912665845, + "loss": 2.3015, + "step": 465 + }, + { + "epoch": 0.059069590569146915, + "grad_norm": 1.078125, + "learning_rate": 0.00011999581338166696, + "loss": 2.2543, + "step": 466 + }, + { + "epoch": 0.05919634934719229, + "grad_norm": 0.9375, + "learning_rate": 0.00011999553892413681, + "loss": 2.1107, + "step": 467 + }, + { + "epoch": 0.059323108125237674, + "grad_norm": 1.0859375, + "learning_rate": 0.00011999525575410781, + "loss": 2.3834, + "step": 468 + }, + { + "epoch": 0.05944986690328305, + "grad_norm": 0.9375, + "learning_rate": 0.0001199949638716211, + "loss": 2.1325, + "step": 469 + }, + { + "epoch": 0.05957662568132843, + "grad_norm": 1.765625, + "learning_rate": 0.00011999466327671907, + "loss": 2.8059, + "step": 470 + }, + { + "epoch": 0.059703384459373815, + "grad_norm": 0.9765625, + "learning_rate": 0.00011999435396944537, + "loss": 2.0948, + "step": 471 + }, + { + "epoch": 0.05983014323741919, + "grad_norm": 0.87109375, + "learning_rate": 0.00011999403594984492, + "loss": 2.1935, + "step": 472 + }, + { + "epoch": 0.059956902015464573, + "grad_norm": 1.0625, + "learning_rate": 0.00011999370921796391, + "loss": 1.9857, + "step": 473 + }, + { + "epoch": 0.06008366079350995, + "grad_norm": 1.125, + "learning_rate": 0.00011999337377384979, + "loss": 2.4936, + "step": 474 + }, + { + "epoch": 0.06021041957155533, + "grad_norm": 0.99609375, + "learning_rate": 0.00011999302961755125, + "loss": 2.361, + "step": 475 + }, + { + "epoch": 0.06033717834960071, + "grad_norm": 1.015625, + "learning_rate": 0.00011999267674911826, + "loss": 2.0592, + "step": 476 + }, + { + "epoch": 0.06046393712764609, + "grad_norm": 0.94921875, + "learning_rate": 0.0001199923151686021, + "loss": 2.2916, + "step": 477 + }, + { + "epoch": 0.060590695905691466, + "grad_norm": 1.0390625, + "learning_rate": 0.00011999194487605526, + "loss": 2.1491, + "step": 478 + }, + { + "epoch": 0.06071745468373685, + "grad_norm": 0.95703125, + "learning_rate": 0.00011999156587153153, + "loss": 2.0982, + "step": 479 + }, + { + "epoch": 0.06084421346178223, + "grad_norm": 1.0234375, + "learning_rate": 0.00011999117815508591, + "loss": 2.0827, + "step": 480 + }, + { + "epoch": 0.06097097223982761, + "grad_norm": 0.91796875, + "learning_rate": 0.00011999078172677474, + "loss": 2.1167, + "step": 481 + }, + { + "epoch": 0.06109773101787299, + "grad_norm": 0.859375, + "learning_rate": 0.00011999037658665559, + "loss": 1.9961, + "step": 482 + }, + { + "epoch": 0.061224489795918366, + "grad_norm": 0.9765625, + "learning_rate": 0.00011998996273478724, + "loss": 2.4519, + "step": 483 + }, + { + "epoch": 0.06135124857396375, + "grad_norm": 1.03125, + "learning_rate": 0.00011998954017122987, + "loss": 2.3454, + "step": 484 + }, + { + "epoch": 0.061478007352009124, + "grad_norm": 0.98828125, + "learning_rate": 0.00011998910889604478, + "loss": 2.5932, + "step": 485 + }, + { + "epoch": 0.06160476613005451, + "grad_norm": 0.8515625, + "learning_rate": 0.00011998866890929464, + "loss": 2.3735, + "step": 486 + }, + { + "epoch": 0.06173152490809988, + "grad_norm": 0.9921875, + "learning_rate": 0.00011998822021104332, + "loss": 2.9586, + "step": 487 + }, + { + "epoch": 0.061858283686145266, + "grad_norm": 0.83203125, + "learning_rate": 0.00011998776280135599, + "loss": 2.3048, + "step": 488 + }, + { + "epoch": 0.06198504246419065, + "grad_norm": 1.015625, + "learning_rate": 0.00011998729668029908, + "loss": 2.1469, + "step": 489 + }, + { + "epoch": 0.062111801242236024, + "grad_norm": 0.90234375, + "learning_rate": 0.00011998682184794025, + "loss": 2.1199, + "step": 490 + }, + { + "epoch": 0.06223856002028141, + "grad_norm": 1.015625, + "learning_rate": 0.00011998633830434847, + "loss": 2.0902, + "step": 491 + }, + { + "epoch": 0.06236531879832678, + "grad_norm": 0.96484375, + "learning_rate": 0.000119985846049594, + "loss": 2.0548, + "step": 492 + }, + { + "epoch": 0.062492077576372165, + "grad_norm": 0.90234375, + "learning_rate": 0.00011998534508374828, + "loss": 2.055, + "step": 493 + }, + { + "epoch": 0.06261883635441755, + "grad_norm": 0.99609375, + "learning_rate": 0.00011998483540688406, + "loss": 2.606, + "step": 494 + }, + { + "epoch": 0.06274559513246292, + "grad_norm": 1.09375, + "learning_rate": 0.00011998431701907537, + "loss": 1.7809, + "step": 495 + }, + { + "epoch": 0.0628723539105083, + "grad_norm": 1.15625, + "learning_rate": 0.00011998378992039749, + "loss": 2.2643, + "step": 496 + }, + { + "epoch": 0.06299911268855368, + "grad_norm": 1.1015625, + "learning_rate": 0.00011998325411092697, + "loss": 1.9097, + "step": 497 + }, + { + "epoch": 0.06312587146659907, + "grad_norm": 1.03125, + "learning_rate": 0.00011998270959074158, + "loss": 2.3142, + "step": 498 + }, + { + "epoch": 0.06325263024464445, + "grad_norm": 0.9765625, + "learning_rate": 0.00011998215635992044, + "loss": 2.7631, + "step": 499 + }, + { + "epoch": 0.06337938902268982, + "grad_norm": 1.046875, + "learning_rate": 0.00011998159441854389, + "loss": 2.0316, + "step": 500 + }, + { + "epoch": 0.0635061478007352, + "grad_norm": 0.9609375, + "learning_rate": 0.0001199810237666935, + "loss": 1.8626, + "step": 501 + }, + { + "epoch": 0.06363290657878058, + "grad_norm": 1.0859375, + "learning_rate": 0.00011998044440445218, + "loss": 3.0385, + "step": 502 + }, + { + "epoch": 0.06375966535682596, + "grad_norm": 1.0234375, + "learning_rate": 0.00011997985633190403, + "loss": 2.4948, + "step": 503 + }, + { + "epoch": 0.06388642413487133, + "grad_norm": 0.95703125, + "learning_rate": 0.00011997925954913445, + "loss": 2.323, + "step": 504 + }, + { + "epoch": 0.06401318291291672, + "grad_norm": 1.0, + "learning_rate": 0.00011997865405623012, + "loss": 2.3155, + "step": 505 + }, + { + "epoch": 0.0641399416909621, + "grad_norm": 1.0703125, + "learning_rate": 0.00011997803985327898, + "loss": 2.2917, + "step": 506 + }, + { + "epoch": 0.06426670046900748, + "grad_norm": 0.9453125, + "learning_rate": 0.00011997741694037022, + "loss": 2.2503, + "step": 507 + }, + { + "epoch": 0.06439345924705286, + "grad_norm": 0.94921875, + "learning_rate": 0.00011997678531759427, + "loss": 2.3282, + "step": 508 + }, + { + "epoch": 0.06452021802509823, + "grad_norm": 0.97265625, + "learning_rate": 0.00011997614498504287, + "loss": 2.0595, + "step": 509 + }, + { + "epoch": 0.06464697680314362, + "grad_norm": 0.93359375, + "learning_rate": 0.00011997549594280903, + "loss": 1.6902, + "step": 510 + }, + { + "epoch": 0.064773735581189, + "grad_norm": 0.921875, + "learning_rate": 0.00011997483819098696, + "loss": 2.2141, + "step": 511 + }, + { + "epoch": 0.06490049435923438, + "grad_norm": 0.99609375, + "learning_rate": 0.0001199741717296722, + "loss": 1.9506, + "step": 512 + }, + { + "epoch": 0.06502725313727975, + "grad_norm": 1.03125, + "learning_rate": 0.00011997349655896156, + "loss": 2.2917, + "step": 513 + }, + { + "epoch": 0.06515401191532513, + "grad_norm": 0.9296875, + "learning_rate": 0.00011997281267895306, + "loss": 1.6998, + "step": 514 + }, + { + "epoch": 0.06528077069337052, + "grad_norm": 1.0, + "learning_rate": 0.00011997212008974602, + "loss": 1.9828, + "step": 515 + }, + { + "epoch": 0.0654075294714159, + "grad_norm": 0.88671875, + "learning_rate": 0.00011997141879144099, + "loss": 2.0288, + "step": 516 + }, + { + "epoch": 0.06553428824946128, + "grad_norm": 0.96875, + "learning_rate": 0.00011997070878413985, + "loss": 2.0933, + "step": 517 + }, + { + "epoch": 0.06566104702750665, + "grad_norm": 1.0234375, + "learning_rate": 0.0001199699900679457, + "loss": 1.6551, + "step": 518 + }, + { + "epoch": 0.06578780580555203, + "grad_norm": 1.0234375, + "learning_rate": 0.00011996926264296288, + "loss": 2.3048, + "step": 519 + }, + { + "epoch": 0.06591456458359742, + "grad_norm": 1.015625, + "learning_rate": 0.00011996852650929706, + "loss": 1.6582, + "step": 520 + }, + { + "epoch": 0.0660413233616428, + "grad_norm": 0.953125, + "learning_rate": 0.00011996778166705514, + "loss": 2.3414, + "step": 521 + }, + { + "epoch": 0.06616808213968817, + "grad_norm": 1.0859375, + "learning_rate": 0.00011996702811634526, + "loss": 1.8715, + "step": 522 + }, + { + "epoch": 0.06629484091773355, + "grad_norm": 1.09375, + "learning_rate": 0.00011996626585727685, + "loss": 2.2727, + "step": 523 + }, + { + "epoch": 0.06642159969577893, + "grad_norm": 0.94140625, + "learning_rate": 0.00011996549488996066, + "loss": 2.2925, + "step": 524 + }, + { + "epoch": 0.06654835847382432, + "grad_norm": 1.1171875, + "learning_rate": 0.00011996471521450858, + "loss": 2.1146, + "step": 525 + }, + { + "epoch": 0.0666751172518697, + "grad_norm": 1.1015625, + "learning_rate": 0.00011996392683103387, + "loss": 2.3482, + "step": 526 + }, + { + "epoch": 0.06680187602991507, + "grad_norm": 1.03125, + "learning_rate": 0.00011996312973965099, + "loss": 2.6984, + "step": 527 + }, + { + "epoch": 0.06692863480796045, + "grad_norm": 1.0234375, + "learning_rate": 0.00011996232394047575, + "loss": 2.4002, + "step": 528 + }, + { + "epoch": 0.06705539358600583, + "grad_norm": 0.99609375, + "learning_rate": 0.00011996150943362511, + "loss": 1.9119, + "step": 529 + }, + { + "epoch": 0.06718215236405121, + "grad_norm": 0.9296875, + "learning_rate": 0.00011996068621921738, + "loss": 2.3112, + "step": 530 + }, + { + "epoch": 0.06730891114209658, + "grad_norm": 0.921875, + "learning_rate": 0.0001199598542973721, + "loss": 2.1442, + "step": 531 + }, + { + "epoch": 0.06743566992014197, + "grad_norm": 0.9375, + "learning_rate": 0.00011995901366821007, + "loss": 2.1507, + "step": 532 + }, + { + "epoch": 0.06756242869818735, + "grad_norm": 0.81640625, + "learning_rate": 0.00011995816433185337, + "loss": 1.9828, + "step": 533 + }, + { + "epoch": 0.06768918747623273, + "grad_norm": 0.9296875, + "learning_rate": 0.00011995730628842537, + "loss": 2.3836, + "step": 534 + }, + { + "epoch": 0.06781594625427811, + "grad_norm": 0.8671875, + "learning_rate": 0.00011995643953805062, + "loss": 1.911, + "step": 535 + }, + { + "epoch": 0.06794270503232348, + "grad_norm": 0.85546875, + "learning_rate": 0.00011995556408085505, + "loss": 1.9752, + "step": 536 + }, + { + "epoch": 0.06806946381036887, + "grad_norm": 1.09375, + "learning_rate": 0.00011995467991696576, + "loss": 2.4653, + "step": 537 + }, + { + "epoch": 0.06819622258841425, + "grad_norm": 1.078125, + "learning_rate": 0.00011995378704651113, + "loss": 2.3855, + "step": 538 + }, + { + "epoch": 0.06832298136645963, + "grad_norm": 1.1484375, + "learning_rate": 0.00011995288546962085, + "loss": 2.1457, + "step": 539 + }, + { + "epoch": 0.068449740144505, + "grad_norm": 1.2109375, + "learning_rate": 0.00011995197518642582, + "loss": 2.0958, + "step": 540 + }, + { + "epoch": 0.06857649892255038, + "grad_norm": 0.97265625, + "learning_rate": 0.00011995105619705828, + "loss": 2.4348, + "step": 541 + }, + { + "epoch": 0.06870325770059577, + "grad_norm": 1.09375, + "learning_rate": 0.00011995012850165164, + "loss": 1.7981, + "step": 542 + }, + { + "epoch": 0.06883001647864115, + "grad_norm": 0.99609375, + "learning_rate": 0.00011994919210034063, + "loss": 1.9514, + "step": 543 + }, + { + "epoch": 0.06895677525668653, + "grad_norm": 0.94140625, + "learning_rate": 0.00011994824699326122, + "loss": 2.7192, + "step": 544 + }, + { + "epoch": 0.0690835340347319, + "grad_norm": 0.9296875, + "learning_rate": 0.00011994729318055069, + "loss": 2.3608, + "step": 545 + }, + { + "epoch": 0.06921029281277728, + "grad_norm": 0.96875, + "learning_rate": 0.00011994633066234753, + "loss": 2.338, + "step": 546 + }, + { + "epoch": 0.06933705159082267, + "grad_norm": 0.9921875, + "learning_rate": 0.00011994535943879152, + "loss": 1.9992, + "step": 547 + }, + { + "epoch": 0.06946381036886805, + "grad_norm": 1.015625, + "learning_rate": 0.00011994437951002371, + "loss": 1.6627, + "step": 548 + }, + { + "epoch": 0.06959056914691342, + "grad_norm": 0.921875, + "learning_rate": 0.00011994339087618638, + "loss": 2.2918, + "step": 549 + }, + { + "epoch": 0.0697173279249588, + "grad_norm": 0.875, + "learning_rate": 0.00011994239353742312, + "loss": 1.7978, + "step": 550 + }, + { + "epoch": 0.06984408670300418, + "grad_norm": 0.98046875, + "learning_rate": 0.00011994138749387876, + "loss": 2.1075, + "step": 551 + }, + { + "epoch": 0.06997084548104957, + "grad_norm": 0.9609375, + "learning_rate": 0.00011994037274569938, + "loss": 2.4512, + "step": 552 + }, + { + "epoch": 0.07009760425909495, + "grad_norm": 0.9375, + "learning_rate": 0.00011993934929303237, + "loss": 1.7422, + "step": 553 + }, + { + "epoch": 0.07022436303714032, + "grad_norm": 0.9921875, + "learning_rate": 0.00011993831713602633, + "loss": 2.8057, + "step": 554 + }, + { + "epoch": 0.0703511218151857, + "grad_norm": 1.046875, + "learning_rate": 0.00011993727627483116, + "loss": 2.0537, + "step": 555 + }, + { + "epoch": 0.07047788059323108, + "grad_norm": 1.046875, + "learning_rate": 0.000119936226709598, + "loss": 2.1056, + "step": 556 + }, + { + "epoch": 0.07060463937127646, + "grad_norm": 0.953125, + "learning_rate": 0.00011993516844047931, + "loss": 2.4015, + "step": 557 + }, + { + "epoch": 0.07073139814932185, + "grad_norm": 1.0078125, + "learning_rate": 0.00011993410146762871, + "loss": 1.9852, + "step": 558 + }, + { + "epoch": 0.07085815692736722, + "grad_norm": 1.1796875, + "learning_rate": 0.00011993302579120118, + "loss": 2.0538, + "step": 559 + }, + { + "epoch": 0.0709849157054126, + "grad_norm": 1.1875, + "learning_rate": 0.00011993194141135293, + "loss": 2.4099, + "step": 560 + }, + { + "epoch": 0.07111167448345798, + "grad_norm": 0.84765625, + "learning_rate": 0.0001199308483282414, + "loss": 2.1438, + "step": 561 + }, + { + "epoch": 0.07123843326150336, + "grad_norm": 0.94921875, + "learning_rate": 0.00011992974654202539, + "loss": 1.7242, + "step": 562 + }, + { + "epoch": 0.07136519203954873, + "grad_norm": 0.8984375, + "learning_rate": 0.00011992863605286483, + "loss": 1.9941, + "step": 563 + }, + { + "epoch": 0.07149195081759412, + "grad_norm": 1.0078125, + "learning_rate": 0.00011992751686092103, + "loss": 2.4498, + "step": 564 + }, + { + "epoch": 0.0716187095956395, + "grad_norm": 0.98828125, + "learning_rate": 0.00011992638896635651, + "loss": 1.9882, + "step": 565 + }, + { + "epoch": 0.07174546837368488, + "grad_norm": 0.94921875, + "learning_rate": 0.00011992525236933504, + "loss": 2.1906, + "step": 566 + }, + { + "epoch": 0.07187222715173026, + "grad_norm": 0.9921875, + "learning_rate": 0.00011992410707002168, + "loss": 2.5371, + "step": 567 + }, + { + "epoch": 0.07199898592977563, + "grad_norm": 0.9609375, + "learning_rate": 0.0001199229530685828, + "loss": 2.6712, + "step": 568 + }, + { + "epoch": 0.07212574470782102, + "grad_norm": 0.95703125, + "learning_rate": 0.00011992179036518592, + "loss": 2.1417, + "step": 569 + }, + { + "epoch": 0.0722525034858664, + "grad_norm": 1.671875, + "learning_rate": 0.0001199206189599999, + "loss": 2.8154, + "step": 570 + }, + { + "epoch": 0.07237926226391178, + "grad_norm": 1.0078125, + "learning_rate": 0.00011991943885319489, + "loss": 2.2032, + "step": 571 + }, + { + "epoch": 0.07250602104195715, + "grad_norm": 1.0703125, + "learning_rate": 0.00011991825004494222, + "loss": 2.1526, + "step": 572 + }, + { + "epoch": 0.07263277982000253, + "grad_norm": 0.86328125, + "learning_rate": 0.00011991705253541455, + "loss": 1.9459, + "step": 573 + }, + { + "epoch": 0.07275953859804792, + "grad_norm": 0.84765625, + "learning_rate": 0.00011991584632478576, + "loss": 1.9034, + "step": 574 + }, + { + "epoch": 0.0728862973760933, + "grad_norm": 1.0, + "learning_rate": 0.00011991463141323103, + "loss": 2.1874, + "step": 575 + }, + { + "epoch": 0.07301305615413868, + "grad_norm": 0.97265625, + "learning_rate": 0.00011991340780092679, + "loss": 2.4372, + "step": 576 + }, + { + "epoch": 0.07313981493218405, + "grad_norm": 0.89453125, + "learning_rate": 0.00011991217548805074, + "loss": 2.0467, + "step": 577 + }, + { + "epoch": 0.07326657371022943, + "grad_norm": 1.015625, + "learning_rate": 0.00011991093447478183, + "loss": 2.1048, + "step": 578 + }, + { + "epoch": 0.07339333248827482, + "grad_norm": 1.046875, + "learning_rate": 0.00011990968476130024, + "loss": 2.3717, + "step": 579 + }, + { + "epoch": 0.0735200912663202, + "grad_norm": 1.1015625, + "learning_rate": 0.00011990842634778751, + "loss": 2.0886, + "step": 580 + }, + { + "epoch": 0.07364685004436557, + "grad_norm": 1.0078125, + "learning_rate": 0.00011990715923442637, + "loss": 1.8821, + "step": 581 + }, + { + "epoch": 0.07377360882241095, + "grad_norm": 0.92578125, + "learning_rate": 0.0001199058834214008, + "loss": 1.9976, + "step": 582 + }, + { + "epoch": 0.07390036760045633, + "grad_norm": 0.9453125, + "learning_rate": 0.0001199045989088961, + "loss": 2.4496, + "step": 583 + }, + { + "epoch": 0.07402712637850171, + "grad_norm": 1.2109375, + "learning_rate": 0.0001199033056970988, + "loss": 2.4148, + "step": 584 + }, + { + "epoch": 0.0741538851565471, + "grad_norm": 1.03125, + "learning_rate": 0.0001199020037861967, + "loss": 2.2053, + "step": 585 + }, + { + "epoch": 0.07428064393459247, + "grad_norm": 0.90625, + "learning_rate": 0.00011990069317637886, + "loss": 2.4354, + "step": 586 + }, + { + "epoch": 0.07440740271263785, + "grad_norm": 0.8359375, + "learning_rate": 0.0001198993738678356, + "loss": 2.3413, + "step": 587 + }, + { + "epoch": 0.07453416149068323, + "grad_norm": 0.890625, + "learning_rate": 0.00011989804586075852, + "loss": 1.8386, + "step": 588 + }, + { + "epoch": 0.07466092026872861, + "grad_norm": 1.0625, + "learning_rate": 0.00011989670915534047, + "loss": 1.7969, + "step": 589 + }, + { + "epoch": 0.07478767904677398, + "grad_norm": 0.89453125, + "learning_rate": 0.00011989536375177556, + "loss": 2.0728, + "step": 590 + }, + { + "epoch": 0.07491443782481937, + "grad_norm": 0.92578125, + "learning_rate": 0.00011989400965025915, + "loss": 1.5632, + "step": 591 + }, + { + "epoch": 0.07504119660286475, + "grad_norm": 1.0625, + "learning_rate": 0.00011989264685098793, + "loss": 2.2075, + "step": 592 + }, + { + "epoch": 0.07516795538091013, + "grad_norm": 1.0859375, + "learning_rate": 0.00011989127535415976, + "loss": 2.5995, + "step": 593 + }, + { + "epoch": 0.07529471415895551, + "grad_norm": 1.15625, + "learning_rate": 0.00011988989515997382, + "loss": 2.0291, + "step": 594 + }, + { + "epoch": 0.07542147293700088, + "grad_norm": 0.96875, + "learning_rate": 0.00011988850626863055, + "loss": 2.3419, + "step": 595 + }, + { + "epoch": 0.07554823171504627, + "grad_norm": 1.015625, + "learning_rate": 0.00011988710868033165, + "loss": 1.6881, + "step": 596 + }, + { + "epoch": 0.07567499049309165, + "grad_norm": 0.91796875, + "learning_rate": 0.00011988570239528005, + "loss": 1.6839, + "step": 597 + }, + { + "epoch": 0.07580174927113703, + "grad_norm": 0.91015625, + "learning_rate": 0.00011988428741367999, + "loss": 2.0041, + "step": 598 + }, + { + "epoch": 0.0759285080491824, + "grad_norm": 1.0703125, + "learning_rate": 0.00011988286373573693, + "loss": 2.347, + "step": 599 + }, + { + "epoch": 0.07605526682722778, + "grad_norm": 1.0625, + "learning_rate": 0.00011988143136165767, + "loss": 2.8373, + "step": 600 + }, + { + "epoch": 0.07618202560527317, + "grad_norm": 1.0703125, + "learning_rate": 0.00011987999029165015, + "loss": 2.2407, + "step": 601 + }, + { + "epoch": 0.07630878438331855, + "grad_norm": 0.984375, + "learning_rate": 0.00011987854052592368, + "loss": 1.8534, + "step": 602 + }, + { + "epoch": 0.07643554316136393, + "grad_norm": 0.95703125, + "learning_rate": 0.00011987708206468878, + "loss": 2.4979, + "step": 603 + }, + { + "epoch": 0.0765623019394093, + "grad_norm": 0.98046875, + "learning_rate": 0.00011987561490815723, + "loss": 2.3751, + "step": 604 + }, + { + "epoch": 0.07668906071745468, + "grad_norm": 0.96875, + "learning_rate": 0.00011987413905654213, + "loss": 2.1247, + "step": 605 + }, + { + "epoch": 0.07681581949550007, + "grad_norm": 0.953125, + "learning_rate": 0.00011987265451005778, + "loss": 2.1684, + "step": 606 + }, + { + "epoch": 0.07694257827354545, + "grad_norm": 1.1953125, + "learning_rate": 0.00011987116126891977, + "loss": 2.2431, + "step": 607 + }, + { + "epoch": 0.07706933705159082, + "grad_norm": 0.984375, + "learning_rate": 0.00011986965933334493, + "loss": 1.834, + "step": 608 + }, + { + "epoch": 0.0771960958296362, + "grad_norm": 0.93359375, + "learning_rate": 0.0001198681487035514, + "loss": 2.3618, + "step": 609 + }, + { + "epoch": 0.07732285460768158, + "grad_norm": 0.9375, + "learning_rate": 0.00011986662937975852, + "loss": 2.0986, + "step": 610 + }, + { + "epoch": 0.07744961338572696, + "grad_norm": 0.8984375, + "learning_rate": 0.00011986510136218695, + "loss": 1.9086, + "step": 611 + }, + { + "epoch": 0.07757637216377235, + "grad_norm": 0.984375, + "learning_rate": 0.00011986356465105856, + "loss": 2.4112, + "step": 612 + }, + { + "epoch": 0.07770313094181772, + "grad_norm": 1.0859375, + "learning_rate": 0.00011986201924659656, + "loss": 1.9192, + "step": 613 + }, + { + "epoch": 0.0778298897198631, + "grad_norm": 0.98828125, + "learning_rate": 0.00011986046514902532, + "loss": 2.5386, + "step": 614 + }, + { + "epoch": 0.07795664849790848, + "grad_norm": 0.94140625, + "learning_rate": 0.00011985890235857053, + "loss": 3.2956, + "step": 615 + }, + { + "epoch": 0.07808340727595386, + "grad_norm": 1.0, + "learning_rate": 0.00011985733087545917, + "loss": 2.4149, + "step": 616 + }, + { + "epoch": 0.07821016605399923, + "grad_norm": 0.98046875, + "learning_rate": 0.00011985575069991942, + "loss": 2.296, + "step": 617 + }, + { + "epoch": 0.07833692483204462, + "grad_norm": 0.88671875, + "learning_rate": 0.00011985416183218078, + "loss": 1.956, + "step": 618 + }, + { + "epoch": 0.07846368361009, + "grad_norm": 0.84375, + "learning_rate": 0.00011985256427247395, + "loss": 2.044, + "step": 619 + }, + { + "epoch": 0.07859044238813538, + "grad_norm": 0.93359375, + "learning_rate": 0.00011985095802103094, + "loss": 2.2935, + "step": 620 + }, + { + "epoch": 0.07871720116618076, + "grad_norm": 0.99609375, + "learning_rate": 0.00011984934307808502, + "loss": 1.8738, + "step": 621 + }, + { + "epoch": 0.07884395994422613, + "grad_norm": 1.0078125, + "learning_rate": 0.00011984771944387071, + "loss": 2.1287, + "step": 622 + }, + { + "epoch": 0.07897071872227152, + "grad_norm": 0.9765625, + "learning_rate": 0.00011984608711862376, + "loss": 1.948, + "step": 623 + }, + { + "epoch": 0.0790974775003169, + "grad_norm": 0.88671875, + "learning_rate": 0.00011984444610258125, + "loss": 2.0518, + "step": 624 + }, + { + "epoch": 0.07922423627836228, + "grad_norm": 0.9296875, + "learning_rate": 0.00011984279639598149, + "loss": 1.7571, + "step": 625 + }, + { + "epoch": 0.07935099505640765, + "grad_norm": 0.98046875, + "learning_rate": 0.00011984113799906402, + "loss": 1.9322, + "step": 626 + }, + { + "epoch": 0.07947775383445303, + "grad_norm": 1.0078125, + "learning_rate": 0.00011983947091206968, + "loss": 2.4565, + "step": 627 + }, + { + "epoch": 0.07960451261249842, + "grad_norm": 1.3203125, + "learning_rate": 0.00011983779513524058, + "loss": 2.3367, + "step": 628 + }, + { + "epoch": 0.0797312713905438, + "grad_norm": 0.96875, + "learning_rate": 0.00011983611066882005, + "loss": 2.5078, + "step": 629 + }, + { + "epoch": 0.07985803016858918, + "grad_norm": 0.97265625, + "learning_rate": 0.00011983441751305274, + "loss": 2.3488, + "step": 630 + }, + { + "epoch": 0.07998478894663455, + "grad_norm": 1.0703125, + "learning_rate": 0.00011983271566818449, + "loss": 1.6402, + "step": 631 + }, + { + "epoch": 0.08011154772467993, + "grad_norm": 0.9921875, + "learning_rate": 0.00011983100513446247, + "loss": 2.0724, + "step": 632 + }, + { + "epoch": 0.08023830650272532, + "grad_norm": 1.03125, + "learning_rate": 0.00011982928591213507, + "loss": 2.1708, + "step": 633 + }, + { + "epoch": 0.0803650652807707, + "grad_norm": 1.1328125, + "learning_rate": 0.00011982755800145196, + "loss": 2.2252, + "step": 634 + }, + { + "epoch": 0.08049182405881607, + "grad_norm": 1.234375, + "learning_rate": 0.00011982582140266405, + "loss": 2.4464, + "step": 635 + }, + { + "epoch": 0.08061858283686145, + "grad_norm": 0.98828125, + "learning_rate": 0.00011982407611602353, + "loss": 2.0225, + "step": 636 + }, + { + "epoch": 0.08074534161490683, + "grad_norm": 1.0859375, + "learning_rate": 0.00011982232214178389, + "loss": 1.8768, + "step": 637 + }, + { + "epoch": 0.08087210039295221, + "grad_norm": 0.984375, + "learning_rate": 0.00011982055948019977, + "loss": 2.2465, + "step": 638 + }, + { + "epoch": 0.0809988591709976, + "grad_norm": 1.140625, + "learning_rate": 0.00011981878813152721, + "loss": 1.8231, + "step": 639 + }, + { + "epoch": 0.08112561794904297, + "grad_norm": 0.953125, + "learning_rate": 0.0001198170080960234, + "loss": 1.9809, + "step": 640 + }, + { + "epoch": 0.08125237672708835, + "grad_norm": 1.0078125, + "learning_rate": 0.00011981521937394684, + "loss": 1.7854, + "step": 641 + }, + { + "epoch": 0.08137913550513373, + "grad_norm": 0.9296875, + "learning_rate": 0.00011981342196555732, + "loss": 2.0439, + "step": 642 + }, + { + "epoch": 0.08150589428317911, + "grad_norm": 1.0859375, + "learning_rate": 0.0001198116158711158, + "loss": 2.6965, + "step": 643 + }, + { + "epoch": 0.08163265306122448, + "grad_norm": 0.91015625, + "learning_rate": 0.00011980980109088462, + "loss": 1.985, + "step": 644 + }, + { + "epoch": 0.08175941183926987, + "grad_norm": 0.8984375, + "learning_rate": 0.0001198079776251273, + "loss": 1.7421, + "step": 645 + }, + { + "epoch": 0.08188617061731525, + "grad_norm": 0.9609375, + "learning_rate": 0.00011980614547410861, + "loss": 2.3657, + "step": 646 + }, + { + "epoch": 0.08201292939536063, + "grad_norm": 0.97265625, + "learning_rate": 0.00011980430463809464, + "loss": 1.9375, + "step": 647 + }, + { + "epoch": 0.08213968817340601, + "grad_norm": 0.80078125, + "learning_rate": 0.00011980245511735273, + "loss": 1.8679, + "step": 648 + }, + { + "epoch": 0.08226644695145138, + "grad_norm": 1.0390625, + "learning_rate": 0.00011980059691215143, + "loss": 2.269, + "step": 649 + }, + { + "epoch": 0.08239320572949677, + "grad_norm": 0.953125, + "learning_rate": 0.00011979873002276063, + "loss": 1.7489, + "step": 650 + }, + { + "epoch": 0.08251996450754215, + "grad_norm": 0.8984375, + "learning_rate": 0.00011979685444945141, + "loss": 1.9268, + "step": 651 + }, + { + "epoch": 0.08264672328558753, + "grad_norm": 1.3671875, + "learning_rate": 0.00011979497019249612, + "loss": 1.6361, + "step": 652 + }, + { + "epoch": 0.0827734820636329, + "grad_norm": 1.0625, + "learning_rate": 0.00011979307725216843, + "loss": 2.2859, + "step": 653 + }, + { + "epoch": 0.08290024084167828, + "grad_norm": 0.96484375, + "learning_rate": 0.00011979117562874322, + "loss": 1.7728, + "step": 654 + }, + { + "epoch": 0.08302699961972367, + "grad_norm": 1.03125, + "learning_rate": 0.00011978926532249663, + "loss": 1.7889, + "step": 655 + }, + { + "epoch": 0.08315375839776905, + "grad_norm": 1.140625, + "learning_rate": 0.00011978734633370608, + "loss": 2.2557, + "step": 656 + }, + { + "epoch": 0.08328051717581443, + "grad_norm": 0.95703125, + "learning_rate": 0.00011978541866265025, + "loss": 2.1239, + "step": 657 + }, + { + "epoch": 0.0834072759538598, + "grad_norm": 1.0625, + "learning_rate": 0.00011978348230960906, + "loss": 2.6118, + "step": 658 + }, + { + "epoch": 0.08353403473190518, + "grad_norm": 1.0234375, + "learning_rate": 0.00011978153727486372, + "loss": 2.7797, + "step": 659 + }, + { + "epoch": 0.08366079350995057, + "grad_norm": 0.9765625, + "learning_rate": 0.00011977958355869667, + "loss": 1.9079, + "step": 660 + }, + { + "epoch": 0.08378755228799595, + "grad_norm": 0.84765625, + "learning_rate": 0.00011977762116139164, + "loss": 2.0391, + "step": 661 + }, + { + "epoch": 0.08391431106604132, + "grad_norm": 0.890625, + "learning_rate": 0.00011977565008323361, + "loss": 1.9109, + "step": 662 + }, + { + "epoch": 0.0840410698440867, + "grad_norm": 0.89453125, + "learning_rate": 0.0001197736703245088, + "loss": 1.8121, + "step": 663 + }, + { + "epoch": 0.08416782862213208, + "grad_norm": 1.0234375, + "learning_rate": 0.00011977168188550474, + "loss": 1.6679, + "step": 664 + }, + { + "epoch": 0.08429458740017746, + "grad_norm": 1.171875, + "learning_rate": 0.00011976968476651016, + "loss": 2.61, + "step": 665 + }, + { + "epoch": 0.08442134617822285, + "grad_norm": 1.09375, + "learning_rate": 0.00011976767896781508, + "loss": 2.5032, + "step": 666 + }, + { + "epoch": 0.08454810495626822, + "grad_norm": 1.2265625, + "learning_rate": 0.00011976566448971082, + "loss": 2.2979, + "step": 667 + }, + { + "epoch": 0.0846748637343136, + "grad_norm": 0.91015625, + "learning_rate": 0.00011976364133248985, + "loss": 1.7313, + "step": 668 + }, + { + "epoch": 0.08480162251235898, + "grad_norm": 0.8671875, + "learning_rate": 0.00011976160949644604, + "loss": 2.395, + "step": 669 + }, + { + "epoch": 0.08492838129040436, + "grad_norm": 1.0078125, + "learning_rate": 0.00011975956898187444, + "loss": 2.5355, + "step": 670 + }, + { + "epoch": 0.08505514006844973, + "grad_norm": 0.97265625, + "learning_rate": 0.00011975751978907132, + "loss": 2.0805, + "step": 671 + }, + { + "epoch": 0.08518189884649512, + "grad_norm": 0.93359375, + "learning_rate": 0.00011975546191833432, + "loss": 2.1901, + "step": 672 + }, + { + "epoch": 0.0853086576245405, + "grad_norm": 0.9453125, + "learning_rate": 0.00011975339536996225, + "loss": 2.3183, + "step": 673 + }, + { + "epoch": 0.08543541640258588, + "grad_norm": 1.0078125, + "learning_rate": 0.00011975132014425523, + "loss": 2.1307, + "step": 674 + }, + { + "epoch": 0.08556217518063126, + "grad_norm": 0.90234375, + "learning_rate": 0.00011974923624151461, + "loss": 1.8928, + "step": 675 + }, + { + "epoch": 0.08568893395867663, + "grad_norm": 0.921875, + "learning_rate": 0.00011974714366204301, + "loss": 1.9199, + "step": 676 + }, + { + "epoch": 0.08581569273672202, + "grad_norm": 0.90234375, + "learning_rate": 0.00011974504240614434, + "loss": 2.1302, + "step": 677 + }, + { + "epoch": 0.0859424515147674, + "grad_norm": 1.2578125, + "learning_rate": 0.0001197429324741237, + "loss": 2.2597, + "step": 678 + }, + { + "epoch": 0.08606921029281278, + "grad_norm": 0.98828125, + "learning_rate": 0.00011974081386628754, + "loss": 2.5662, + "step": 679 + }, + { + "epoch": 0.08619596907085816, + "grad_norm": 1.3515625, + "learning_rate": 0.00011973868658294348, + "loss": 2.3477, + "step": 680 + }, + { + "epoch": 0.08632272784890353, + "grad_norm": 1.1796875, + "learning_rate": 0.00011973655062440045, + "loss": 2.0649, + "step": 681 + }, + { + "epoch": 0.08644948662694892, + "grad_norm": 1.09375, + "learning_rate": 0.00011973440599096864, + "loss": 1.9167, + "step": 682 + }, + { + "epoch": 0.0865762454049943, + "grad_norm": 1.078125, + "learning_rate": 0.00011973225268295953, + "loss": 1.9955, + "step": 683 + }, + { + "epoch": 0.08670300418303968, + "grad_norm": 1.0625, + "learning_rate": 0.00011973009070068575, + "loss": 2.5441, + "step": 684 + }, + { + "epoch": 0.08682976296108505, + "grad_norm": 1.03125, + "learning_rate": 0.00011972792004446131, + "loss": 1.5964, + "step": 685 + }, + { + "epoch": 0.08695652173913043, + "grad_norm": 1.078125, + "learning_rate": 0.0001197257407146014, + "loss": 2.15, + "step": 686 + }, + { + "epoch": 0.08708328051717582, + "grad_norm": 0.98046875, + "learning_rate": 0.00011972355271142253, + "loss": 2.0484, + "step": 687 + }, + { + "epoch": 0.0872100392952212, + "grad_norm": 1.0625, + "learning_rate": 0.00011972135603524243, + "loss": 2.1226, + "step": 688 + }, + { + "epoch": 0.08733679807326658, + "grad_norm": 1.0078125, + "learning_rate": 0.00011971915068638009, + "loss": 2.4057, + "step": 689 + }, + { + "epoch": 0.08746355685131195, + "grad_norm": 1.3359375, + "learning_rate": 0.00011971693666515577, + "loss": 2.2777, + "step": 690 + }, + { + "epoch": 0.08759031562935733, + "grad_norm": 0.96875, + "learning_rate": 0.00011971471397189101, + "loss": 2.3298, + "step": 691 + }, + { + "epoch": 0.08771707440740271, + "grad_norm": 0.91796875, + "learning_rate": 0.00011971248260690855, + "loss": 1.7263, + "step": 692 + }, + { + "epoch": 0.0878438331854481, + "grad_norm": 1.0234375, + "learning_rate": 0.00011971024257053246, + "loss": 1.8121, + "step": 693 + }, + { + "epoch": 0.08797059196349347, + "grad_norm": 0.9765625, + "learning_rate": 0.00011970799386308802, + "loss": 2.1231, + "step": 694 + }, + { + "epoch": 0.08809735074153885, + "grad_norm": 0.921875, + "learning_rate": 0.00011970573648490178, + "loss": 1.5007, + "step": 695 + }, + { + "epoch": 0.08822410951958423, + "grad_norm": 0.953125, + "learning_rate": 0.00011970347043630157, + "loss": 2.1878, + "step": 696 + }, + { + "epoch": 0.08835086829762961, + "grad_norm": 0.8984375, + "learning_rate": 0.00011970119571761648, + "loss": 1.8548, + "step": 697 + }, + { + "epoch": 0.088477627075675, + "grad_norm": 1.03125, + "learning_rate": 0.00011969891232917678, + "loss": 2.0327, + "step": 698 + }, + { + "epoch": 0.08860438585372037, + "grad_norm": 0.83984375, + "learning_rate": 0.00011969662027131412, + "loss": 2.5235, + "step": 699 + }, + { + "epoch": 0.08873114463176575, + "grad_norm": 1.0390625, + "learning_rate": 0.00011969431954436135, + "loss": 1.8969, + "step": 700 + }, + { + "epoch": 0.08885790340981113, + "grad_norm": 1.109375, + "learning_rate": 0.00011969201014865251, + "loss": 2.3541, + "step": 701 + }, + { + "epoch": 0.08898466218785651, + "grad_norm": 0.95703125, + "learning_rate": 0.00011968969208452307, + "loss": 2.3007, + "step": 702 + }, + { + "epoch": 0.08911142096590188, + "grad_norm": 0.87890625, + "learning_rate": 0.00011968736535230958, + "loss": 2.0309, + "step": 703 + }, + { + "epoch": 0.08923817974394727, + "grad_norm": 0.9765625, + "learning_rate": 0.00011968502995234996, + "loss": 1.8516, + "step": 704 + }, + { + "epoch": 0.08936493852199265, + "grad_norm": 1.0703125, + "learning_rate": 0.00011968268588498334, + "loss": 2.1298, + "step": 705 + }, + { + "epoch": 0.08949169730003803, + "grad_norm": 0.9140625, + "learning_rate": 0.00011968033315055015, + "loss": 1.8224, + "step": 706 + }, + { + "epoch": 0.08961845607808341, + "grad_norm": 1.0859375, + "learning_rate": 0.00011967797174939201, + "loss": 1.7282, + "step": 707 + }, + { + "epoch": 0.08974521485612878, + "grad_norm": 1.8984375, + "learning_rate": 0.00011967560168185188, + "loss": 1.8683, + "step": 708 + }, + { + "epoch": 0.08987197363417417, + "grad_norm": 0.94921875, + "learning_rate": 0.00011967322294827391, + "loss": 2.2558, + "step": 709 + }, + { + "epoch": 0.08999873241221955, + "grad_norm": 0.9765625, + "learning_rate": 0.00011967083554900356, + "loss": 2.0726, + "step": 710 + }, + { + "epoch": 0.09012549119026493, + "grad_norm": 1.2109375, + "learning_rate": 0.00011966843948438751, + "loss": 2.7413, + "step": 711 + }, + { + "epoch": 0.0902522499683103, + "grad_norm": 0.90234375, + "learning_rate": 0.00011966603475477373, + "loss": 2.3527, + "step": 712 + }, + { + "epoch": 0.09037900874635568, + "grad_norm": 1.1640625, + "learning_rate": 0.00011966362136051142, + "loss": 2.3866, + "step": 713 + }, + { + "epoch": 0.09050576752440107, + "grad_norm": 0.9296875, + "learning_rate": 0.00011966119930195105, + "loss": 1.8672, + "step": 714 + }, + { + "epoch": 0.09063252630244645, + "grad_norm": 1.1171875, + "learning_rate": 0.00011965876857944435, + "loss": 2.241, + "step": 715 + }, + { + "epoch": 0.09075928508049183, + "grad_norm": 0.91796875, + "learning_rate": 0.00011965632919334432, + "loss": 1.8195, + "step": 716 + }, + { + "epoch": 0.0908860438585372, + "grad_norm": 1.0234375, + "learning_rate": 0.0001196538811440052, + "loss": 2.044, + "step": 717 + }, + { + "epoch": 0.09101280263658258, + "grad_norm": 1.03125, + "learning_rate": 0.00011965142443178247, + "loss": 1.8305, + "step": 718 + }, + { + "epoch": 0.09113956141462796, + "grad_norm": 1.0, + "learning_rate": 0.00011964895905703293, + "loss": 2.2118, + "step": 719 + }, + { + "epoch": 0.09126632019267335, + "grad_norm": 1.3515625, + "learning_rate": 0.00011964648502011455, + "loss": 2.5554, + "step": 720 + }, + { + "epoch": 0.09139307897071872, + "grad_norm": 0.90234375, + "learning_rate": 0.00011964400232138668, + "loss": 1.9718, + "step": 721 + }, + { + "epoch": 0.0915198377487641, + "grad_norm": 0.8125, + "learning_rate": 0.00011964151096120979, + "loss": 1.6874, + "step": 722 + }, + { + "epoch": 0.09164659652680948, + "grad_norm": 0.82421875, + "learning_rate": 0.0001196390109399457, + "loss": 2.1046, + "step": 723 + }, + { + "epoch": 0.09177335530485486, + "grad_norm": 0.8671875, + "learning_rate": 0.00011963650225795745, + "loss": 2.0455, + "step": 724 + }, + { + "epoch": 0.09190011408290025, + "grad_norm": 1.015625, + "learning_rate": 0.00011963398491560935, + "loss": 2.0469, + "step": 725 + }, + { + "epoch": 0.09202687286094562, + "grad_norm": 0.9453125, + "learning_rate": 0.000119631458913267, + "loss": 2.1176, + "step": 726 + }, + { + "epoch": 0.092153631638991, + "grad_norm": 0.94140625, + "learning_rate": 0.00011962892425129717, + "loss": 2.1013, + "step": 727 + }, + { + "epoch": 0.09228039041703638, + "grad_norm": 1.1875, + "learning_rate": 0.000119626380930068, + "loss": 2.6649, + "step": 728 + }, + { + "epoch": 0.09240714919508176, + "grad_norm": 0.94921875, + "learning_rate": 0.00011962382894994878, + "loss": 2.2427, + "step": 729 + }, + { + "epoch": 0.09253390797312713, + "grad_norm": 0.97265625, + "learning_rate": 0.0001196212683113101, + "loss": 2.2411, + "step": 730 + }, + { + "epoch": 0.09266066675117252, + "grad_norm": 1.0703125, + "learning_rate": 0.00011961869901452387, + "loss": 2.2934, + "step": 731 + }, + { + "epoch": 0.0927874255292179, + "grad_norm": 0.89453125, + "learning_rate": 0.00011961612105996313, + "loss": 1.7202, + "step": 732 + }, + { + "epoch": 0.09291418430726328, + "grad_norm": 0.8671875, + "learning_rate": 0.00011961353444800231, + "loss": 2.1739, + "step": 733 + }, + { + "epoch": 0.09304094308530866, + "grad_norm": 1.078125, + "learning_rate": 0.00011961093917901702, + "loss": 2.2496, + "step": 734 + }, + { + "epoch": 0.09316770186335403, + "grad_norm": 0.91796875, + "learning_rate": 0.00011960833525338412, + "loss": 2.0156, + "step": 735 + }, + { + "epoch": 0.09329446064139942, + "grad_norm": 0.97265625, + "learning_rate": 0.00011960572267148176, + "loss": 2.3718, + "step": 736 + }, + { + "epoch": 0.0934212194194448, + "grad_norm": 1.1640625, + "learning_rate": 0.00011960310143368936, + "loss": 2.9012, + "step": 737 + }, + { + "epoch": 0.09354797819749018, + "grad_norm": 0.98046875, + "learning_rate": 0.00011960047154038753, + "loss": 2.3472, + "step": 738 + }, + { + "epoch": 0.09367473697553555, + "grad_norm": 0.96875, + "learning_rate": 0.00011959783299195821, + "loss": 2.6858, + "step": 739 + }, + { + "epoch": 0.09380149575358093, + "grad_norm": 0.84765625, + "learning_rate": 0.00011959518578878457, + "loss": 2.0731, + "step": 740 + }, + { + "epoch": 0.09392825453162632, + "grad_norm": 1.0390625, + "learning_rate": 0.00011959252993125104, + "loss": 2.0515, + "step": 741 + }, + { + "epoch": 0.0940550133096717, + "grad_norm": 0.90625, + "learning_rate": 0.00011958986541974326, + "loss": 2.0835, + "step": 742 + }, + { + "epoch": 0.09418177208771708, + "grad_norm": 1.171875, + "learning_rate": 0.00011958719225464821, + "loss": 2.1471, + "step": 743 + }, + { + "epoch": 0.09430853086576245, + "grad_norm": 0.90234375, + "learning_rate": 0.00011958451043635406, + "loss": 1.9637, + "step": 744 + }, + { + "epoch": 0.09443528964380783, + "grad_norm": 0.9375, + "learning_rate": 0.00011958181996525029, + "loss": 2.4131, + "step": 745 + }, + { + "epoch": 0.09456204842185321, + "grad_norm": 1.015625, + "learning_rate": 0.00011957912084172758, + "loss": 1.9366, + "step": 746 + }, + { + "epoch": 0.0946888071998986, + "grad_norm": 0.8984375, + "learning_rate": 0.0001195764130661779, + "loss": 2.0421, + "step": 747 + }, + { + "epoch": 0.09481556597794397, + "grad_norm": 1.0078125, + "learning_rate": 0.0001195736966389945, + "loss": 2.4487, + "step": 748 + }, + { + "epoch": 0.09494232475598935, + "grad_norm": 1.0234375, + "learning_rate": 0.00011957097156057179, + "loss": 2.1194, + "step": 749 + }, + { + "epoch": 0.09506908353403473, + "grad_norm": 0.94921875, + "learning_rate": 0.00011956823783130558, + "loss": 2.0232, + "step": 750 + }, + { + "epoch": 0.09519584231208011, + "grad_norm": 1.0234375, + "learning_rate": 0.00011956549545159281, + "loss": 1.7714, + "step": 751 + }, + { + "epoch": 0.0953226010901255, + "grad_norm": 0.9609375, + "learning_rate": 0.00011956274442183175, + "loss": 1.8993, + "step": 752 + }, + { + "epoch": 0.09544935986817087, + "grad_norm": 1.078125, + "learning_rate": 0.0001195599847424219, + "loss": 2.0853, + "step": 753 + }, + { + "epoch": 0.09557611864621625, + "grad_norm": 1.0703125, + "learning_rate": 0.00011955721641376398, + "loss": 2.4571, + "step": 754 + }, + { + "epoch": 0.09570287742426163, + "grad_norm": 1.03125, + "learning_rate": 0.00011955443943626006, + "loss": 2.002, + "step": 755 + }, + { + "epoch": 0.09582963620230701, + "grad_norm": 0.9921875, + "learning_rate": 0.00011955165381031339, + "loss": 1.9114, + "step": 756 + }, + { + "epoch": 0.09595639498035238, + "grad_norm": 1.0625, + "learning_rate": 0.00011954885953632848, + "loss": 1.9698, + "step": 757 + }, + { + "epoch": 0.09608315375839777, + "grad_norm": 1.109375, + "learning_rate": 0.00011954605661471113, + "loss": 2.376, + "step": 758 + }, + { + "epoch": 0.09620991253644315, + "grad_norm": 0.953125, + "learning_rate": 0.00011954324504586837, + "loss": 2.3287, + "step": 759 + }, + { + "epoch": 0.09633667131448853, + "grad_norm": 0.98828125, + "learning_rate": 0.0001195404248302085, + "loss": 2.1746, + "step": 760 + }, + { + "epoch": 0.09646343009253391, + "grad_norm": 0.9609375, + "learning_rate": 0.00011953759596814105, + "loss": 1.9198, + "step": 761 + }, + { + "epoch": 0.09659018887057928, + "grad_norm": 0.953125, + "learning_rate": 0.00011953475846007686, + "loss": 2.4026, + "step": 762 + }, + { + "epoch": 0.09671694764862467, + "grad_norm": 0.9921875, + "learning_rate": 0.00011953191230642796, + "loss": 1.7982, + "step": 763 + }, + { + "epoch": 0.09684370642667005, + "grad_norm": 0.91015625, + "learning_rate": 0.00011952905750760767, + "loss": 1.6029, + "step": 764 + }, + { + "epoch": 0.09697046520471543, + "grad_norm": 0.91015625, + "learning_rate": 0.00011952619406403057, + "loss": 2.3202, + "step": 765 + }, + { + "epoch": 0.0970972239827608, + "grad_norm": 0.9921875, + "learning_rate": 0.0001195233219761125, + "loss": 1.938, + "step": 766 + }, + { + "epoch": 0.09722398276080618, + "grad_norm": 0.875, + "learning_rate": 0.00011952044124427051, + "loss": 2.004, + "step": 767 + }, + { + "epoch": 0.09735074153885157, + "grad_norm": 0.9453125, + "learning_rate": 0.00011951755186892294, + "loss": 1.8755, + "step": 768 + }, + { + "epoch": 0.09747750031689695, + "grad_norm": 0.9921875, + "learning_rate": 0.00011951465385048943, + "loss": 1.9519, + "step": 769 + }, + { + "epoch": 0.09760425909494233, + "grad_norm": 1.0703125, + "learning_rate": 0.00011951174718939076, + "loss": 2.5218, + "step": 770 + }, + { + "epoch": 0.0977310178729877, + "grad_norm": 0.88671875, + "learning_rate": 0.0001195088318860491, + "loss": 1.939, + "step": 771 + }, + { + "epoch": 0.09785777665103308, + "grad_norm": 0.94921875, + "learning_rate": 0.00011950590794088774, + "loss": 1.885, + "step": 772 + }, + { + "epoch": 0.09798453542907846, + "grad_norm": 0.8828125, + "learning_rate": 0.00011950297535433134, + "loss": 1.7499, + "step": 773 + }, + { + "epoch": 0.09811129420712385, + "grad_norm": 0.8671875, + "learning_rate": 0.00011950003412680576, + "loss": 1.4875, + "step": 774 + }, + { + "epoch": 0.09823805298516922, + "grad_norm": 1.140625, + "learning_rate": 0.00011949708425873811, + "loss": 2.3029, + "step": 775 + }, + { + "epoch": 0.0983648117632146, + "grad_norm": 0.99609375, + "learning_rate": 0.00011949412575055678, + "loss": 2.1795, + "step": 776 + }, + { + "epoch": 0.09849157054125998, + "grad_norm": 0.9375, + "learning_rate": 0.0001194911586026914, + "loss": 1.8516, + "step": 777 + }, + { + "epoch": 0.09861832931930536, + "grad_norm": 0.9375, + "learning_rate": 0.00011948818281557285, + "loss": 2.0553, + "step": 778 + }, + { + "epoch": 0.09874508809735075, + "grad_norm": 0.96875, + "learning_rate": 0.00011948519838963327, + "loss": 2.2203, + "step": 779 + }, + { + "epoch": 0.09887184687539612, + "grad_norm": 1.6640625, + "learning_rate": 0.00011948220532530605, + "loss": 2.6721, + "step": 780 + }, + { + "epoch": 0.0989986056534415, + "grad_norm": 1.0, + "learning_rate": 0.00011947920362302586, + "loss": 2.2112, + "step": 781 + }, + { + "epoch": 0.09912536443148688, + "grad_norm": 0.8828125, + "learning_rate": 0.00011947619328322861, + "loss": 1.5344, + "step": 782 + }, + { + "epoch": 0.09925212320953226, + "grad_norm": 0.85546875, + "learning_rate": 0.00011947317430635142, + "loss": 1.8941, + "step": 783 + }, + { + "epoch": 0.09937888198757763, + "grad_norm": 0.97265625, + "learning_rate": 0.00011947014669283273, + "loss": 2.3235, + "step": 784 + }, + { + "epoch": 0.09950564076562302, + "grad_norm": 0.859375, + "learning_rate": 0.0001194671104431122, + "loss": 2.0323, + "step": 785 + }, + { + "epoch": 0.0996323995436684, + "grad_norm": 0.9375, + "learning_rate": 0.00011946406555763074, + "loss": 2.2627, + "step": 786 + }, + { + "epoch": 0.09975915832171378, + "grad_norm": 1.046875, + "learning_rate": 0.00011946101203683057, + "loss": 2.0878, + "step": 787 + }, + { + "epoch": 0.09988591709975916, + "grad_norm": 1.2109375, + "learning_rate": 0.00011945794988115509, + "loss": 2.3721, + "step": 788 + }, + { + "epoch": 0.10001267587780453, + "grad_norm": 0.98828125, + "learning_rate": 0.00011945487909104896, + "loss": 1.7971, + "step": 789 + }, + { + "epoch": 0.10013943465584992, + "grad_norm": 0.95703125, + "learning_rate": 0.00011945179966695816, + "loss": 2.027, + "step": 790 + }, + { + "epoch": 0.1002661934338953, + "grad_norm": 0.9609375, + "learning_rate": 0.00011944871160932986, + "loss": 1.7246, + "step": 791 + }, + { + "epoch": 0.10039295221194068, + "grad_norm": 0.90625, + "learning_rate": 0.0001194456149186125, + "loss": 1.9774, + "step": 792 + }, + { + "epoch": 0.10051971098998605, + "grad_norm": 1.171875, + "learning_rate": 0.00011944250959525579, + "loss": 1.9793, + "step": 793 + }, + { + "epoch": 0.10064646976803143, + "grad_norm": 0.8671875, + "learning_rate": 0.00011943939563971068, + "loss": 1.6943, + "step": 794 + }, + { + "epoch": 0.10077322854607682, + "grad_norm": 0.890625, + "learning_rate": 0.00011943627305242937, + "loss": 1.7865, + "step": 795 + }, + { + "epoch": 0.1008999873241222, + "grad_norm": 0.98046875, + "learning_rate": 0.00011943314183386531, + "loss": 2.0512, + "step": 796 + }, + { + "epoch": 0.10102674610216758, + "grad_norm": 0.91015625, + "learning_rate": 0.00011943000198447325, + "loss": 2.1047, + "step": 797 + }, + { + "epoch": 0.10115350488021295, + "grad_norm": 1.0390625, + "learning_rate": 0.00011942685350470912, + "loss": 1.9063, + "step": 798 + }, + { + "epoch": 0.10128026365825833, + "grad_norm": 0.9921875, + "learning_rate": 0.00011942369639503015, + "loss": 2.6003, + "step": 799 + }, + { + "epoch": 0.10140702243630371, + "grad_norm": 0.86328125, + "learning_rate": 0.00011942053065589483, + "loss": 1.9722, + "step": 800 + }, + { + "epoch": 0.1015337812143491, + "grad_norm": 0.99609375, + "learning_rate": 0.00011941735628776285, + "loss": 2.2075, + "step": 801 + }, + { + "epoch": 0.10166053999239448, + "grad_norm": 0.99609375, + "learning_rate": 0.00011941417329109522, + "loss": 2.4068, + "step": 802 + }, + { + "epoch": 0.10178729877043985, + "grad_norm": 0.94921875, + "learning_rate": 0.00011941098166635416, + "loss": 2.1338, + "step": 803 + }, + { + "epoch": 0.10191405754848523, + "grad_norm": 1.1484375, + "learning_rate": 0.00011940778141400316, + "loss": 2.2051, + "step": 804 + }, + { + "epoch": 0.10204081632653061, + "grad_norm": 0.9375, + "learning_rate": 0.00011940457253450694, + "loss": 1.949, + "step": 805 + }, + { + "epoch": 0.102167575104576, + "grad_norm": 0.9453125, + "learning_rate": 0.0001194013550283315, + "loss": 2.0288, + "step": 806 + }, + { + "epoch": 0.10229433388262137, + "grad_norm": 0.94921875, + "learning_rate": 0.00011939812889594411, + "loss": 1.9459, + "step": 807 + }, + { + "epoch": 0.10242109266066675, + "grad_norm": 1.0546875, + "learning_rate": 0.00011939489413781323, + "loss": 2.3006, + "step": 808 + }, + { + "epoch": 0.10254785143871213, + "grad_norm": 0.9609375, + "learning_rate": 0.00011939165075440863, + "loss": 2.0992, + "step": 809 + }, + { + "epoch": 0.10267461021675751, + "grad_norm": 0.8828125, + "learning_rate": 0.0001193883987462013, + "loss": 2.0763, + "step": 810 + }, + { + "epoch": 0.1028013689948029, + "grad_norm": 0.9296875, + "learning_rate": 0.00011938513811366349, + "loss": 1.8806, + "step": 811 + }, + { + "epoch": 0.10292812777284827, + "grad_norm": 0.99609375, + "learning_rate": 0.00011938186885726873, + "loss": 2.4763, + "step": 812 + }, + { + "epoch": 0.10305488655089365, + "grad_norm": 1.4453125, + "learning_rate": 0.00011937859097749175, + "loss": 1.8215, + "step": 813 + }, + { + "epoch": 0.10318164532893903, + "grad_norm": 0.8515625, + "learning_rate": 0.00011937530447480858, + "loss": 2.2605, + "step": 814 + }, + { + "epoch": 0.10330840410698441, + "grad_norm": 0.8984375, + "learning_rate": 0.00011937200934969648, + "loss": 1.9445, + "step": 815 + }, + { + "epoch": 0.10343516288502978, + "grad_norm": 0.85546875, + "learning_rate": 0.00011936870560263394, + "loss": 1.9748, + "step": 816 + }, + { + "epoch": 0.10356192166307517, + "grad_norm": 1.0, + "learning_rate": 0.00011936539323410077, + "loss": 2.0225, + "step": 817 + }, + { + "epoch": 0.10368868044112055, + "grad_norm": 1.03125, + "learning_rate": 0.00011936207224457796, + "loss": 2.5079, + "step": 818 + }, + { + "epoch": 0.10381543921916593, + "grad_norm": 1.0078125, + "learning_rate": 0.00011935874263454779, + "loss": 2.2673, + "step": 819 + }, + { + "epoch": 0.10394219799721131, + "grad_norm": 0.84375, + "learning_rate": 0.00011935540440449378, + "loss": 1.8439, + "step": 820 + }, + { + "epoch": 0.10406895677525668, + "grad_norm": 1.15625, + "learning_rate": 0.0001193520575549007, + "loss": 2.134, + "step": 821 + }, + { + "epoch": 0.10419571555330207, + "grad_norm": 0.98828125, + "learning_rate": 0.00011934870208625458, + "loss": 2.3663, + "step": 822 + }, + { + "epoch": 0.10432247433134745, + "grad_norm": 1.0703125, + "learning_rate": 0.0001193453379990427, + "loss": 2.3123, + "step": 823 + }, + { + "epoch": 0.10444923310939283, + "grad_norm": 0.9296875, + "learning_rate": 0.00011934196529375362, + "loss": 1.8325, + "step": 824 + }, + { + "epoch": 0.1045759918874382, + "grad_norm": 1.0, + "learning_rate": 0.00011933858397087707, + "loss": 2.1381, + "step": 825 + }, + { + "epoch": 0.10470275066548358, + "grad_norm": 0.8984375, + "learning_rate": 0.0001193351940309041, + "loss": 1.7612, + "step": 826 + }, + { + "epoch": 0.10482950944352896, + "grad_norm": 0.9375, + "learning_rate": 0.000119331795474327, + "loss": 1.5592, + "step": 827 + }, + { + "epoch": 0.10495626822157435, + "grad_norm": 0.97265625, + "learning_rate": 0.00011932838830163932, + "loss": 2.3928, + "step": 828 + }, + { + "epoch": 0.10508302699961973, + "grad_norm": 0.87890625, + "learning_rate": 0.00011932497251333582, + "loss": 2.0957, + "step": 829 + }, + { + "epoch": 0.1052097857776651, + "grad_norm": 0.890625, + "learning_rate": 0.00011932154810991255, + "loss": 2.2185, + "step": 830 + }, + { + "epoch": 0.10533654455571048, + "grad_norm": 0.88671875, + "learning_rate": 0.0001193181150918668, + "loss": 2.3615, + "step": 831 + }, + { + "epoch": 0.10546330333375586, + "grad_norm": 1.03125, + "learning_rate": 0.0001193146734596971, + "loss": 2.3545, + "step": 832 + }, + { + "epoch": 0.10559006211180125, + "grad_norm": 1.0625, + "learning_rate": 0.00011931122321390325, + "loss": 1.8813, + "step": 833 + }, + { + "epoch": 0.10571682088984662, + "grad_norm": 0.94140625, + "learning_rate": 0.0001193077643549863, + "loss": 2.3247, + "step": 834 + }, + { + "epoch": 0.105843579667892, + "grad_norm": 0.94921875, + "learning_rate": 0.00011930429688344852, + "loss": 1.9465, + "step": 835 + }, + { + "epoch": 0.10597033844593738, + "grad_norm": 0.89453125, + "learning_rate": 0.00011930082079979346, + "loss": 1.6975, + "step": 836 + }, + { + "epoch": 0.10609709722398276, + "grad_norm": 1.21875, + "learning_rate": 0.00011929733610452594, + "loss": 2.9427, + "step": 837 + }, + { + "epoch": 0.10622385600202815, + "grad_norm": 0.921875, + "learning_rate": 0.00011929384279815197, + "loss": 2.4923, + "step": 838 + }, + { + "epoch": 0.10635061478007352, + "grad_norm": 1.0078125, + "learning_rate": 0.00011929034088117887, + "loss": 2.6193, + "step": 839 + }, + { + "epoch": 0.1064773735581189, + "grad_norm": 1.3125, + "learning_rate": 0.00011928683035411516, + "loss": 2.2488, + "step": 840 + }, + { + "epoch": 0.10660413233616428, + "grad_norm": 0.92578125, + "learning_rate": 0.00011928331121747065, + "loss": 2.3346, + "step": 841 + }, + { + "epoch": 0.10673089111420966, + "grad_norm": 1.0390625, + "learning_rate": 0.00011927978347175639, + "loss": 1.835, + "step": 842 + }, + { + "epoch": 0.10685764989225503, + "grad_norm": 0.9609375, + "learning_rate": 0.00011927624711748467, + "loss": 1.8929, + "step": 843 + }, + { + "epoch": 0.10698440867030042, + "grad_norm": 1.0546875, + "learning_rate": 0.00011927270215516903, + "loss": 2.0617, + "step": 844 + }, + { + "epoch": 0.1071111674483458, + "grad_norm": 0.9609375, + "learning_rate": 0.00011926914858532429, + "loss": 2.0454, + "step": 845 + }, + { + "epoch": 0.10723792622639118, + "grad_norm": 0.953125, + "learning_rate": 0.00011926558640846647, + "loss": 2.1259, + "step": 846 + }, + { + "epoch": 0.10736468500443656, + "grad_norm": 2.328125, + "learning_rate": 0.00011926201562511287, + "loss": 2.4442, + "step": 847 + }, + { + "epoch": 0.10749144378248193, + "grad_norm": 0.9140625, + "learning_rate": 0.00011925843623578205, + "loss": 1.5956, + "step": 848 + }, + { + "epoch": 0.10761820256052732, + "grad_norm": 0.98046875, + "learning_rate": 0.00011925484824099378, + "loss": 2.1079, + "step": 849 + }, + { + "epoch": 0.1077449613385727, + "grad_norm": 0.89453125, + "learning_rate": 0.00011925125164126916, + "loss": 2.1141, + "step": 850 + }, + { + "epoch": 0.10787172011661808, + "grad_norm": 0.98046875, + "learning_rate": 0.0001192476464371304, + "loss": 2.0957, + "step": 851 + }, + { + "epoch": 0.10799847889466345, + "grad_norm": 0.88671875, + "learning_rate": 0.00011924403262910113, + "loss": 2.2518, + "step": 852 + }, + { + "epoch": 0.10812523767270883, + "grad_norm": 0.92578125, + "learning_rate": 0.00011924041021770611, + "loss": 2.2177, + "step": 853 + }, + { + "epoch": 0.10825199645075421, + "grad_norm": 1.046875, + "learning_rate": 0.00011923677920347135, + "loss": 2.1927, + "step": 854 + }, + { + "epoch": 0.1083787552287996, + "grad_norm": 1.03125, + "learning_rate": 0.00011923313958692419, + "loss": 2.0256, + "step": 855 + }, + { + "epoch": 0.10850551400684498, + "grad_norm": 0.91015625, + "learning_rate": 0.00011922949136859316, + "loss": 2.3203, + "step": 856 + }, + { + "epoch": 0.10863227278489035, + "grad_norm": 0.8984375, + "learning_rate": 0.00011922583454900804, + "loss": 2.2034, + "step": 857 + }, + { + "epoch": 0.10875903156293573, + "grad_norm": 0.83984375, + "learning_rate": 0.00011922216912869987, + "loss": 2.2134, + "step": 858 + }, + { + "epoch": 0.10888579034098111, + "grad_norm": 0.88671875, + "learning_rate": 0.00011921849510820093, + "loss": 2.4265, + "step": 859 + }, + { + "epoch": 0.1090125491190265, + "grad_norm": 1.5, + "learning_rate": 0.00011921481248804481, + "loss": 1.8787, + "step": 860 + }, + { + "epoch": 0.10913930789707187, + "grad_norm": 1.03125, + "learning_rate": 0.00011921112126876625, + "loss": 1.6549, + "step": 861 + }, + { + "epoch": 0.10926606667511725, + "grad_norm": 0.93359375, + "learning_rate": 0.00011920742145090128, + "loss": 2.3385, + "step": 862 + }, + { + "epoch": 0.10939282545316263, + "grad_norm": 0.9296875, + "learning_rate": 0.0001192037130349872, + "loss": 2.4479, + "step": 863 + }, + { + "epoch": 0.10951958423120801, + "grad_norm": 0.99609375, + "learning_rate": 0.00011919999602156256, + "loss": 2.0568, + "step": 864 + }, + { + "epoch": 0.1096463430092534, + "grad_norm": 0.875, + "learning_rate": 0.00011919627041116709, + "loss": 2.3132, + "step": 865 + }, + { + "epoch": 0.10977310178729877, + "grad_norm": 0.91015625, + "learning_rate": 0.0001191925362043419, + "loss": 2.0651, + "step": 866 + }, + { + "epoch": 0.10989986056534415, + "grad_norm": 0.83984375, + "learning_rate": 0.00011918879340162918, + "loss": 1.8188, + "step": 867 + }, + { + "epoch": 0.11002661934338953, + "grad_norm": 0.90625, + "learning_rate": 0.00011918504200357251, + "loss": 1.7025, + "step": 868 + }, + { + "epoch": 0.11015337812143491, + "grad_norm": 0.93359375, + "learning_rate": 0.00011918128201071667, + "loss": 1.9744, + "step": 869 + }, + { + "epoch": 0.11028013689948028, + "grad_norm": 1.1015625, + "learning_rate": 0.00011917751342360765, + "loss": 1.9545, + "step": 870 + }, + { + "epoch": 0.11040689567752567, + "grad_norm": 0.9375, + "learning_rate": 0.00011917373624279276, + "loss": 2.4713, + "step": 871 + }, + { + "epoch": 0.11053365445557105, + "grad_norm": 0.8984375, + "learning_rate": 0.00011916995046882045, + "loss": 2.1163, + "step": 872 + }, + { + "epoch": 0.11066041323361643, + "grad_norm": 0.94140625, + "learning_rate": 0.00011916615610224058, + "loss": 2.5615, + "step": 873 + }, + { + "epoch": 0.11078717201166181, + "grad_norm": 0.96875, + "learning_rate": 0.00011916235314360408, + "loss": 1.7737, + "step": 874 + }, + { + "epoch": 0.11091393078970718, + "grad_norm": 0.88671875, + "learning_rate": 0.00011915854159346326, + "loss": 2.237, + "step": 875 + }, + { + "epoch": 0.11104068956775257, + "grad_norm": 0.9453125, + "learning_rate": 0.00011915472145237163, + "loss": 1.9866, + "step": 876 + }, + { + "epoch": 0.11116744834579795, + "grad_norm": 0.81640625, + "learning_rate": 0.00011915089272088392, + "loss": 2.194, + "step": 877 + }, + { + "epoch": 0.11129420712384333, + "grad_norm": 0.82421875, + "learning_rate": 0.00011914705539955616, + "loss": 2.4161, + "step": 878 + }, + { + "epoch": 0.1114209659018887, + "grad_norm": 0.98046875, + "learning_rate": 0.00011914320948894561, + "loss": 2.0445, + "step": 879 + }, + { + "epoch": 0.11154772467993408, + "grad_norm": 0.87890625, + "learning_rate": 0.00011913935498961073, + "loss": 2.358, + "step": 880 + }, + { + "epoch": 0.11167448345797946, + "grad_norm": 0.92578125, + "learning_rate": 0.00011913549190211129, + "loss": 1.9631, + "step": 881 + }, + { + "epoch": 0.11180124223602485, + "grad_norm": 1.015625, + "learning_rate": 0.0001191316202270083, + "loss": 2.1943, + "step": 882 + }, + { + "epoch": 0.11192800101407023, + "grad_norm": 0.953125, + "learning_rate": 0.00011912773996486399, + "loss": 1.7102, + "step": 883 + }, + { + "epoch": 0.1120547597921156, + "grad_norm": 1.09375, + "learning_rate": 0.00011912385111624182, + "loss": 2.1962, + "step": 884 + }, + { + "epoch": 0.11218151857016098, + "grad_norm": 0.9375, + "learning_rate": 0.00011911995368170656, + "loss": 1.858, + "step": 885 + }, + { + "epoch": 0.11230827734820636, + "grad_norm": 0.9375, + "learning_rate": 0.00011911604766182418, + "loss": 1.8814, + "step": 886 + }, + { + "epoch": 0.11243503612625175, + "grad_norm": 0.921875, + "learning_rate": 0.00011911213305716191, + "loss": 2.0915, + "step": 887 + }, + { + "epoch": 0.11256179490429712, + "grad_norm": 0.98828125, + "learning_rate": 0.00011910820986828823, + "loss": 2.1956, + "step": 888 + }, + { + "epoch": 0.1126885536823425, + "grad_norm": 0.96875, + "learning_rate": 0.00011910427809577285, + "loss": 1.9686, + "step": 889 + }, + { + "epoch": 0.11281531246038788, + "grad_norm": 0.890625, + "learning_rate": 0.00011910033774018675, + "loss": 2.0356, + "step": 890 + }, + { + "epoch": 0.11294207123843326, + "grad_norm": 1.03125, + "learning_rate": 0.00011909638880210214, + "loss": 2.3523, + "step": 891 + }, + { + "epoch": 0.11306883001647865, + "grad_norm": 1.125, + "learning_rate": 0.0001190924312820925, + "loss": 2.0312, + "step": 892 + }, + { + "epoch": 0.11319558879452402, + "grad_norm": 1.046875, + "learning_rate": 0.00011908846518073248, + "loss": 1.9166, + "step": 893 + }, + { + "epoch": 0.1133223475725694, + "grad_norm": 0.84765625, + "learning_rate": 0.0001190844904985981, + "loss": 2.4145, + "step": 894 + }, + { + "epoch": 0.11344910635061478, + "grad_norm": 0.90625, + "learning_rate": 0.00011908050723626653, + "loss": 2.2743, + "step": 895 + }, + { + "epoch": 0.11357586512866016, + "grad_norm": 0.90625, + "learning_rate": 0.00011907651539431621, + "loss": 1.7663, + "step": 896 + }, + { + "epoch": 0.11370262390670553, + "grad_norm": 0.9921875, + "learning_rate": 0.00011907251497332685, + "loss": 2.7327, + "step": 897 + }, + { + "epoch": 0.11382938268475092, + "grad_norm": 0.7734375, + "learning_rate": 0.00011906850597387938, + "loss": 2.132, + "step": 898 + }, + { + "epoch": 0.1139561414627963, + "grad_norm": 0.8984375, + "learning_rate": 0.00011906448839655597, + "loss": 1.7172, + "step": 899 + }, + { + "epoch": 0.11408290024084168, + "grad_norm": 1.0, + "learning_rate": 0.00011906046224194008, + "loss": 1.7345, + "step": 900 + }, + { + "epoch": 0.11420965901888706, + "grad_norm": 0.921875, + "learning_rate": 0.00011905642751061635, + "loss": 1.8894, + "step": 901 + }, + { + "epoch": 0.11433641779693243, + "grad_norm": 0.890625, + "learning_rate": 0.00011905238420317071, + "loss": 2.0753, + "step": 902 + }, + { + "epoch": 0.11446317657497782, + "grad_norm": 0.91796875, + "learning_rate": 0.00011904833232019036, + "loss": 1.8567, + "step": 903 + }, + { + "epoch": 0.1145899353530232, + "grad_norm": 0.8984375, + "learning_rate": 0.00011904427186226365, + "loss": 1.9942, + "step": 904 + }, + { + "epoch": 0.11471669413106858, + "grad_norm": 0.94921875, + "learning_rate": 0.00011904020282998028, + "loss": 2.0263, + "step": 905 + }, + { + "epoch": 0.11484345290911395, + "grad_norm": 0.8359375, + "learning_rate": 0.00011903612522393114, + "loss": 2.0286, + "step": 906 + }, + { + "epoch": 0.11497021168715933, + "grad_norm": 0.8671875, + "learning_rate": 0.00011903203904470837, + "loss": 2.0145, + "step": 907 + }, + { + "epoch": 0.11509697046520471, + "grad_norm": 0.890625, + "learning_rate": 0.00011902794429290535, + "loss": 1.6808, + "step": 908 + }, + { + "epoch": 0.1152237292432501, + "grad_norm": 1.125, + "learning_rate": 0.00011902384096911677, + "loss": 2.4509, + "step": 909 + }, + { + "epoch": 0.11535048802129548, + "grad_norm": 0.95703125, + "learning_rate": 0.00011901972907393845, + "loss": 2.5562, + "step": 910 + }, + { + "epoch": 0.11547724679934085, + "grad_norm": 0.94140625, + "learning_rate": 0.00011901560860796754, + "loss": 2.1583, + "step": 911 + }, + { + "epoch": 0.11560400557738623, + "grad_norm": 1.046875, + "learning_rate": 0.00011901147957180243, + "loss": 2.1251, + "step": 912 + }, + { + "epoch": 0.11573076435543161, + "grad_norm": 0.953125, + "learning_rate": 0.00011900734196604268, + "loss": 1.7442, + "step": 913 + }, + { + "epoch": 0.115857523133477, + "grad_norm": 0.8359375, + "learning_rate": 0.00011900319579128921, + "loss": 2.1894, + "step": 914 + }, + { + "epoch": 0.11598428191152237, + "grad_norm": 1.0, + "learning_rate": 0.0001189990410481441, + "loss": 1.6838, + "step": 915 + }, + { + "epoch": 0.11611104068956775, + "grad_norm": 0.96875, + "learning_rate": 0.00011899487773721069, + "loss": 1.7357, + "step": 916 + }, + { + "epoch": 0.11623779946761313, + "grad_norm": 0.9921875, + "learning_rate": 0.00011899070585909357, + "loss": 2.0866, + "step": 917 + }, + { + "epoch": 0.11636455824565851, + "grad_norm": 1.046875, + "learning_rate": 0.0001189865254143986, + "loss": 2.235, + "step": 918 + }, + { + "epoch": 0.1164913170237039, + "grad_norm": 0.9453125, + "learning_rate": 0.00011898233640373284, + "loss": 2.0509, + "step": 919 + }, + { + "epoch": 0.11661807580174927, + "grad_norm": 0.96484375, + "learning_rate": 0.0001189781388277046, + "loss": 2.0683, + "step": 920 + }, + { + "epoch": 0.11674483457979465, + "grad_norm": 0.859375, + "learning_rate": 0.00011897393268692349, + "loss": 1.9045, + "step": 921 + }, + { + "epoch": 0.11687159335784003, + "grad_norm": 0.90625, + "learning_rate": 0.0001189697179820003, + "loss": 2.1663, + "step": 922 + }, + { + "epoch": 0.11699835213588541, + "grad_norm": 0.96875, + "learning_rate": 0.00011896549471354708, + "loss": 1.9907, + "step": 923 + }, + { + "epoch": 0.11712511091393078, + "grad_norm": 1.125, + "learning_rate": 0.00011896126288217714, + "loss": 2.1921, + "step": 924 + }, + { + "epoch": 0.11725186969197617, + "grad_norm": 1.03125, + "learning_rate": 0.00011895702248850502, + "loss": 2.297, + "step": 925 + }, + { + "epoch": 0.11737862847002155, + "grad_norm": 0.9921875, + "learning_rate": 0.0001189527735331465, + "loss": 2.1541, + "step": 926 + }, + { + "epoch": 0.11750538724806693, + "grad_norm": 0.9140625, + "learning_rate": 0.00011894851601671861, + "loss": 2.194, + "step": 927 + }, + { + "epoch": 0.11763214602611231, + "grad_norm": 0.88671875, + "learning_rate": 0.00011894424993983962, + "loss": 1.8901, + "step": 928 + }, + { + "epoch": 0.11775890480415768, + "grad_norm": 0.9609375, + "learning_rate": 0.00011893997530312906, + "loss": 1.9794, + "step": 929 + }, + { + "epoch": 0.11788566358220307, + "grad_norm": 1.375, + "learning_rate": 0.00011893569210720768, + "loss": 3.0634, + "step": 930 + }, + { + "epoch": 0.11801242236024845, + "grad_norm": 1.21875, + "learning_rate": 0.00011893140035269749, + "loss": 2.7166, + "step": 931 + }, + { + "epoch": 0.11813918113829383, + "grad_norm": 1.03125, + "learning_rate": 0.00011892710004022173, + "loss": 2.2892, + "step": 932 + }, + { + "epoch": 0.11826593991633921, + "grad_norm": 1.109375, + "learning_rate": 0.00011892279117040488, + "loss": 2.1269, + "step": 933 + }, + { + "epoch": 0.11839269869438458, + "grad_norm": 0.99609375, + "learning_rate": 0.00011891847374387267, + "loss": 2.269, + "step": 934 + }, + { + "epoch": 0.11851945747242996, + "grad_norm": 0.76171875, + "learning_rate": 0.0001189141477612521, + "loss": 1.9353, + "step": 935 + }, + { + "epoch": 0.11864621625047535, + "grad_norm": 1.0, + "learning_rate": 0.00011890981322317133, + "loss": 1.5698, + "step": 936 + }, + { + "epoch": 0.11877297502852073, + "grad_norm": 0.88671875, + "learning_rate": 0.00011890547013025986, + "loss": 2.4581, + "step": 937 + }, + { + "epoch": 0.1188997338065661, + "grad_norm": 1.09375, + "learning_rate": 0.0001189011184831484, + "loss": 2.5244, + "step": 938 + }, + { + "epoch": 0.11902649258461148, + "grad_norm": 1.0234375, + "learning_rate": 0.00011889675828246886, + "loss": 2.262, + "step": 939 + }, + { + "epoch": 0.11915325136265686, + "grad_norm": 0.8984375, + "learning_rate": 0.00011889238952885444, + "loss": 2.3245, + "step": 940 + }, + { + "epoch": 0.11928001014070225, + "grad_norm": 1.1796875, + "learning_rate": 0.00011888801222293957, + "loss": 2.4705, + "step": 941 + }, + { + "epoch": 0.11940676891874763, + "grad_norm": 0.921875, + "learning_rate": 0.00011888362636535991, + "loss": 1.9687, + "step": 942 + }, + { + "epoch": 0.119533527696793, + "grad_norm": 0.98828125, + "learning_rate": 0.00011887923195675237, + "loss": 2.3092, + "step": 943 + }, + { + "epoch": 0.11966028647483838, + "grad_norm": 0.94140625, + "learning_rate": 0.00011887482899775511, + "loss": 1.6603, + "step": 944 + }, + { + "epoch": 0.11978704525288376, + "grad_norm": 0.984375, + "learning_rate": 0.00011887041748900754, + "loss": 1.8861, + "step": 945 + }, + { + "epoch": 0.11991380403092915, + "grad_norm": 0.8203125, + "learning_rate": 0.00011886599743115027, + "loss": 2.3836, + "step": 946 + }, + { + "epoch": 0.12004056280897452, + "grad_norm": 0.99609375, + "learning_rate": 0.00011886156882482515, + "loss": 2.3982, + "step": 947 + }, + { + "epoch": 0.1201673215870199, + "grad_norm": 1.03125, + "learning_rate": 0.00011885713167067536, + "loss": 1.7543, + "step": 948 + }, + { + "epoch": 0.12029408036506528, + "grad_norm": 0.96484375, + "learning_rate": 0.00011885268596934522, + "loss": 2.0612, + "step": 949 + }, + { + "epoch": 0.12042083914311066, + "grad_norm": 0.94921875, + "learning_rate": 0.00011884823172148033, + "loss": 2.6011, + "step": 950 + }, + { + "epoch": 0.12054759792115605, + "grad_norm": 0.97265625, + "learning_rate": 0.00011884376892772756, + "loss": 2.3846, + "step": 951 + }, + { + "epoch": 0.12067435669920142, + "grad_norm": 0.87890625, + "learning_rate": 0.00011883929758873495, + "loss": 2.112, + "step": 952 + }, + { + "epoch": 0.1208011154772468, + "grad_norm": 0.859375, + "learning_rate": 0.00011883481770515186, + "loss": 2.3332, + "step": 953 + }, + { + "epoch": 0.12092787425529218, + "grad_norm": 0.88671875, + "learning_rate": 0.00011883032927762887, + "loss": 2.358, + "step": 954 + }, + { + "epoch": 0.12105463303333756, + "grad_norm": 1.1171875, + "learning_rate": 0.00011882583230681773, + "loss": 2.8022, + "step": 955 + }, + { + "epoch": 0.12118139181138293, + "grad_norm": 0.83203125, + "learning_rate": 0.00011882132679337154, + "loss": 2.041, + "step": 956 + }, + { + "epoch": 0.12130815058942832, + "grad_norm": 1.015625, + "learning_rate": 0.00011881681273794454, + "loss": 2.3209, + "step": 957 + }, + { + "epoch": 0.1214349093674737, + "grad_norm": 0.85546875, + "learning_rate": 0.0001188122901411923, + "loss": 2.1664, + "step": 958 + }, + { + "epoch": 0.12156166814551908, + "grad_norm": 0.94921875, + "learning_rate": 0.00011880775900377155, + "loss": 2.1146, + "step": 959 + }, + { + "epoch": 0.12168842692356446, + "grad_norm": 1.0546875, + "learning_rate": 0.00011880321932634031, + "loss": 2.1281, + "step": 960 + }, + { + "epoch": 0.12181518570160983, + "grad_norm": 0.8671875, + "learning_rate": 0.00011879867110955784, + "loss": 2.0122, + "step": 961 + }, + { + "epoch": 0.12194194447965521, + "grad_norm": 0.90625, + "learning_rate": 0.00011879411435408465, + "loss": 2.1311, + "step": 962 + }, + { + "epoch": 0.1220687032577006, + "grad_norm": 1.0703125, + "learning_rate": 0.00011878954906058239, + "loss": 2.6127, + "step": 963 + }, + { + "epoch": 0.12219546203574598, + "grad_norm": 1.109375, + "learning_rate": 0.0001187849752297141, + "loss": 2.0978, + "step": 964 + }, + { + "epoch": 0.12232222081379135, + "grad_norm": 0.9609375, + "learning_rate": 0.00011878039286214396, + "loss": 1.7482, + "step": 965 + }, + { + "epoch": 0.12244897959183673, + "grad_norm": 0.8984375, + "learning_rate": 0.00011877580195853745, + "loss": 1.8181, + "step": 966 + }, + { + "epoch": 0.12257573836988211, + "grad_norm": 0.86328125, + "learning_rate": 0.00011877120251956118, + "loss": 2.082, + "step": 967 + }, + { + "epoch": 0.1227024971479275, + "grad_norm": 0.890625, + "learning_rate": 0.00011876659454588316, + "loss": 2.0985, + "step": 968 + }, + { + "epoch": 0.12282925592597288, + "grad_norm": 1.0078125, + "learning_rate": 0.0001187619780381725, + "loss": 2.2291, + "step": 969 + }, + { + "epoch": 0.12295601470401825, + "grad_norm": 1.140625, + "learning_rate": 0.00011875735299709962, + "loss": 2.54, + "step": 970 + }, + { + "epoch": 0.12308277348206363, + "grad_norm": 0.91015625, + "learning_rate": 0.00011875271942333619, + "loss": 2.2419, + "step": 971 + }, + { + "epoch": 0.12320953226010901, + "grad_norm": 1.0234375, + "learning_rate": 0.00011874807731755508, + "loss": 2.433, + "step": 972 + }, + { + "epoch": 0.1233362910381544, + "grad_norm": 1.0625, + "learning_rate": 0.00011874342668043039, + "loss": 2.8132, + "step": 973 + }, + { + "epoch": 0.12346304981619977, + "grad_norm": 0.86328125, + "learning_rate": 0.0001187387675126375, + "loss": 1.6897, + "step": 974 + }, + { + "epoch": 0.12358980859424515, + "grad_norm": 0.8515625, + "learning_rate": 0.00011873409981485302, + "loss": 2.1097, + "step": 975 + }, + { + "epoch": 0.12371656737229053, + "grad_norm": 0.9375, + "learning_rate": 0.00011872942358775475, + "loss": 2.0381, + "step": 976 + }, + { + "epoch": 0.12384332615033591, + "grad_norm": 0.96484375, + "learning_rate": 0.00011872473883202182, + "loss": 1.7534, + "step": 977 + }, + { + "epoch": 0.1239700849283813, + "grad_norm": 0.91796875, + "learning_rate": 0.0001187200455483345, + "loss": 2.208, + "step": 978 + }, + { + "epoch": 0.12409684370642667, + "grad_norm": 0.99609375, + "learning_rate": 0.00011871534373737438, + "loss": 1.5478, + "step": 979 + }, + { + "epoch": 0.12422360248447205, + "grad_norm": 0.92578125, + "learning_rate": 0.00011871063339982424, + "loss": 2.1046, + "step": 980 + }, + { + "epoch": 0.12435036126251743, + "grad_norm": 1.046875, + "learning_rate": 0.0001187059145363681, + "loss": 2.331, + "step": 981 + }, + { + "epoch": 0.12447712004056281, + "grad_norm": 0.984375, + "learning_rate": 0.00011870118714769123, + "loss": 2.9785, + "step": 982 + }, + { + "epoch": 0.12460387881860818, + "grad_norm": 1.1796875, + "learning_rate": 0.00011869645123448015, + "loss": 2.5089, + "step": 983 + }, + { + "epoch": 0.12473063759665357, + "grad_norm": 0.84375, + "learning_rate": 0.00011869170679742261, + "loss": 2.3834, + "step": 984 + }, + { + "epoch": 0.12485739637469895, + "grad_norm": 0.97265625, + "learning_rate": 0.00011868695383720758, + "loss": 2.3377, + "step": 985 + }, + { + "epoch": 0.12498415515274433, + "grad_norm": 0.8359375, + "learning_rate": 0.00011868219235452527, + "loss": 1.8265, + "step": 986 + }, + { + "epoch": 0.1251109139307897, + "grad_norm": 1.0234375, + "learning_rate": 0.00011867742235006717, + "loss": 2.054, + "step": 987 + }, + { + "epoch": 0.1252376727088351, + "grad_norm": 0.98046875, + "learning_rate": 0.00011867264382452595, + "loss": 1.718, + "step": 988 + }, + { + "epoch": 0.12536443148688048, + "grad_norm": 0.9140625, + "learning_rate": 0.00011866785677859555, + "loss": 1.9682, + "step": 989 + }, + { + "epoch": 0.12549119026492583, + "grad_norm": 0.921875, + "learning_rate": 0.00011866306121297115, + "loss": 1.7012, + "step": 990 + }, + { + "epoch": 0.12561794904297122, + "grad_norm": 1.046875, + "learning_rate": 0.00011865825712834912, + "loss": 2.1602, + "step": 991 + }, + { + "epoch": 0.1257447078210166, + "grad_norm": 0.953125, + "learning_rate": 0.00011865344452542716, + "loss": 1.7084, + "step": 992 + }, + { + "epoch": 0.12587146659906198, + "grad_norm": 0.7890625, + "learning_rate": 0.00011864862340490413, + "loss": 2.1667, + "step": 993 + }, + { + "epoch": 0.12599822537710736, + "grad_norm": 1.0625, + "learning_rate": 0.00011864379376748013, + "loss": 2.05, + "step": 994 + }, + { + "epoch": 0.12612498415515275, + "grad_norm": 0.85546875, + "learning_rate": 0.00011863895561385653, + "loss": 2.0009, + "step": 995 + }, + { + "epoch": 0.12625174293319813, + "grad_norm": 0.91015625, + "learning_rate": 0.00011863410894473594, + "loss": 1.8643, + "step": 996 + }, + { + "epoch": 0.1263785017112435, + "grad_norm": 0.99609375, + "learning_rate": 0.00011862925376082218, + "loss": 1.9145, + "step": 997 + }, + { + "epoch": 0.1265052604892889, + "grad_norm": 0.94140625, + "learning_rate": 0.00011862439006282028, + "loss": 1.9072, + "step": 998 + }, + { + "epoch": 0.12663201926733425, + "grad_norm": 0.8984375, + "learning_rate": 0.00011861951785143657, + "loss": 2.0977, + "step": 999 + }, + { + "epoch": 0.12675877804537963, + "grad_norm": 1.1328125, + "learning_rate": 0.00011861463712737859, + "loss": 2.2561, + "step": 1000 + }, + { + "epoch": 0.12688553682342502, + "grad_norm": 1.0, + "learning_rate": 0.00011860974789135512, + "loss": 2.7327, + "step": 1001 + }, + { + "epoch": 0.1270122956014704, + "grad_norm": 0.87890625, + "learning_rate": 0.00011860485014407617, + "loss": 2.2819, + "step": 1002 + }, + { + "epoch": 0.12713905437951578, + "grad_norm": 0.9296875, + "learning_rate": 0.00011859994388625296, + "loss": 2.5569, + "step": 1003 + }, + { + "epoch": 0.12726581315756116, + "grad_norm": 1.015625, + "learning_rate": 0.00011859502911859801, + "loss": 2.0616, + "step": 1004 + }, + { + "epoch": 0.12739257193560655, + "grad_norm": 0.9609375, + "learning_rate": 0.000118590105841825, + "loss": 1.8455, + "step": 1005 + }, + { + "epoch": 0.12751933071365193, + "grad_norm": 0.9609375, + "learning_rate": 0.00011858517405664891, + "loss": 2.0145, + "step": 1006 + }, + { + "epoch": 0.1276460894916973, + "grad_norm": 0.94921875, + "learning_rate": 0.00011858023376378592, + "loss": 2.0677, + "step": 1007 + }, + { + "epoch": 0.12777284826974267, + "grad_norm": 0.8359375, + "learning_rate": 0.00011857528496395348, + "loss": 2.1184, + "step": 1008 + }, + { + "epoch": 0.12789960704778805, + "grad_norm": 0.98828125, + "learning_rate": 0.00011857032765787021, + "loss": 2.2325, + "step": 1009 + }, + { + "epoch": 0.12802636582583343, + "grad_norm": 1.1796875, + "learning_rate": 0.00011856536184625603, + "loss": 2.4459, + "step": 1010 + }, + { + "epoch": 0.12815312460387882, + "grad_norm": 0.86328125, + "learning_rate": 0.00011856038752983207, + "loss": 1.7825, + "step": 1011 + }, + { + "epoch": 0.1282798833819242, + "grad_norm": 0.8515625, + "learning_rate": 0.00011855540470932067, + "loss": 2.0643, + "step": 1012 + }, + { + "epoch": 0.12840664215996958, + "grad_norm": 1.0078125, + "learning_rate": 0.00011855041338544546, + "loss": 1.8856, + "step": 1013 + }, + { + "epoch": 0.12853340093801496, + "grad_norm": 0.984375, + "learning_rate": 0.00011854541355893127, + "loss": 1.6081, + "step": 1014 + }, + { + "epoch": 0.12866015971606035, + "grad_norm": 0.94140625, + "learning_rate": 0.00011854040523050417, + "loss": 1.9381, + "step": 1015 + }, + { + "epoch": 0.12878691849410573, + "grad_norm": 0.83203125, + "learning_rate": 0.00011853538840089145, + "loss": 2.2497, + "step": 1016 + }, + { + "epoch": 0.12891367727215108, + "grad_norm": 0.87890625, + "learning_rate": 0.00011853036307082166, + "loss": 2.2958, + "step": 1017 + }, + { + "epoch": 0.12904043605019647, + "grad_norm": 0.91015625, + "learning_rate": 0.00011852532924102459, + "loss": 1.8932, + "step": 1018 + }, + { + "epoch": 0.12916719482824185, + "grad_norm": 0.94921875, + "learning_rate": 0.00011852028691223123, + "loss": 2.3335, + "step": 1019 + }, + { + "epoch": 0.12929395360628723, + "grad_norm": 1.0546875, + "learning_rate": 0.00011851523608517384, + "loss": 1.9809, + "step": 1020 + }, + { + "epoch": 0.12942071238433261, + "grad_norm": 0.9453125, + "learning_rate": 0.00011851017676058585, + "loss": 1.6886, + "step": 1021 + }, + { + "epoch": 0.129547471162378, + "grad_norm": 0.93359375, + "learning_rate": 0.00011850510893920202, + "loss": 2.5225, + "step": 1022 + }, + { + "epoch": 0.12967422994042338, + "grad_norm": 0.99609375, + "learning_rate": 0.00011850003262175829, + "loss": 2.3927, + "step": 1023 + }, + { + "epoch": 0.12980098871846876, + "grad_norm": 0.95703125, + "learning_rate": 0.00011849494780899181, + "loss": 2.3379, + "step": 1024 + }, + { + "epoch": 0.12992774749651415, + "grad_norm": 1.015625, + "learning_rate": 0.000118489854501641, + "loss": 2.1308, + "step": 1025 + }, + { + "epoch": 0.1300545062745595, + "grad_norm": 0.9296875, + "learning_rate": 0.00011848475270044554, + "loss": 2.0078, + "step": 1026 + }, + { + "epoch": 0.13018126505260488, + "grad_norm": 1.0703125, + "learning_rate": 0.00011847964240614627, + "loss": 2.0975, + "step": 1027 + }, + { + "epoch": 0.13030802383065027, + "grad_norm": 0.87890625, + "learning_rate": 0.00011847452361948531, + "loss": 2.3797, + "step": 1028 + }, + { + "epoch": 0.13043478260869565, + "grad_norm": 0.8984375, + "learning_rate": 0.00011846939634120604, + "loss": 2.2901, + "step": 1029 + }, + { + "epoch": 0.13056154138674103, + "grad_norm": 0.95703125, + "learning_rate": 0.00011846426057205297, + "loss": 2.0321, + "step": 1030 + }, + { + "epoch": 0.13068830016478641, + "grad_norm": 0.96484375, + "learning_rate": 0.00011845911631277197, + "loss": 2.3257, + "step": 1031 + }, + { + "epoch": 0.1308150589428318, + "grad_norm": 0.859375, + "learning_rate": 0.00011845396356411007, + "loss": 2.2145, + "step": 1032 + }, + { + "epoch": 0.13094181772087718, + "grad_norm": 0.96875, + "learning_rate": 0.00011844880232681553, + "loss": 1.8059, + "step": 1033 + }, + { + "epoch": 0.13106857649892256, + "grad_norm": 0.8515625, + "learning_rate": 0.00011844363260163788, + "loss": 2.2591, + "step": 1034 + }, + { + "epoch": 0.13119533527696792, + "grad_norm": 0.984375, + "learning_rate": 0.00011843845438932787, + "loss": 1.7012, + "step": 1035 + }, + { + "epoch": 0.1313220940550133, + "grad_norm": 0.89453125, + "learning_rate": 0.00011843326769063743, + "loss": 2.4046, + "step": 1036 + }, + { + "epoch": 0.13144885283305868, + "grad_norm": 0.90234375, + "learning_rate": 0.00011842807250631985, + "loss": 1.6827, + "step": 1037 + }, + { + "epoch": 0.13157561161110407, + "grad_norm": 1.046875, + "learning_rate": 0.00011842286883712948, + "loss": 1.6645, + "step": 1038 + }, + { + "epoch": 0.13170237038914945, + "grad_norm": 0.98046875, + "learning_rate": 0.00011841765668382204, + "loss": 2.2063, + "step": 1039 + }, + { + "epoch": 0.13182912916719483, + "grad_norm": 0.85546875, + "learning_rate": 0.00011841243604715444, + "loss": 2.4301, + "step": 1040 + }, + { + "epoch": 0.1319558879452402, + "grad_norm": 0.890625, + "learning_rate": 0.00011840720692788479, + "loss": 1.7588, + "step": 1041 + }, + { + "epoch": 0.1320826467232856, + "grad_norm": 0.9140625, + "learning_rate": 0.00011840196932677247, + "loss": 2.3297, + "step": 1042 + }, + { + "epoch": 0.13220940550133098, + "grad_norm": 0.921875, + "learning_rate": 0.0001183967232445781, + "loss": 2.1185, + "step": 1043 + }, + { + "epoch": 0.13233616427937633, + "grad_norm": 0.9140625, + "learning_rate": 0.00011839146868206348, + "loss": 1.351, + "step": 1044 + }, + { + "epoch": 0.13246292305742172, + "grad_norm": 0.89453125, + "learning_rate": 0.00011838620563999168, + "loss": 1.7055, + "step": 1045 + }, + { + "epoch": 0.1325896818354671, + "grad_norm": 0.8671875, + "learning_rate": 0.00011838093411912701, + "loss": 1.7632, + "step": 1046 + }, + { + "epoch": 0.13271644061351248, + "grad_norm": 0.9453125, + "learning_rate": 0.00011837565412023498, + "loss": 2.445, + "step": 1047 + }, + { + "epoch": 0.13284319939155786, + "grad_norm": 1.015625, + "learning_rate": 0.00011837036564408236, + "loss": 1.59, + "step": 1048 + }, + { + "epoch": 0.13296995816960325, + "grad_norm": 1.015625, + "learning_rate": 0.0001183650686914371, + "loss": 2.0756, + "step": 1049 + }, + { + "epoch": 0.13309671694764863, + "grad_norm": 0.98046875, + "learning_rate": 0.00011835976326306847, + "loss": 2.0833, + "step": 1050 + }, + { + "epoch": 0.133223475725694, + "grad_norm": 0.9609375, + "learning_rate": 0.00011835444935974691, + "loss": 2.2792, + "step": 1051 + }, + { + "epoch": 0.1333502345037394, + "grad_norm": 1.0390625, + "learning_rate": 0.00011834912698224406, + "loss": 1.9627, + "step": 1052 + }, + { + "epoch": 0.13347699328178475, + "grad_norm": 1.0234375, + "learning_rate": 0.00011834379613133287, + "loss": 1.5645, + "step": 1053 + }, + { + "epoch": 0.13360375205983013, + "grad_norm": 1.015625, + "learning_rate": 0.00011833845680778748, + "loss": 1.8105, + "step": 1054 + }, + { + "epoch": 0.13373051083787552, + "grad_norm": 0.98828125, + "learning_rate": 0.00011833310901238326, + "loss": 2.5772, + "step": 1055 + }, + { + "epoch": 0.1338572696159209, + "grad_norm": 1.046875, + "learning_rate": 0.00011832775274589678, + "loss": 2.4272, + "step": 1056 + }, + { + "epoch": 0.13398402839396628, + "grad_norm": 0.96484375, + "learning_rate": 0.00011832238800910591, + "loss": 1.9244, + "step": 1057 + }, + { + "epoch": 0.13411078717201166, + "grad_norm": 0.92578125, + "learning_rate": 0.00011831701480278969, + "loss": 2.0246, + "step": 1058 + }, + { + "epoch": 0.13423754595005705, + "grad_norm": 1.09375, + "learning_rate": 0.00011831163312772844, + "loss": 2.3409, + "step": 1059 + }, + { + "epoch": 0.13436430472810243, + "grad_norm": 0.90234375, + "learning_rate": 0.00011830624298470365, + "loss": 2.0585, + "step": 1060 + }, + { + "epoch": 0.1344910635061478, + "grad_norm": 0.93359375, + "learning_rate": 0.00011830084437449809, + "loss": 2.0481, + "step": 1061 + }, + { + "epoch": 0.13461782228419317, + "grad_norm": 0.875, + "learning_rate": 0.00011829543729789573, + "loss": 2.2935, + "step": 1062 + }, + { + "epoch": 0.13474458106223855, + "grad_norm": 0.8671875, + "learning_rate": 0.00011829002175568179, + "loss": 2.4912, + "step": 1063 + }, + { + "epoch": 0.13487133984028393, + "grad_norm": 0.88671875, + "learning_rate": 0.00011828459774864272, + "loss": 2.2551, + "step": 1064 + }, + { + "epoch": 0.13499809861832932, + "grad_norm": 0.875, + "learning_rate": 0.00011827916527756617, + "loss": 1.6272, + "step": 1065 + }, + { + "epoch": 0.1351248573963747, + "grad_norm": 0.91015625, + "learning_rate": 0.00011827372434324102, + "loss": 2.2448, + "step": 1066 + }, + { + "epoch": 0.13525161617442008, + "grad_norm": 1.0390625, + "learning_rate": 0.00011826827494645745, + "loss": 1.815, + "step": 1067 + }, + { + "epoch": 0.13537837495246546, + "grad_norm": 0.8359375, + "learning_rate": 0.00011826281708800679, + "loss": 2.3024, + "step": 1068 + }, + { + "epoch": 0.13550513373051085, + "grad_norm": 0.98046875, + "learning_rate": 0.00011825735076868163, + "loss": 1.9046, + "step": 1069 + }, + { + "epoch": 0.13563189250855623, + "grad_norm": 0.98046875, + "learning_rate": 0.00011825187598927576, + "loss": 1.8616, + "step": 1070 + }, + { + "epoch": 0.13575865128660158, + "grad_norm": 1.015625, + "learning_rate": 0.00011824639275058424, + "loss": 2.5407, + "step": 1071 + }, + { + "epoch": 0.13588541006464697, + "grad_norm": 0.8671875, + "learning_rate": 0.00011824090105340336, + "loss": 1.4736, + "step": 1072 + }, + { + "epoch": 0.13601216884269235, + "grad_norm": 0.9375, + "learning_rate": 0.00011823540089853061, + "loss": 1.7291, + "step": 1073 + }, + { + "epoch": 0.13613892762073773, + "grad_norm": 0.8984375, + "learning_rate": 0.0001182298922867647, + "loss": 1.4393, + "step": 1074 + }, + { + "epoch": 0.13626568639878311, + "grad_norm": 0.921875, + "learning_rate": 0.00011822437521890559, + "loss": 1.6238, + "step": 1075 + }, + { + "epoch": 0.1363924451768285, + "grad_norm": 0.9921875, + "learning_rate": 0.00011821884969575446, + "loss": 2.4057, + "step": 1076 + }, + { + "epoch": 0.13651920395487388, + "grad_norm": 1.078125, + "learning_rate": 0.00011821331571811375, + "loss": 2.2121, + "step": 1077 + }, + { + "epoch": 0.13664596273291926, + "grad_norm": 1.0703125, + "learning_rate": 0.00011820777328678706, + "loss": 1.858, + "step": 1078 + }, + { + "epoch": 0.13677272151096465, + "grad_norm": 0.8828125, + "learning_rate": 0.0001182022224025793, + "loss": 2.3449, + "step": 1079 + }, + { + "epoch": 0.13689948028901, + "grad_norm": 1.0, + "learning_rate": 0.00011819666306629652, + "loss": 1.7973, + "step": 1080 + }, + { + "epoch": 0.13702623906705538, + "grad_norm": 0.96875, + "learning_rate": 0.00011819109527874608, + "loss": 2.4057, + "step": 1081 + }, + { + "epoch": 0.13715299784510077, + "grad_norm": 1.0078125, + "learning_rate": 0.00011818551904073652, + "loss": 1.5188, + "step": 1082 + }, + { + "epoch": 0.13727975662314615, + "grad_norm": 0.99609375, + "learning_rate": 0.00011817993435307758, + "loss": 2.4068, + "step": 1083 + }, + { + "epoch": 0.13740651540119153, + "grad_norm": 0.94140625, + "learning_rate": 0.00011817434121658032, + "loss": 1.9747, + "step": 1084 + }, + { + "epoch": 0.13753327417923691, + "grad_norm": 1.0390625, + "learning_rate": 0.00011816873963205692, + "loss": 1.7333, + "step": 1085 + }, + { + "epoch": 0.1376600329572823, + "grad_norm": 0.9375, + "learning_rate": 0.00011816312960032089, + "loss": 1.8735, + "step": 1086 + }, + { + "epoch": 0.13778679173532768, + "grad_norm": 0.8671875, + "learning_rate": 0.00011815751112218687, + "loss": 2.4663, + "step": 1087 + }, + { + "epoch": 0.13791355051337306, + "grad_norm": 1.078125, + "learning_rate": 0.00011815188419847078, + "loss": 1.8528, + "step": 1088 + }, + { + "epoch": 0.13804030929141842, + "grad_norm": 0.97265625, + "learning_rate": 0.00011814624882998975, + "loss": 1.8944, + "step": 1089 + }, + { + "epoch": 0.1381670680694638, + "grad_norm": 0.96484375, + "learning_rate": 0.00011814060501756216, + "loss": 1.4727, + "step": 1090 + }, + { + "epoch": 0.13829382684750918, + "grad_norm": 0.8515625, + "learning_rate": 0.0001181349527620076, + "loss": 1.9502, + "step": 1091 + }, + { + "epoch": 0.13842058562555457, + "grad_norm": 0.9765625, + "learning_rate": 0.00011812929206414688, + "loss": 1.8005, + "step": 1092 + }, + { + "epoch": 0.13854734440359995, + "grad_norm": 1.1015625, + "learning_rate": 0.00011812362292480204, + "loss": 1.675, + "step": 1093 + }, + { + "epoch": 0.13867410318164533, + "grad_norm": 1.3515625, + "learning_rate": 0.00011811794534479633, + "loss": 2.5253, + "step": 1094 + }, + { + "epoch": 0.1388008619596907, + "grad_norm": 0.96875, + "learning_rate": 0.00011811225932495428, + "loss": 1.7797, + "step": 1095 + }, + { + "epoch": 0.1389276207377361, + "grad_norm": 0.98828125, + "learning_rate": 0.0001181065648661016, + "loss": 1.9547, + "step": 1096 + }, + { + "epoch": 0.13905437951578148, + "grad_norm": 0.96484375, + "learning_rate": 0.0001181008619690652, + "loss": 2.0714, + "step": 1097 + }, + { + "epoch": 0.13918113829382683, + "grad_norm": 1.0078125, + "learning_rate": 0.00011809515063467329, + "loss": 1.884, + "step": 1098 + }, + { + "epoch": 0.13930789707187222, + "grad_norm": 1.03125, + "learning_rate": 0.00011808943086375524, + "loss": 1.9434, + "step": 1099 + }, + { + "epoch": 0.1394346558499176, + "grad_norm": 0.83203125, + "learning_rate": 0.00011808370265714169, + "loss": 1.551, + "step": 1100 + }, + { + "epoch": 0.13956141462796298, + "grad_norm": 1.0546875, + "learning_rate": 0.00011807796601566446, + "loss": 1.7796, + "step": 1101 + }, + { + "epoch": 0.13968817340600836, + "grad_norm": 0.9296875, + "learning_rate": 0.00011807222094015664, + "loss": 1.8309, + "step": 1102 + }, + { + "epoch": 0.13981493218405375, + "grad_norm": 0.92578125, + "learning_rate": 0.00011806646743145252, + "loss": 1.9375, + "step": 1103 + }, + { + "epoch": 0.13994169096209913, + "grad_norm": 0.98828125, + "learning_rate": 0.0001180607054903876, + "loss": 2.7575, + "step": 1104 + }, + { + "epoch": 0.1400684497401445, + "grad_norm": 0.8828125, + "learning_rate": 0.00011805493511779867, + "loss": 2.1476, + "step": 1105 + }, + { + "epoch": 0.1401952085181899, + "grad_norm": 0.98046875, + "learning_rate": 0.00011804915631452366, + "loss": 2.0541, + "step": 1106 + }, + { + "epoch": 0.14032196729623525, + "grad_norm": 0.9375, + "learning_rate": 0.00011804336908140176, + "loss": 2.0684, + "step": 1107 + }, + { + "epoch": 0.14044872607428063, + "grad_norm": 0.921875, + "learning_rate": 0.0001180375734192734, + "loss": 1.6989, + "step": 1108 + }, + { + "epoch": 0.14057548485232602, + "grad_norm": 0.84765625, + "learning_rate": 0.00011803176932898024, + "loss": 2.156, + "step": 1109 + }, + { + "epoch": 0.1407022436303714, + "grad_norm": 0.875, + "learning_rate": 0.0001180259568113651, + "loss": 1.7985, + "step": 1110 + }, + { + "epoch": 0.14082900240841678, + "grad_norm": 0.94921875, + "learning_rate": 0.00011802013586727213, + "loss": 1.9365, + "step": 1111 + }, + { + "epoch": 0.14095576118646216, + "grad_norm": 1.28125, + "learning_rate": 0.0001180143064975466, + "loss": 1.6625, + "step": 1112 + }, + { + "epoch": 0.14108251996450755, + "grad_norm": 1.0, + "learning_rate": 0.00011800846870303501, + "loss": 2.0967, + "step": 1113 + }, + { + "epoch": 0.14120927874255293, + "grad_norm": 0.828125, + "learning_rate": 0.00011800262248458521, + "loss": 2.0942, + "step": 1114 + }, + { + "epoch": 0.1413360375205983, + "grad_norm": 1.0, + "learning_rate": 0.00011799676784304612, + "loss": 2.1922, + "step": 1115 + }, + { + "epoch": 0.1414627962986437, + "grad_norm": 0.984375, + "learning_rate": 0.00011799090477926795, + "loss": 1.8842, + "step": 1116 + }, + { + "epoch": 0.14158955507668905, + "grad_norm": 1.0703125, + "learning_rate": 0.00011798503329410214, + "loss": 2.5449, + "step": 1117 + }, + { + "epoch": 0.14171631385473443, + "grad_norm": 1.078125, + "learning_rate": 0.00011797915338840135, + "loss": 2.113, + "step": 1118 + }, + { + "epoch": 0.14184307263277982, + "grad_norm": 0.890625, + "learning_rate": 0.00011797326506301943, + "loss": 1.9245, + "step": 1119 + }, + { + "epoch": 0.1419698314108252, + "grad_norm": 1.125, + "learning_rate": 0.00011796736831881152, + "loss": 1.8401, + "step": 1120 + }, + { + "epoch": 0.14209659018887058, + "grad_norm": 0.85546875, + "learning_rate": 0.00011796146315663389, + "loss": 1.7462, + "step": 1121 + }, + { + "epoch": 0.14222334896691596, + "grad_norm": 0.93359375, + "learning_rate": 0.0001179555495773441, + "loss": 1.9618, + "step": 1122 + }, + { + "epoch": 0.14235010774496135, + "grad_norm": 0.91015625, + "learning_rate": 0.00011794962758180092, + "loss": 1.9509, + "step": 1123 + }, + { + "epoch": 0.14247686652300673, + "grad_norm": 0.9921875, + "learning_rate": 0.00011794369717086435, + "loss": 2.2101, + "step": 1124 + }, + { + "epoch": 0.1426036253010521, + "grad_norm": 0.8515625, + "learning_rate": 0.0001179377583453956, + "loss": 2.23, + "step": 1125 + }, + { + "epoch": 0.14273038407909747, + "grad_norm": 0.890625, + "learning_rate": 0.00011793181110625706, + "loss": 2.0891, + "step": 1126 + }, + { + "epoch": 0.14285714285714285, + "grad_norm": 0.94140625, + "learning_rate": 0.00011792585545431243, + "loss": 2.3124, + "step": 1127 + }, + { + "epoch": 0.14298390163518823, + "grad_norm": 0.90234375, + "learning_rate": 0.00011791989139042655, + "loss": 2.303, + "step": 1128 + }, + { + "epoch": 0.14311066041323361, + "grad_norm": 1.203125, + "learning_rate": 0.00011791391891546554, + "loss": 1.9508, + "step": 1129 + }, + { + "epoch": 0.143237419191279, + "grad_norm": 0.90234375, + "learning_rate": 0.0001179079380302967, + "loss": 1.7895, + "step": 1130 + }, + { + "epoch": 0.14336417796932438, + "grad_norm": 1.453125, + "learning_rate": 0.00011790194873578857, + "loss": 1.5249, + "step": 1131 + }, + { + "epoch": 0.14349093674736976, + "grad_norm": 0.859375, + "learning_rate": 0.00011789595103281093, + "loss": 1.668, + "step": 1132 + }, + { + "epoch": 0.14361769552541515, + "grad_norm": 0.91796875, + "learning_rate": 0.00011788994492223476, + "loss": 2.4371, + "step": 1133 + }, + { + "epoch": 0.14374445430346053, + "grad_norm": 0.89453125, + "learning_rate": 0.00011788393040493223, + "loss": 1.8612, + "step": 1134 + }, + { + "epoch": 0.14387121308150588, + "grad_norm": 0.9609375, + "learning_rate": 0.00011787790748177679, + "loss": 1.727, + "step": 1135 + }, + { + "epoch": 0.14399797185955127, + "grad_norm": 0.95703125, + "learning_rate": 0.00011787187615364307, + "loss": 2.2304, + "step": 1136 + }, + { + "epoch": 0.14412473063759665, + "grad_norm": 0.95703125, + "learning_rate": 0.00011786583642140695, + "loss": 2.2087, + "step": 1137 + }, + { + "epoch": 0.14425148941564203, + "grad_norm": 0.92578125, + "learning_rate": 0.00011785978828594547, + "loss": 2.4493, + "step": 1138 + }, + { + "epoch": 0.14437824819368741, + "grad_norm": 0.87890625, + "learning_rate": 0.00011785373174813702, + "loss": 2.0195, + "step": 1139 + }, + { + "epoch": 0.1445050069717328, + "grad_norm": 0.87109375, + "learning_rate": 0.00011784766680886104, + "loss": 1.9201, + "step": 1140 + }, + { + "epoch": 0.14463176574977818, + "grad_norm": 0.8828125, + "learning_rate": 0.00011784159346899832, + "loss": 2.634, + "step": 1141 + }, + { + "epoch": 0.14475852452782356, + "grad_norm": 1.0546875, + "learning_rate": 0.00011783551172943083, + "loss": 1.7474, + "step": 1142 + }, + { + "epoch": 0.14488528330586894, + "grad_norm": 0.92578125, + "learning_rate": 0.00011782942159104171, + "loss": 2.0539, + "step": 1143 + }, + { + "epoch": 0.1450120420839143, + "grad_norm": 0.890625, + "learning_rate": 0.00011782332305471541, + "loss": 1.8865, + "step": 1144 + }, + { + "epoch": 0.14513880086195968, + "grad_norm": 0.94140625, + "learning_rate": 0.00011781721612133753, + "loss": 1.7505, + "step": 1145 + }, + { + "epoch": 0.14526555964000507, + "grad_norm": 0.91796875, + "learning_rate": 0.00011781110079179493, + "loss": 1.9142, + "step": 1146 + }, + { + "epoch": 0.14539231841805045, + "grad_norm": 1.0078125, + "learning_rate": 0.00011780497706697568, + "loss": 1.8523, + "step": 1147 + }, + { + "epoch": 0.14551907719609583, + "grad_norm": 0.91796875, + "learning_rate": 0.00011779884494776902, + "loss": 2.0329, + "step": 1148 + }, + { + "epoch": 0.1456458359741412, + "grad_norm": 0.9296875, + "learning_rate": 0.00011779270443506548, + "loss": 1.6421, + "step": 1149 + }, + { + "epoch": 0.1457725947521866, + "grad_norm": 0.859375, + "learning_rate": 0.0001177865555297568, + "loss": 2.0415, + "step": 1150 + }, + { + "epoch": 0.14589935353023198, + "grad_norm": 0.84765625, + "learning_rate": 0.00011778039823273588, + "loss": 1.8415, + "step": 1151 + }, + { + "epoch": 0.14602611230827736, + "grad_norm": 0.8125, + "learning_rate": 0.00011777423254489689, + "loss": 2.1517, + "step": 1152 + }, + { + "epoch": 0.14615287108632272, + "grad_norm": 1.0078125, + "learning_rate": 0.00011776805846713521, + "loss": 1.7325, + "step": 1153 + }, + { + "epoch": 0.1462796298643681, + "grad_norm": 1.0078125, + "learning_rate": 0.00011776187600034743, + "loss": 2.2662, + "step": 1154 + }, + { + "epoch": 0.14640638864241348, + "grad_norm": 0.91796875, + "learning_rate": 0.00011775568514543137, + "loss": 2.2717, + "step": 1155 + }, + { + "epoch": 0.14653314742045886, + "grad_norm": 0.921875, + "learning_rate": 0.00011774948590328605, + "loss": 1.9863, + "step": 1156 + }, + { + "epoch": 0.14665990619850425, + "grad_norm": 0.9765625, + "learning_rate": 0.00011774327827481174, + "loss": 1.915, + "step": 1157 + }, + { + "epoch": 0.14678666497654963, + "grad_norm": 1.0546875, + "learning_rate": 0.00011773706226090986, + "loss": 2.0438, + "step": 1158 + }, + { + "epoch": 0.146913423754595, + "grad_norm": 0.90625, + "learning_rate": 0.00011773083786248314, + "loss": 1.9095, + "step": 1159 + }, + { + "epoch": 0.1470401825326404, + "grad_norm": 1.0703125, + "learning_rate": 0.00011772460508043546, + "loss": 2.2996, + "step": 1160 + }, + { + "epoch": 0.14716694131068578, + "grad_norm": 1.0234375, + "learning_rate": 0.00011771836391567194, + "loss": 2.0049, + "step": 1161 + }, + { + "epoch": 0.14729370008873113, + "grad_norm": 0.984375, + "learning_rate": 0.00011771211436909895, + "loss": 1.7218, + "step": 1162 + }, + { + "epoch": 0.14742045886677652, + "grad_norm": 0.91015625, + "learning_rate": 0.00011770585644162399, + "loss": 1.4467, + "step": 1163 + }, + { + "epoch": 0.1475472176448219, + "grad_norm": 0.95703125, + "learning_rate": 0.00011769959013415586, + "loss": 1.9326, + "step": 1164 + }, + { + "epoch": 0.14767397642286728, + "grad_norm": 1.3359375, + "learning_rate": 0.00011769331544760455, + "loss": 2.2604, + "step": 1165 + }, + { + "epoch": 0.14780073520091266, + "grad_norm": 0.90625, + "learning_rate": 0.00011768703238288125, + "loss": 2.1964, + "step": 1166 + }, + { + "epoch": 0.14792749397895805, + "grad_norm": 1.0078125, + "learning_rate": 0.00011768074094089838, + "loss": 1.799, + "step": 1167 + }, + { + "epoch": 0.14805425275700343, + "grad_norm": 0.91796875, + "learning_rate": 0.0001176744411225696, + "loss": 1.8015, + "step": 1168 + }, + { + "epoch": 0.1481810115350488, + "grad_norm": 1.0625, + "learning_rate": 0.00011766813292880974, + "loss": 2.1253, + "step": 1169 + }, + { + "epoch": 0.1483077703130942, + "grad_norm": 1.0078125, + "learning_rate": 0.00011766181636053489, + "loss": 2.204, + "step": 1170 + }, + { + "epoch": 0.14843452909113955, + "grad_norm": 0.9453125, + "learning_rate": 0.00011765549141866233, + "loss": 1.8661, + "step": 1171 + }, + { + "epoch": 0.14856128786918493, + "grad_norm": 0.80859375, + "learning_rate": 0.00011764915810411054, + "loss": 2.2793, + "step": 1172 + }, + { + "epoch": 0.14868804664723032, + "grad_norm": 1.046875, + "learning_rate": 0.00011764281641779927, + "loss": 3.2794, + "step": 1173 + }, + { + "epoch": 0.1488148054252757, + "grad_norm": 0.8984375, + "learning_rate": 0.00011763646636064944, + "loss": 1.6779, + "step": 1174 + }, + { + "epoch": 0.14894156420332108, + "grad_norm": 0.953125, + "learning_rate": 0.0001176301079335832, + "loss": 2.0053, + "step": 1175 + }, + { + "epoch": 0.14906832298136646, + "grad_norm": 0.9921875, + "learning_rate": 0.00011762374113752392, + "loss": 2.0889, + "step": 1176 + }, + { + "epoch": 0.14919508175941185, + "grad_norm": 1.015625, + "learning_rate": 0.00011761736597339618, + "loss": 2.628, + "step": 1177 + }, + { + "epoch": 0.14932184053745723, + "grad_norm": 0.88671875, + "learning_rate": 0.00011761098244212576, + "loss": 1.6363, + "step": 1178 + }, + { + "epoch": 0.1494485993155026, + "grad_norm": 0.84375, + "learning_rate": 0.0001176045905446397, + "loss": 1.8584, + "step": 1179 + }, + { + "epoch": 0.14957535809354797, + "grad_norm": 1.1484375, + "learning_rate": 0.00011759819028186619, + "loss": 2.1749, + "step": 1180 + }, + { + "epoch": 0.14970211687159335, + "grad_norm": 0.9140625, + "learning_rate": 0.00011759178165473469, + "loss": 1.6844, + "step": 1181 + }, + { + "epoch": 0.14982887564963873, + "grad_norm": 0.94921875, + "learning_rate": 0.00011758536466417587, + "loss": 2.3646, + "step": 1182 + }, + { + "epoch": 0.14995563442768411, + "grad_norm": 0.86328125, + "learning_rate": 0.00011757893931112156, + "loss": 2.1912, + "step": 1183 + }, + { + "epoch": 0.1500823932057295, + "grad_norm": 1.046875, + "learning_rate": 0.0001175725055965049, + "loss": 2.5788, + "step": 1184 + }, + { + "epoch": 0.15020915198377488, + "grad_norm": 0.984375, + "learning_rate": 0.00011756606352126013, + "loss": 2.6087, + "step": 1185 + }, + { + "epoch": 0.15033591076182026, + "grad_norm": 0.8515625, + "learning_rate": 0.0001175596130863228, + "loss": 1.5276, + "step": 1186 + }, + { + "epoch": 0.15046266953986565, + "grad_norm": 1.0234375, + "learning_rate": 0.00011755315429262962, + "loss": 1.7347, + "step": 1187 + }, + { + "epoch": 0.15058942831791103, + "grad_norm": 0.8515625, + "learning_rate": 0.00011754668714111856, + "loss": 1.7463, + "step": 1188 + }, + { + "epoch": 0.15071618709595638, + "grad_norm": 0.94921875, + "learning_rate": 0.0001175402116327287, + "loss": 2.1952, + "step": 1189 + }, + { + "epoch": 0.15084294587400177, + "grad_norm": 0.8984375, + "learning_rate": 0.0001175337277684005, + "loss": 2.3937, + "step": 1190 + }, + { + "epoch": 0.15096970465204715, + "grad_norm": 0.890625, + "learning_rate": 0.00011752723554907549, + "loss": 2.3407, + "step": 1191 + }, + { + "epoch": 0.15109646343009253, + "grad_norm": 0.98046875, + "learning_rate": 0.00011752073497569647, + "loss": 2.4135, + "step": 1192 + }, + { + "epoch": 0.15122322220813791, + "grad_norm": 1.078125, + "learning_rate": 0.00011751422604920744, + "loss": 2.067, + "step": 1193 + }, + { + "epoch": 0.1513499809861833, + "grad_norm": 0.98046875, + "learning_rate": 0.00011750770877055364, + "loss": 1.8038, + "step": 1194 + }, + { + "epoch": 0.15147673976422868, + "grad_norm": 0.92578125, + "learning_rate": 0.00011750118314068149, + "loss": 2.1027, + "step": 1195 + }, + { + "epoch": 0.15160349854227406, + "grad_norm": 1.015625, + "learning_rate": 0.00011749464916053865, + "loss": 2.0465, + "step": 1196 + }, + { + "epoch": 0.15173025732031944, + "grad_norm": 0.91015625, + "learning_rate": 0.00011748810683107395, + "loss": 2.0565, + "step": 1197 + }, + { + "epoch": 0.1518570160983648, + "grad_norm": 0.91796875, + "learning_rate": 0.0001174815561532375, + "loss": 1.9503, + "step": 1198 + }, + { + "epoch": 0.15198377487641018, + "grad_norm": 0.9140625, + "learning_rate": 0.00011747499712798056, + "loss": 2.2056, + "step": 1199 + }, + { + "epoch": 0.15211053365445557, + "grad_norm": 0.953125, + "learning_rate": 0.00011746842975625561, + "loss": 1.8435, + "step": 1200 + }, + { + "epoch": 0.15223729243250095, + "grad_norm": 0.9375, + "learning_rate": 0.00011746185403901639, + "loss": 2.2389, + "step": 1201 + }, + { + "epoch": 0.15236405121054633, + "grad_norm": 0.93359375, + "learning_rate": 0.00011745526997721782, + "loss": 2.3989, + "step": 1202 + }, + { + "epoch": 0.1524908099885917, + "grad_norm": 1.0703125, + "learning_rate": 0.000117448677571816, + "loss": 2.0868, + "step": 1203 + }, + { + "epoch": 0.1526175687666371, + "grad_norm": 0.99609375, + "learning_rate": 0.00011744207682376831, + "loss": 1.8994, + "step": 1204 + }, + { + "epoch": 0.15274432754468248, + "grad_norm": 0.9140625, + "learning_rate": 0.00011743546773403327, + "loss": 2.3933, + "step": 1205 + }, + { + "epoch": 0.15287108632272786, + "grad_norm": 0.91015625, + "learning_rate": 0.00011742885030357067, + "loss": 2.0144, + "step": 1206 + }, + { + "epoch": 0.15299784510077322, + "grad_norm": 1.1015625, + "learning_rate": 0.00011742222453334148, + "loss": 1.8523, + "step": 1207 + }, + { + "epoch": 0.1531246038788186, + "grad_norm": 0.890625, + "learning_rate": 0.00011741559042430789, + "loss": 1.6526, + "step": 1208 + }, + { + "epoch": 0.15325136265686398, + "grad_norm": 0.9375, + "learning_rate": 0.0001174089479774333, + "loss": 2.4686, + "step": 1209 + }, + { + "epoch": 0.15337812143490936, + "grad_norm": 1.125, + "learning_rate": 0.00011740229719368231, + "loss": 2.2871, + "step": 1210 + }, + { + "epoch": 0.15350488021295475, + "grad_norm": 0.98046875, + "learning_rate": 0.00011739563807402075, + "loss": 2.117, + "step": 1211 + }, + { + "epoch": 0.15363163899100013, + "grad_norm": 1.03125, + "learning_rate": 0.00011738897061941566, + "loss": 1.9351, + "step": 1212 + }, + { + "epoch": 0.1537583977690455, + "grad_norm": 0.91015625, + "learning_rate": 0.00011738229483083528, + "loss": 2.3357, + "step": 1213 + }, + { + "epoch": 0.1538851565470909, + "grad_norm": 1.0, + "learning_rate": 0.00011737561070924903, + "loss": 2.2089, + "step": 1214 + }, + { + "epoch": 0.15401191532513628, + "grad_norm": 0.90234375, + "learning_rate": 0.00011736891825562761, + "loss": 1.9727, + "step": 1215 + }, + { + "epoch": 0.15413867410318163, + "grad_norm": 0.9609375, + "learning_rate": 0.00011736221747094288, + "loss": 2.0784, + "step": 1216 + }, + { + "epoch": 0.15426543288122702, + "grad_norm": 1.1640625, + "learning_rate": 0.00011735550835616793, + "loss": 2.2285, + "step": 1217 + }, + { + "epoch": 0.1543921916592724, + "grad_norm": 1.0546875, + "learning_rate": 0.00011734879091227703, + "loss": 2.1003, + "step": 1218 + }, + { + "epoch": 0.15451895043731778, + "grad_norm": 0.89453125, + "learning_rate": 0.0001173420651402457, + "loss": 2.2601, + "step": 1219 + }, + { + "epoch": 0.15464570921536316, + "grad_norm": 1.03125, + "learning_rate": 0.00011733533104105062, + "loss": 1.6037, + "step": 1220 + }, + { + "epoch": 0.15477246799340855, + "grad_norm": 0.9296875, + "learning_rate": 0.00011732858861566977, + "loss": 1.8671, + "step": 1221 + }, + { + "epoch": 0.15489922677145393, + "grad_norm": 1.0234375, + "learning_rate": 0.00011732183786508223, + "loss": 1.929, + "step": 1222 + }, + { + "epoch": 0.1550259855494993, + "grad_norm": 0.9765625, + "learning_rate": 0.00011731507879026835, + "loss": 2.0703, + "step": 1223 + }, + { + "epoch": 0.1551527443275447, + "grad_norm": 0.93359375, + "learning_rate": 0.00011730831139220967, + "loss": 2.0561, + "step": 1224 + }, + { + "epoch": 0.15527950310559005, + "grad_norm": 0.859375, + "learning_rate": 0.00011730153567188896, + "loss": 1.7941, + "step": 1225 + }, + { + "epoch": 0.15540626188363543, + "grad_norm": 0.92578125, + "learning_rate": 0.00011729475163029019, + "loss": 1.7607, + "step": 1226 + }, + { + "epoch": 0.15553302066168082, + "grad_norm": 0.96875, + "learning_rate": 0.0001172879592683985, + "loss": 1.5151, + "step": 1227 + }, + { + "epoch": 0.1556597794397262, + "grad_norm": 0.87109375, + "learning_rate": 0.00011728115858720032, + "loss": 2.0432, + "step": 1228 + }, + { + "epoch": 0.15578653821777158, + "grad_norm": 0.9609375, + "learning_rate": 0.0001172743495876832, + "loss": 1.8862, + "step": 1229 + }, + { + "epoch": 0.15591329699581696, + "grad_norm": 0.94140625, + "learning_rate": 0.00011726753227083593, + "loss": 2.0715, + "step": 1230 + }, + { + "epoch": 0.15604005577386235, + "grad_norm": 0.90625, + "learning_rate": 0.00011726070663764854, + "loss": 1.6279, + "step": 1231 + }, + { + "epoch": 0.15616681455190773, + "grad_norm": 0.87109375, + "learning_rate": 0.00011725387268911224, + "loss": 2.1064, + "step": 1232 + }, + { + "epoch": 0.1562935733299531, + "grad_norm": 1.0390625, + "learning_rate": 0.00011724703042621944, + "loss": 2.0368, + "step": 1233 + }, + { + "epoch": 0.15642033210799847, + "grad_norm": 0.90234375, + "learning_rate": 0.00011724017984996376, + "loss": 2.0223, + "step": 1234 + }, + { + "epoch": 0.15654709088604385, + "grad_norm": 0.9375, + "learning_rate": 0.00011723332096134006, + "loss": 1.7416, + "step": 1235 + }, + { + "epoch": 0.15667384966408923, + "grad_norm": 0.96484375, + "learning_rate": 0.00011722645376134437, + "loss": 2.2762, + "step": 1236 + }, + { + "epoch": 0.15680060844213461, + "grad_norm": 0.875, + "learning_rate": 0.00011721957825097394, + "loss": 1.9597, + "step": 1237 + }, + { + "epoch": 0.15692736722018, + "grad_norm": 1.1640625, + "learning_rate": 0.0001172126944312272, + "loss": 2.9127, + "step": 1238 + }, + { + "epoch": 0.15705412599822538, + "grad_norm": 0.94921875, + "learning_rate": 0.00011720580230310385, + "loss": 2.1764, + "step": 1239 + }, + { + "epoch": 0.15718088477627076, + "grad_norm": 1.015625, + "learning_rate": 0.00011719890186760474, + "loss": 1.9176, + "step": 1240 + }, + { + "epoch": 0.15730764355431615, + "grad_norm": 0.921875, + "learning_rate": 0.00011719199312573196, + "loss": 1.9535, + "step": 1241 + }, + { + "epoch": 0.15743440233236153, + "grad_norm": 0.9609375, + "learning_rate": 0.00011718507607848876, + "loss": 2.4568, + "step": 1242 + }, + { + "epoch": 0.15756116111040688, + "grad_norm": 3.859375, + "learning_rate": 0.00011717815072687965, + "loss": 1.9271, + "step": 1243 + }, + { + "epoch": 0.15768791988845227, + "grad_norm": 1.015625, + "learning_rate": 0.00011717121707191033, + "loss": 2.1812, + "step": 1244 + }, + { + "epoch": 0.15781467866649765, + "grad_norm": 0.875, + "learning_rate": 0.00011716427511458766, + "loss": 1.8522, + "step": 1245 + }, + { + "epoch": 0.15794143744454303, + "grad_norm": 0.84765625, + "learning_rate": 0.00011715732485591981, + "loss": 1.8697, + "step": 1246 + }, + { + "epoch": 0.15806819622258841, + "grad_norm": 0.9375, + "learning_rate": 0.00011715036629691603, + "loss": 1.8822, + "step": 1247 + }, + { + "epoch": 0.1581949550006338, + "grad_norm": 0.94921875, + "learning_rate": 0.00011714339943858689, + "loss": 2.06, + "step": 1248 + }, + { + "epoch": 0.15832171377867918, + "grad_norm": 0.9375, + "learning_rate": 0.00011713642428194406, + "loss": 1.8493, + "step": 1249 + }, + { + "epoch": 0.15844847255672456, + "grad_norm": 0.875, + "learning_rate": 0.00011712944082800048, + "loss": 2.1765, + "step": 1250 + }, + { + "epoch": 0.15857523133476994, + "grad_norm": 0.953125, + "learning_rate": 0.0001171224490777703, + "loss": 2.0089, + "step": 1251 + }, + { + "epoch": 0.1587019901128153, + "grad_norm": 0.87109375, + "learning_rate": 0.00011711544903226884, + "loss": 1.5771, + "step": 1252 + }, + { + "epoch": 0.15882874889086068, + "grad_norm": 1.078125, + "learning_rate": 0.00011710844069251264, + "loss": 1.9593, + "step": 1253 + }, + { + "epoch": 0.15895550766890607, + "grad_norm": 0.9453125, + "learning_rate": 0.00011710142405951946, + "loss": 1.9038, + "step": 1254 + }, + { + "epoch": 0.15908226644695145, + "grad_norm": 1.0390625, + "learning_rate": 0.00011709439913430823, + "loss": 2.2597, + "step": 1255 + }, + { + "epoch": 0.15920902522499683, + "grad_norm": 0.921875, + "learning_rate": 0.00011708736591789913, + "loss": 1.9644, + "step": 1256 + }, + { + "epoch": 0.1593357840030422, + "grad_norm": 0.91015625, + "learning_rate": 0.00011708032441131348, + "loss": 1.6797, + "step": 1257 + }, + { + "epoch": 0.1594625427810876, + "grad_norm": 0.92578125, + "learning_rate": 0.00011707327461557386, + "loss": 2.3476, + "step": 1258 + }, + { + "epoch": 0.15958930155913298, + "grad_norm": 1.015625, + "learning_rate": 0.00011706621653170406, + "loss": 2.1398, + "step": 1259 + }, + { + "epoch": 0.15971606033717836, + "grad_norm": 0.984375, + "learning_rate": 0.000117059150160729, + "loss": 1.8348, + "step": 1260 + }, + { + "epoch": 0.15984281911522372, + "grad_norm": 0.94921875, + "learning_rate": 0.00011705207550367488, + "loss": 2.1193, + "step": 1261 + }, + { + "epoch": 0.1599695778932691, + "grad_norm": 0.98046875, + "learning_rate": 0.00011704499256156908, + "loss": 1.8472, + "step": 1262 + }, + { + "epoch": 0.16009633667131448, + "grad_norm": 0.90625, + "learning_rate": 0.00011703790133544016, + "loss": 1.8676, + "step": 1263 + }, + { + "epoch": 0.16022309544935986, + "grad_norm": 0.796875, + "learning_rate": 0.00011703080182631792, + "loss": 1.7294, + "step": 1264 + }, + { + "epoch": 0.16034985422740525, + "grad_norm": 0.8515625, + "learning_rate": 0.00011702369403523333, + "loss": 1.8304, + "step": 1265 + }, + { + "epoch": 0.16047661300545063, + "grad_norm": 1.0, + "learning_rate": 0.00011701657796321858, + "loss": 1.5325, + "step": 1266 + }, + { + "epoch": 0.160603371783496, + "grad_norm": 0.87890625, + "learning_rate": 0.00011700945361130706, + "loss": 1.9624, + "step": 1267 + }, + { + "epoch": 0.1607301305615414, + "grad_norm": 0.98046875, + "learning_rate": 0.00011700232098053337, + "loss": 1.5178, + "step": 1268 + }, + { + "epoch": 0.16085688933958678, + "grad_norm": 0.87890625, + "learning_rate": 0.00011699518007193332, + "loss": 2.1261, + "step": 1269 + }, + { + "epoch": 0.16098364811763213, + "grad_norm": 0.86328125, + "learning_rate": 0.00011698803088654388, + "loss": 2.026, + "step": 1270 + }, + { + "epoch": 0.16111040689567752, + "grad_norm": 1.265625, + "learning_rate": 0.00011698087342540325, + "loss": 2.0495, + "step": 1271 + }, + { + "epoch": 0.1612371656737229, + "grad_norm": 0.90625, + "learning_rate": 0.00011697370768955082, + "loss": 1.6341, + "step": 1272 + }, + { + "epoch": 0.16136392445176828, + "grad_norm": 0.94921875, + "learning_rate": 0.00011696653368002722, + "loss": 1.685, + "step": 1273 + }, + { + "epoch": 0.16149068322981366, + "grad_norm": 0.92578125, + "learning_rate": 0.00011695935139787422, + "loss": 2.2273, + "step": 1274 + }, + { + "epoch": 0.16161744200785905, + "grad_norm": 0.94921875, + "learning_rate": 0.00011695216084413486, + "loss": 2.1221, + "step": 1275 + }, + { + "epoch": 0.16174420078590443, + "grad_norm": 1.03125, + "learning_rate": 0.00011694496201985333, + "loss": 1.8019, + "step": 1276 + }, + { + "epoch": 0.1618709595639498, + "grad_norm": 0.87890625, + "learning_rate": 0.00011693775492607504, + "loss": 1.783, + "step": 1277 + }, + { + "epoch": 0.1619977183419952, + "grad_norm": 0.9453125, + "learning_rate": 0.0001169305395638466, + "loss": 2.3844, + "step": 1278 + }, + { + "epoch": 0.16212447712004055, + "grad_norm": 1.109375, + "learning_rate": 0.00011692331593421579, + "loss": 1.5885, + "step": 1279 + }, + { + "epoch": 0.16225123589808593, + "grad_norm": 1.0078125, + "learning_rate": 0.00011691608403823165, + "loss": 2.1312, + "step": 1280 + }, + { + "epoch": 0.16237799467613132, + "grad_norm": 0.75390625, + "learning_rate": 0.0001169088438769444, + "loss": 2.1352, + "step": 1281 + }, + { + "epoch": 0.1625047534541767, + "grad_norm": 0.9921875, + "learning_rate": 0.00011690159545140541, + "loss": 2.2286, + "step": 1282 + }, + { + "epoch": 0.16263151223222208, + "grad_norm": 1.3125, + "learning_rate": 0.00011689433876266731, + "loss": 2.3093, + "step": 1283 + }, + { + "epoch": 0.16275827101026746, + "grad_norm": 0.9453125, + "learning_rate": 0.00011688707381178394, + "loss": 1.849, + "step": 1284 + }, + { + "epoch": 0.16288502978831285, + "grad_norm": 0.8671875, + "learning_rate": 0.00011687980059981026, + "loss": 1.8192, + "step": 1285 + }, + { + "epoch": 0.16301178856635823, + "grad_norm": 0.87890625, + "learning_rate": 0.00011687251912780249, + "loss": 2.2033, + "step": 1286 + }, + { + "epoch": 0.1631385473444036, + "grad_norm": 0.92578125, + "learning_rate": 0.00011686522939681806, + "loss": 2.23, + "step": 1287 + }, + { + "epoch": 0.16326530612244897, + "grad_norm": 0.90234375, + "learning_rate": 0.00011685793140791558, + "loss": 2.2655, + "step": 1288 + }, + { + "epoch": 0.16339206490049435, + "grad_norm": 0.96875, + "learning_rate": 0.00011685062516215483, + "loss": 2.3736, + "step": 1289 + }, + { + "epoch": 0.16351882367853973, + "grad_norm": 0.95703125, + "learning_rate": 0.00011684331066059682, + "loss": 1.6952, + "step": 1290 + }, + { + "epoch": 0.16364558245658511, + "grad_norm": 1.015625, + "learning_rate": 0.00011683598790430379, + "loss": 2.6982, + "step": 1291 + }, + { + "epoch": 0.1637723412346305, + "grad_norm": 1.0078125, + "learning_rate": 0.00011682865689433912, + "loss": 1.724, + "step": 1292 + }, + { + "epoch": 0.16389910001267588, + "grad_norm": 0.8828125, + "learning_rate": 0.00011682131763176739, + "loss": 1.9218, + "step": 1293 + }, + { + "epoch": 0.16402585879072126, + "grad_norm": 0.96484375, + "learning_rate": 0.00011681397011765444, + "loss": 1.9737, + "step": 1294 + }, + { + "epoch": 0.16415261756876665, + "grad_norm": 0.90625, + "learning_rate": 0.00011680661435306727, + "loss": 1.5576, + "step": 1295 + }, + { + "epoch": 0.16427937634681203, + "grad_norm": 0.79296875, + "learning_rate": 0.00011679925033907403, + "loss": 1.9893, + "step": 1296 + }, + { + "epoch": 0.16440613512485738, + "grad_norm": 1.765625, + "learning_rate": 0.00011679187807674417, + "loss": 2.1787, + "step": 1297 + }, + { + "epoch": 0.16453289390290277, + "grad_norm": 1.0, + "learning_rate": 0.00011678449756714826, + "loss": 2.3746, + "step": 1298 + }, + { + "epoch": 0.16465965268094815, + "grad_norm": 1.078125, + "learning_rate": 0.00011677710881135807, + "loss": 1.5504, + "step": 1299 + }, + { + "epoch": 0.16478641145899353, + "grad_norm": 0.953125, + "learning_rate": 0.00011676971181044664, + "loss": 2.0711, + "step": 1300 + }, + { + "epoch": 0.16491317023703891, + "grad_norm": 0.91015625, + "learning_rate": 0.00011676230656548811, + "loss": 2.4971, + "step": 1301 + }, + { + "epoch": 0.1650399290150843, + "grad_norm": 1.0390625, + "learning_rate": 0.00011675489307755788, + "loss": 1.7373, + "step": 1302 + }, + { + "epoch": 0.16516668779312968, + "grad_norm": 0.8125, + "learning_rate": 0.00011674747134773254, + "loss": 1.9925, + "step": 1303 + }, + { + "epoch": 0.16529344657117506, + "grad_norm": 0.9453125, + "learning_rate": 0.00011674004137708985, + "loss": 2.0737, + "step": 1304 + }, + { + "epoch": 0.16542020534922044, + "grad_norm": 1.0546875, + "learning_rate": 0.00011673260316670879, + "loss": 2.0186, + "step": 1305 + }, + { + "epoch": 0.1655469641272658, + "grad_norm": 1.0859375, + "learning_rate": 0.00011672515671766953, + "loss": 2.2546, + "step": 1306 + }, + { + "epoch": 0.16567372290531118, + "grad_norm": 0.94140625, + "learning_rate": 0.00011671770203105345, + "loss": 1.7104, + "step": 1307 + }, + { + "epoch": 0.16580048168335657, + "grad_norm": 1.03125, + "learning_rate": 0.00011671023910794308, + "loss": 1.8675, + "step": 1308 + }, + { + "epoch": 0.16592724046140195, + "grad_norm": 0.91796875, + "learning_rate": 0.0001167027679494222, + "loss": 1.9288, + "step": 1309 + }, + { + "epoch": 0.16605399923944733, + "grad_norm": 1.0625, + "learning_rate": 0.00011669528855657578, + "loss": 1.9568, + "step": 1310 + }, + { + "epoch": 0.1661807580174927, + "grad_norm": 1.0546875, + "learning_rate": 0.00011668780093048994, + "loss": 2.4522, + "step": 1311 + }, + { + "epoch": 0.1663075167955381, + "grad_norm": 1.0, + "learning_rate": 0.00011668030507225206, + "loss": 2.0694, + "step": 1312 + }, + { + "epoch": 0.16643427557358348, + "grad_norm": 0.9765625, + "learning_rate": 0.00011667280098295066, + "loss": 1.6021, + "step": 1313 + }, + { + "epoch": 0.16656103435162886, + "grad_norm": 1.1015625, + "learning_rate": 0.00011666528866367548, + "loss": 2.3724, + "step": 1314 + }, + { + "epoch": 0.16668779312967422, + "grad_norm": 0.953125, + "learning_rate": 0.00011665776811551743, + "loss": 2.2543, + "step": 1315 + }, + { + "epoch": 0.1668145519077196, + "grad_norm": 0.92578125, + "learning_rate": 0.00011665023933956869, + "loss": 1.9289, + "step": 1316 + }, + { + "epoch": 0.16694131068576498, + "grad_norm": 0.921875, + "learning_rate": 0.00011664270233692253, + "loss": 1.7157, + "step": 1317 + }, + { + "epoch": 0.16706806946381036, + "grad_norm": 0.87890625, + "learning_rate": 0.0001166351571086735, + "loss": 2.54, + "step": 1318 + }, + { + "epoch": 0.16719482824185575, + "grad_norm": 0.96875, + "learning_rate": 0.0001166276036559173, + "loss": 2.19, + "step": 1319 + }, + { + "epoch": 0.16732158701990113, + "grad_norm": 1.109375, + "learning_rate": 0.00011662004197975083, + "loss": 1.7336, + "step": 1320 + }, + { + "epoch": 0.1674483457979465, + "grad_norm": 0.93359375, + "learning_rate": 0.0001166124720812722, + "loss": 2.266, + "step": 1321 + }, + { + "epoch": 0.1675751045759919, + "grad_norm": 0.90625, + "learning_rate": 0.0001166048939615807, + "loss": 2.0758, + "step": 1322 + }, + { + "epoch": 0.16770186335403728, + "grad_norm": 0.8828125, + "learning_rate": 0.00011659730762177682, + "loss": 1.9477, + "step": 1323 + }, + { + "epoch": 0.16782862213208263, + "grad_norm": 0.84375, + "learning_rate": 0.00011658971306296224, + "loss": 1.9067, + "step": 1324 + }, + { + "epoch": 0.16795538091012802, + "grad_norm": 0.9375, + "learning_rate": 0.00011658211028623983, + "loss": 2.1705, + "step": 1325 + }, + { + "epoch": 0.1680821396881734, + "grad_norm": 0.97265625, + "learning_rate": 0.00011657449929271366, + "loss": 2.155, + "step": 1326 + }, + { + "epoch": 0.16820889846621878, + "grad_norm": 0.91015625, + "learning_rate": 0.00011656688008348902, + "loss": 1.9701, + "step": 1327 + }, + { + "epoch": 0.16833565724426416, + "grad_norm": 1.078125, + "learning_rate": 0.00011655925265967231, + "loss": 1.7513, + "step": 1328 + }, + { + "epoch": 0.16846241602230955, + "grad_norm": 1.0, + "learning_rate": 0.00011655161702237121, + "loss": 1.9852, + "step": 1329 + }, + { + "epoch": 0.16858917480035493, + "grad_norm": 0.8984375, + "learning_rate": 0.00011654397317269457, + "loss": 2.1549, + "step": 1330 + }, + { + "epoch": 0.1687159335784003, + "grad_norm": 0.859375, + "learning_rate": 0.0001165363211117524, + "loss": 1.8626, + "step": 1331 + }, + { + "epoch": 0.1688426923564457, + "grad_norm": 1.046875, + "learning_rate": 0.00011652866084065594, + "loss": 2.2102, + "step": 1332 + }, + { + "epoch": 0.16896945113449105, + "grad_norm": 0.8671875, + "learning_rate": 0.00011652099236051761, + "loss": 2.1832, + "step": 1333 + }, + { + "epoch": 0.16909620991253643, + "grad_norm": 1.0234375, + "learning_rate": 0.000116513315672451, + "loss": 2.1535, + "step": 1334 + }, + { + "epoch": 0.16922296869058182, + "grad_norm": 0.91796875, + "learning_rate": 0.00011650563077757095, + "loss": 1.6856, + "step": 1335 + }, + { + "epoch": 0.1693497274686272, + "grad_norm": 0.765625, + "learning_rate": 0.0001164979376769934, + "loss": 2.3066, + "step": 1336 + }, + { + "epoch": 0.16947648624667258, + "grad_norm": 0.953125, + "learning_rate": 0.00011649023637183559, + "loss": 1.8957, + "step": 1337 + }, + { + "epoch": 0.16960324502471796, + "grad_norm": 1.0390625, + "learning_rate": 0.00011648252686321586, + "loss": 2.3303, + "step": 1338 + }, + { + "epoch": 0.16973000380276335, + "grad_norm": 0.99609375, + "learning_rate": 0.0001164748091522538, + "loss": 2.186, + "step": 1339 + }, + { + "epoch": 0.16985676258080873, + "grad_norm": 0.96484375, + "learning_rate": 0.00011646708324007013, + "loss": 1.4996, + "step": 1340 + }, + { + "epoch": 0.1699835213588541, + "grad_norm": 1.0078125, + "learning_rate": 0.00011645934912778685, + "loss": 1.6884, + "step": 1341 + }, + { + "epoch": 0.17011028013689947, + "grad_norm": 1.0234375, + "learning_rate": 0.00011645160681652707, + "loss": 2.3821, + "step": 1342 + }, + { + "epoch": 0.17023703891494485, + "grad_norm": 1.0703125, + "learning_rate": 0.00011644385630741514, + "loss": 2.4489, + "step": 1343 + }, + { + "epoch": 0.17036379769299023, + "grad_norm": 0.92578125, + "learning_rate": 0.00011643609760157658, + "loss": 2.0797, + "step": 1344 + }, + { + "epoch": 0.17049055647103561, + "grad_norm": 0.953125, + "learning_rate": 0.00011642833070013807, + "loss": 1.7555, + "step": 1345 + }, + { + "epoch": 0.170617315249081, + "grad_norm": 0.93359375, + "learning_rate": 0.00011642055560422756, + "loss": 1.7697, + "step": 1346 + }, + { + "epoch": 0.17074407402712638, + "grad_norm": 0.953125, + "learning_rate": 0.00011641277231497411, + "loss": 2.1339, + "step": 1347 + }, + { + "epoch": 0.17087083280517176, + "grad_norm": 0.859375, + "learning_rate": 0.000116404980833508, + "loss": 1.6965, + "step": 1348 + }, + { + "epoch": 0.17099759158321715, + "grad_norm": 1.34375, + "learning_rate": 0.00011639718116096075, + "loss": 1.9666, + "step": 1349 + }, + { + "epoch": 0.17112435036126253, + "grad_norm": 0.9609375, + "learning_rate": 0.00011638937329846495, + "loss": 1.7666, + "step": 1350 + }, + { + "epoch": 0.17125110913930788, + "grad_norm": 1.0234375, + "learning_rate": 0.0001163815572471545, + "loss": 2.5994, + "step": 1351 + }, + { + "epoch": 0.17137786791735327, + "grad_norm": 0.921875, + "learning_rate": 0.00011637373300816444, + "loss": 1.911, + "step": 1352 + }, + { + "epoch": 0.17150462669539865, + "grad_norm": 0.9609375, + "learning_rate": 0.00011636590058263097, + "loss": 1.7797, + "step": 1353 + }, + { + "epoch": 0.17163138547344403, + "grad_norm": 1.03125, + "learning_rate": 0.00011635805997169153, + "loss": 1.6793, + "step": 1354 + }, + { + "epoch": 0.17175814425148941, + "grad_norm": 0.9921875, + "learning_rate": 0.00011635021117648471, + "loss": 2.3738, + "step": 1355 + }, + { + "epoch": 0.1718849030295348, + "grad_norm": 0.87109375, + "learning_rate": 0.00011634235419815033, + "loss": 1.9983, + "step": 1356 + }, + { + "epoch": 0.17201166180758018, + "grad_norm": 0.8828125, + "learning_rate": 0.00011633448903782936, + "loss": 2.091, + "step": 1357 + }, + { + "epoch": 0.17213842058562556, + "grad_norm": 0.9921875, + "learning_rate": 0.00011632661569666396, + "loss": 2.1014, + "step": 1358 + }, + { + "epoch": 0.17226517936367094, + "grad_norm": 0.89453125, + "learning_rate": 0.00011631873417579752, + "loss": 2.3214, + "step": 1359 + }, + { + "epoch": 0.17239193814171633, + "grad_norm": 0.9375, + "learning_rate": 0.00011631084447637455, + "loss": 1.8834, + "step": 1360 + }, + { + "epoch": 0.17251869691976168, + "grad_norm": 0.8984375, + "learning_rate": 0.0001163029465995408, + "loss": 1.3993, + "step": 1361 + }, + { + "epoch": 0.17264545569780707, + "grad_norm": 0.8828125, + "learning_rate": 0.0001162950405464432, + "loss": 2.1255, + "step": 1362 + }, + { + "epoch": 0.17277221447585245, + "grad_norm": 0.94140625, + "learning_rate": 0.00011628712631822985, + "loss": 2.4376, + "step": 1363 + }, + { + "epoch": 0.17289897325389783, + "grad_norm": 1.1171875, + "learning_rate": 0.00011627920391605006, + "loss": 2.189, + "step": 1364 + }, + { + "epoch": 0.1730257320319432, + "grad_norm": 0.89453125, + "learning_rate": 0.00011627127334105431, + "loss": 1.7081, + "step": 1365 + }, + { + "epoch": 0.1731524908099886, + "grad_norm": 0.90234375, + "learning_rate": 0.00011626333459439426, + "loss": 1.8611, + "step": 1366 + }, + { + "epoch": 0.17327924958803398, + "grad_norm": 0.96484375, + "learning_rate": 0.00011625538767722278, + "loss": 1.7965, + "step": 1367 + }, + { + "epoch": 0.17340600836607936, + "grad_norm": 0.8671875, + "learning_rate": 0.00011624743259069391, + "loss": 2.0881, + "step": 1368 + }, + { + "epoch": 0.17353276714412474, + "grad_norm": 1.0546875, + "learning_rate": 0.00011623946933596287, + "loss": 2.0016, + "step": 1369 + }, + { + "epoch": 0.1736595259221701, + "grad_norm": 1.0078125, + "learning_rate": 0.00011623149791418611, + "loss": 2.2606, + "step": 1370 + }, + { + "epoch": 0.17378628470021548, + "grad_norm": 0.8984375, + "learning_rate": 0.00011622351832652118, + "loss": 2.2285, + "step": 1371 + }, + { + "epoch": 0.17391304347826086, + "grad_norm": 0.96875, + "learning_rate": 0.00011621553057412692, + "loss": 1.9222, + "step": 1372 + }, + { + "epoch": 0.17403980225630625, + "grad_norm": 1.0546875, + "learning_rate": 0.00011620753465816328, + "loss": 2.5336, + "step": 1373 + }, + { + "epoch": 0.17416656103435163, + "grad_norm": 0.91015625, + "learning_rate": 0.00011619953057979142, + "loss": 2.153, + "step": 1374 + }, + { + "epoch": 0.174293319812397, + "grad_norm": 0.9765625, + "learning_rate": 0.00011619151834017369, + "loss": 2.2346, + "step": 1375 + }, + { + "epoch": 0.1744200785904424, + "grad_norm": 0.9296875, + "learning_rate": 0.00011618349794047361, + "loss": 1.9778, + "step": 1376 + }, + { + "epoch": 0.17454683736848778, + "grad_norm": 0.890625, + "learning_rate": 0.00011617546938185591, + "loss": 2.1729, + "step": 1377 + }, + { + "epoch": 0.17467359614653316, + "grad_norm": 0.97265625, + "learning_rate": 0.0001161674326654865, + "loss": 2.1126, + "step": 1378 + }, + { + "epoch": 0.17480035492457852, + "grad_norm": 0.9375, + "learning_rate": 0.00011615938779253243, + "loss": 2.2715, + "step": 1379 + }, + { + "epoch": 0.1749271137026239, + "grad_norm": 1.3359375, + "learning_rate": 0.00011615133476416198, + "loss": 2.2649, + "step": 1380 + }, + { + "epoch": 0.17505387248066928, + "grad_norm": 0.890625, + "learning_rate": 0.00011614327358154461, + "loss": 1.9336, + "step": 1381 + }, + { + "epoch": 0.17518063125871466, + "grad_norm": 1.015625, + "learning_rate": 0.00011613520424585097, + "loss": 2.2498, + "step": 1382 + }, + { + "epoch": 0.17530739003676005, + "grad_norm": 0.80078125, + "learning_rate": 0.00011612712675825288, + "loss": 1.8766, + "step": 1383 + }, + { + "epoch": 0.17543414881480543, + "grad_norm": 0.9296875, + "learning_rate": 0.00011611904111992333, + "loss": 2.2918, + "step": 1384 + }, + { + "epoch": 0.1755609075928508, + "grad_norm": 0.8984375, + "learning_rate": 0.00011611094733203652, + "loss": 2.3701, + "step": 1385 + }, + { + "epoch": 0.1756876663708962, + "grad_norm": 0.94921875, + "learning_rate": 0.00011610284539576782, + "loss": 2.2539, + "step": 1386 + }, + { + "epoch": 0.17581442514894158, + "grad_norm": 1.0703125, + "learning_rate": 0.00011609473531229376, + "loss": 2.128, + "step": 1387 + }, + { + "epoch": 0.17594118392698693, + "grad_norm": 0.87890625, + "learning_rate": 0.00011608661708279212, + "loss": 1.9965, + "step": 1388 + }, + { + "epoch": 0.17606794270503232, + "grad_norm": 0.9296875, + "learning_rate": 0.0001160784907084418, + "loss": 2.0458, + "step": 1389 + }, + { + "epoch": 0.1761947014830777, + "grad_norm": 1.0078125, + "learning_rate": 0.00011607035619042292, + "loss": 1.7919, + "step": 1390 + }, + { + "epoch": 0.17632146026112308, + "grad_norm": 1.1640625, + "learning_rate": 0.00011606221352991676, + "loss": 2.1364, + "step": 1391 + }, + { + "epoch": 0.17644821903916846, + "grad_norm": 0.86328125, + "learning_rate": 0.00011605406272810577, + "loss": 1.684, + "step": 1392 + }, + { + "epoch": 0.17657497781721385, + "grad_norm": 0.86328125, + "learning_rate": 0.00011604590378617363, + "loss": 2.0512, + "step": 1393 + }, + { + "epoch": 0.17670173659525923, + "grad_norm": 0.875, + "learning_rate": 0.00011603773670530517, + "loss": 2.3032, + "step": 1394 + }, + { + "epoch": 0.1768284953733046, + "grad_norm": 0.9765625, + "learning_rate": 0.00011602956148668639, + "loss": 2.1799, + "step": 1395 + }, + { + "epoch": 0.17695525415135, + "grad_norm": 0.90625, + "learning_rate": 0.00011602137813150451, + "loss": 1.6765, + "step": 1396 + }, + { + "epoch": 0.17708201292939535, + "grad_norm": 0.8984375, + "learning_rate": 0.0001160131866409479, + "loss": 1.8718, + "step": 1397 + }, + { + "epoch": 0.17720877170744073, + "grad_norm": 0.87890625, + "learning_rate": 0.0001160049870162061, + "loss": 2.271, + "step": 1398 + }, + { + "epoch": 0.17733553048548611, + "grad_norm": 1.0, + "learning_rate": 0.00011599677925846988, + "loss": 1.7783, + "step": 1399 + }, + { + "epoch": 0.1774622892635315, + "grad_norm": 0.8984375, + "learning_rate": 0.00011598856336893115, + "loss": 1.9059, + "step": 1400 + }, + { + "epoch": 0.17758904804157688, + "grad_norm": 0.953125, + "learning_rate": 0.00011598033934878304, + "loss": 2.0421, + "step": 1401 + }, + { + "epoch": 0.17771580681962226, + "grad_norm": 0.8515625, + "learning_rate": 0.00011597210719921979, + "loss": 1.9537, + "step": 1402 + }, + { + "epoch": 0.17784256559766765, + "grad_norm": 1.1796875, + "learning_rate": 0.00011596386692143689, + "loss": 2.1808, + "step": 1403 + }, + { + "epoch": 0.17796932437571303, + "grad_norm": 0.890625, + "learning_rate": 0.00011595561851663099, + "loss": 2.0229, + "step": 1404 + }, + { + "epoch": 0.1780960831537584, + "grad_norm": 0.8515625, + "learning_rate": 0.00011594736198599992, + "loss": 1.9081, + "step": 1405 + }, + { + "epoch": 0.17822284193180377, + "grad_norm": 1.4140625, + "learning_rate": 0.00011593909733074268, + "loss": 2.2144, + "step": 1406 + }, + { + "epoch": 0.17834960070984915, + "grad_norm": 1.4296875, + "learning_rate": 0.00011593082455205944, + "loss": 1.8181, + "step": 1407 + }, + { + "epoch": 0.17847635948789453, + "grad_norm": 0.94921875, + "learning_rate": 0.00011592254365115158, + "loss": 2.2321, + "step": 1408 + }, + { + "epoch": 0.17860311826593991, + "grad_norm": 0.92578125, + "learning_rate": 0.00011591425462922164, + "loss": 1.9074, + "step": 1409 + }, + { + "epoch": 0.1787298770439853, + "grad_norm": 1.0390625, + "learning_rate": 0.00011590595748747335, + "loss": 2.5156, + "step": 1410 + }, + { + "epoch": 0.17885663582203068, + "grad_norm": 0.9453125, + "learning_rate": 0.00011589765222711163, + "loss": 2.0608, + "step": 1411 + }, + { + "epoch": 0.17898339460007606, + "grad_norm": 1.0, + "learning_rate": 0.00011588933884934252, + "loss": 2.0114, + "step": 1412 + }, + { + "epoch": 0.17911015337812144, + "grad_norm": 1.390625, + "learning_rate": 0.00011588101735537331, + "loss": 2.1753, + "step": 1413 + }, + { + "epoch": 0.17923691215616683, + "grad_norm": 0.88671875, + "learning_rate": 0.00011587268774641244, + "loss": 1.8167, + "step": 1414 + }, + { + "epoch": 0.17936367093421218, + "grad_norm": 0.94921875, + "learning_rate": 0.00011586435002366953, + "loss": 2.1786, + "step": 1415 + }, + { + "epoch": 0.17949042971225757, + "grad_norm": 0.90234375, + "learning_rate": 0.00011585600418835535, + "loss": 2.0829, + "step": 1416 + }, + { + "epoch": 0.17961718849030295, + "grad_norm": 0.921875, + "learning_rate": 0.00011584765024168191, + "loss": 1.8147, + "step": 1417 + }, + { + "epoch": 0.17974394726834833, + "grad_norm": 0.875, + "learning_rate": 0.00011583928818486235, + "loss": 1.9458, + "step": 1418 + }, + { + "epoch": 0.1798707060463937, + "grad_norm": 0.8984375, + "learning_rate": 0.00011583091801911099, + "loss": 1.966, + "step": 1419 + }, + { + "epoch": 0.1799974648244391, + "grad_norm": 0.89453125, + "learning_rate": 0.00011582253974564335, + "loss": 2.1057, + "step": 1420 + }, + { + "epoch": 0.18012422360248448, + "grad_norm": 0.9765625, + "learning_rate": 0.0001158141533656761, + "loss": 1.9527, + "step": 1421 + }, + { + "epoch": 0.18025098238052986, + "grad_norm": 0.9296875, + "learning_rate": 0.00011580575888042713, + "loss": 1.7309, + "step": 1422 + }, + { + "epoch": 0.18037774115857524, + "grad_norm": 0.90234375, + "learning_rate": 0.00011579735629111546, + "loss": 2.1355, + "step": 1423 + }, + { + "epoch": 0.1805044999366206, + "grad_norm": 0.921875, + "learning_rate": 0.00011578894559896131, + "loss": 1.8338, + "step": 1424 + }, + { + "epoch": 0.18063125871466598, + "grad_norm": 0.9296875, + "learning_rate": 0.00011578052680518607, + "loss": 2.2209, + "step": 1425 + }, + { + "epoch": 0.18075801749271136, + "grad_norm": 0.921875, + "learning_rate": 0.00011577209991101231, + "loss": 2.1022, + "step": 1426 + }, + { + "epoch": 0.18088477627075675, + "grad_norm": 0.9765625, + "learning_rate": 0.0001157636649176638, + "loss": 1.9303, + "step": 1427 + }, + { + "epoch": 0.18101153504880213, + "grad_norm": 0.953125, + "learning_rate": 0.00011575522182636541, + "loss": 2.0104, + "step": 1428 + }, + { + "epoch": 0.1811382938268475, + "grad_norm": 0.8828125, + "learning_rate": 0.0001157467706383433, + "loss": 2.8187, + "step": 1429 + }, + { + "epoch": 0.1812650526048929, + "grad_norm": 1.046875, + "learning_rate": 0.0001157383113548247, + "loss": 2.3011, + "step": 1430 + }, + { + "epoch": 0.18139181138293828, + "grad_norm": 1.0078125, + "learning_rate": 0.00011572984397703807, + "loss": 2.119, + "step": 1431 + }, + { + "epoch": 0.18151857016098366, + "grad_norm": 0.8984375, + "learning_rate": 0.00011572136850621305, + "loss": 2.0259, + "step": 1432 + }, + { + "epoch": 0.18164532893902902, + "grad_norm": 0.890625, + "learning_rate": 0.00011571288494358043, + "loss": 1.8872, + "step": 1433 + }, + { + "epoch": 0.1817720877170744, + "grad_norm": 0.9609375, + "learning_rate": 0.00011570439329037217, + "loss": 1.6205, + "step": 1434 + }, + { + "epoch": 0.18189884649511978, + "grad_norm": 0.93359375, + "learning_rate": 0.00011569589354782145, + "loss": 2.2939, + "step": 1435 + }, + { + "epoch": 0.18202560527316516, + "grad_norm": 0.91796875, + "learning_rate": 0.00011568738571716257, + "loss": 1.9209, + "step": 1436 + }, + { + "epoch": 0.18215236405121055, + "grad_norm": 1.0703125, + "learning_rate": 0.00011567886979963104, + "loss": 1.8242, + "step": 1437 + }, + { + "epoch": 0.18227912282925593, + "grad_norm": 0.98046875, + "learning_rate": 0.00011567034579646353, + "loss": 2.2958, + "step": 1438 + }, + { + "epoch": 0.1824058816073013, + "grad_norm": 0.96875, + "learning_rate": 0.00011566181370889791, + "loss": 2.1523, + "step": 1439 + }, + { + "epoch": 0.1825326403853467, + "grad_norm": 0.80078125, + "learning_rate": 0.00011565327353817316, + "loss": 2.3284, + "step": 1440 + }, + { + "epoch": 0.18265939916339208, + "grad_norm": 0.94140625, + "learning_rate": 0.00011564472528552952, + "loss": 2.4607, + "step": 1441 + }, + { + "epoch": 0.18278615794143743, + "grad_norm": 0.8828125, + "learning_rate": 0.00011563616895220833, + "loss": 2.0969, + "step": 1442 + }, + { + "epoch": 0.18291291671948282, + "grad_norm": 0.82421875, + "learning_rate": 0.00011562760453945214, + "loss": 1.8607, + "step": 1443 + }, + { + "epoch": 0.1830396754975282, + "grad_norm": 0.859375, + "learning_rate": 0.00011561903204850467, + "loss": 1.9982, + "step": 1444 + }, + { + "epoch": 0.18316643427557358, + "grad_norm": 3.625, + "learning_rate": 0.0001156104514806108, + "loss": 1.7434, + "step": 1445 + }, + { + "epoch": 0.18329319305361896, + "grad_norm": 1.0, + "learning_rate": 0.00011560186283701662, + "loss": 1.6313, + "step": 1446 + }, + { + "epoch": 0.18341995183166435, + "grad_norm": 1.0078125, + "learning_rate": 0.00011559326611896935, + "loss": 2.0457, + "step": 1447 + }, + { + "epoch": 0.18354671060970973, + "grad_norm": 1.0546875, + "learning_rate": 0.00011558466132771737, + "loss": 1.7957, + "step": 1448 + }, + { + "epoch": 0.1836734693877551, + "grad_norm": 0.9453125, + "learning_rate": 0.00011557604846451031, + "loss": 2.0291, + "step": 1449 + }, + { + "epoch": 0.1838002281658005, + "grad_norm": 0.9921875, + "learning_rate": 0.00011556742753059887, + "loss": 1.8652, + "step": 1450 + }, + { + "epoch": 0.18392698694384585, + "grad_norm": 0.94140625, + "learning_rate": 0.00011555879852723502, + "loss": 2.059, + "step": 1451 + }, + { + "epoch": 0.18405374572189123, + "grad_norm": 0.9140625, + "learning_rate": 0.00011555016145567183, + "loss": 2.1226, + "step": 1452 + }, + { + "epoch": 0.18418050449993661, + "grad_norm": 0.875, + "learning_rate": 0.00011554151631716358, + "loss": 1.847, + "step": 1453 + }, + { + "epoch": 0.184307263277982, + "grad_norm": 0.9453125, + "learning_rate": 0.0001155328631129657, + "loss": 1.758, + "step": 1454 + }, + { + "epoch": 0.18443402205602738, + "grad_norm": 0.96875, + "learning_rate": 0.00011552420184433481, + "loss": 2.505, + "step": 1455 + }, + { + "epoch": 0.18456078083407276, + "grad_norm": 0.85546875, + "learning_rate": 0.00011551553251252867, + "loss": 2.4045, + "step": 1456 + }, + { + "epoch": 0.18468753961211815, + "grad_norm": 0.8984375, + "learning_rate": 0.00011550685511880626, + "loss": 1.5924, + "step": 1457 + }, + { + "epoch": 0.18481429839016353, + "grad_norm": 1.28125, + "learning_rate": 0.00011549816966442769, + "loss": 1.8448, + "step": 1458 + }, + { + "epoch": 0.1849410571682089, + "grad_norm": 0.96875, + "learning_rate": 0.00011548947615065427, + "loss": 1.8081, + "step": 1459 + }, + { + "epoch": 0.18506781594625427, + "grad_norm": 0.8515625, + "learning_rate": 0.00011548077457874843, + "loss": 1.8125, + "step": 1460 + }, + { + "epoch": 0.18519457472429965, + "grad_norm": 0.88671875, + "learning_rate": 0.00011547206494997383, + "loss": 1.4639, + "step": 1461 + }, + { + "epoch": 0.18532133350234503, + "grad_norm": 0.98046875, + "learning_rate": 0.00011546334726559528, + "loss": 2.3092, + "step": 1462 + }, + { + "epoch": 0.18544809228039041, + "grad_norm": 0.91796875, + "learning_rate": 0.00011545462152687875, + "loss": 2.0931, + "step": 1463 + }, + { + "epoch": 0.1855748510584358, + "grad_norm": 0.91015625, + "learning_rate": 0.00011544588773509136, + "loss": 2.2285, + "step": 1464 + }, + { + "epoch": 0.18570160983648118, + "grad_norm": 0.96875, + "learning_rate": 0.00011543714589150145, + "loss": 1.9125, + "step": 1465 + }, + { + "epoch": 0.18582836861452656, + "grad_norm": 0.8984375, + "learning_rate": 0.00011542839599737849, + "loss": 2.0016, + "step": 1466 + }, + { + "epoch": 0.18595512739257195, + "grad_norm": 0.875, + "learning_rate": 0.00011541963805399313, + "loss": 1.9566, + "step": 1467 + }, + { + "epoch": 0.18608188617061733, + "grad_norm": 0.85546875, + "learning_rate": 0.00011541087206261721, + "loss": 1.6469, + "step": 1468 + }, + { + "epoch": 0.18620864494866268, + "grad_norm": 0.96484375, + "learning_rate": 0.00011540209802452371, + "loss": 2.1394, + "step": 1469 + }, + { + "epoch": 0.18633540372670807, + "grad_norm": 0.90625, + "learning_rate": 0.00011539331594098678, + "loss": 2.2794, + "step": 1470 + }, + { + "epoch": 0.18646216250475345, + "grad_norm": 0.98046875, + "learning_rate": 0.00011538452581328174, + "loss": 1.5623, + "step": 1471 + }, + { + "epoch": 0.18658892128279883, + "grad_norm": 0.95703125, + "learning_rate": 0.00011537572764268512, + "loss": 1.7997, + "step": 1472 + }, + { + "epoch": 0.1867156800608442, + "grad_norm": 0.96484375, + "learning_rate": 0.00011536692143047453, + "loss": 1.8238, + "step": 1473 + }, + { + "epoch": 0.1868424388388896, + "grad_norm": 0.78125, + "learning_rate": 0.00011535810717792885, + "loss": 1.4472, + "step": 1474 + }, + { + "epoch": 0.18696919761693498, + "grad_norm": 0.9453125, + "learning_rate": 0.00011534928488632804, + "loss": 1.8083, + "step": 1475 + }, + { + "epoch": 0.18709595639498036, + "grad_norm": 0.94921875, + "learning_rate": 0.00011534045455695329, + "loss": 1.9584, + "step": 1476 + }, + { + "epoch": 0.18722271517302574, + "grad_norm": 0.92578125, + "learning_rate": 0.00011533161619108694, + "loss": 2.1273, + "step": 1477 + }, + { + "epoch": 0.1873494739510711, + "grad_norm": 1.09375, + "learning_rate": 0.00011532276979001245, + "loss": 2.4269, + "step": 1478 + }, + { + "epoch": 0.18747623272911648, + "grad_norm": 1.4921875, + "learning_rate": 0.00011531391535501451, + "loss": 2.4586, + "step": 1479 + }, + { + "epoch": 0.18760299150716186, + "grad_norm": 0.94921875, + "learning_rate": 0.00011530505288737896, + "loss": 2.3621, + "step": 1480 + }, + { + "epoch": 0.18772975028520725, + "grad_norm": 0.921875, + "learning_rate": 0.0001152961823883928, + "loss": 1.7885, + "step": 1481 + }, + { + "epoch": 0.18785650906325263, + "grad_norm": 0.91015625, + "learning_rate": 0.00011528730385934418, + "loss": 1.8499, + "step": 1482 + }, + { + "epoch": 0.187983267841298, + "grad_norm": 0.99609375, + "learning_rate": 0.00011527841730152244, + "loss": 2.3266, + "step": 1483 + }, + { + "epoch": 0.1881100266193434, + "grad_norm": 1.125, + "learning_rate": 0.0001152695227162181, + "loss": 2.0633, + "step": 1484 + }, + { + "epoch": 0.18823678539738878, + "grad_norm": 0.9453125, + "learning_rate": 0.00011526062010472277, + "loss": 2.0153, + "step": 1485 + }, + { + "epoch": 0.18836354417543416, + "grad_norm": 1.09375, + "learning_rate": 0.00011525170946832934, + "loss": 1.9151, + "step": 1486 + }, + { + "epoch": 0.18849030295347952, + "grad_norm": 1.265625, + "learning_rate": 0.00011524279080833176, + "loss": 2.252, + "step": 1487 + }, + { + "epoch": 0.1886170617315249, + "grad_norm": 1.15625, + "learning_rate": 0.00011523386412602521, + "loss": 2.2172, + "step": 1488 + }, + { + "epoch": 0.18874382050957028, + "grad_norm": 0.94921875, + "learning_rate": 0.00011522492942270601, + "loss": 2.259, + "step": 1489 + }, + { + "epoch": 0.18887057928761566, + "grad_norm": 0.94921875, + "learning_rate": 0.00011521598669967166, + "loss": 1.9296, + "step": 1490 + }, + { + "epoch": 0.18899733806566105, + "grad_norm": 0.8828125, + "learning_rate": 0.00011520703595822081, + "loss": 2.1687, + "step": 1491 + }, + { + "epoch": 0.18912409684370643, + "grad_norm": 1.0, + "learning_rate": 0.00011519807719965326, + "loss": 2.2101, + "step": 1492 + }, + { + "epoch": 0.1892508556217518, + "grad_norm": 0.8984375, + "learning_rate": 0.00011518911042527003, + "loss": 1.9152, + "step": 1493 + }, + { + "epoch": 0.1893776143997972, + "grad_norm": 0.87109375, + "learning_rate": 0.00011518013563637323, + "loss": 2.6482, + "step": 1494 + }, + { + "epoch": 0.18950437317784258, + "grad_norm": 0.953125, + "learning_rate": 0.00011517115283426617, + "loss": 1.6926, + "step": 1495 + }, + { + "epoch": 0.18963113195588793, + "grad_norm": 0.87109375, + "learning_rate": 0.00011516216202025337, + "loss": 2.1468, + "step": 1496 + }, + { + "epoch": 0.18975789073393332, + "grad_norm": 0.91015625, + "learning_rate": 0.0001151531631956404, + "loss": 1.8959, + "step": 1497 + }, + { + "epoch": 0.1898846495119787, + "grad_norm": 0.90625, + "learning_rate": 0.00011514415636173414, + "loss": 2.1068, + "step": 1498 + }, + { + "epoch": 0.19001140829002408, + "grad_norm": 1.1015625, + "learning_rate": 0.00011513514151984249, + "loss": 2.229, + "step": 1499 + }, + { + "epoch": 0.19013816706806946, + "grad_norm": 0.98828125, + "learning_rate": 0.00011512611867127461, + "loss": 1.7983, + "step": 1500 + }, + { + "epoch": 0.19026492584611485, + "grad_norm": 0.89453125, + "learning_rate": 0.00011511708781734078, + "loss": 1.9443, + "step": 1501 + }, + { + "epoch": 0.19039168462416023, + "grad_norm": 0.9609375, + "learning_rate": 0.00011510804895935245, + "loss": 2.0794, + "step": 1502 + }, + { + "epoch": 0.1905184434022056, + "grad_norm": 0.9453125, + "learning_rate": 0.00011509900209862224, + "loss": 1.7023, + "step": 1503 + }, + { + "epoch": 0.190645202180251, + "grad_norm": 0.9765625, + "learning_rate": 0.00011508994723646392, + "loss": 2.4678, + "step": 1504 + }, + { + "epoch": 0.19077196095829635, + "grad_norm": 0.97265625, + "learning_rate": 0.00011508088437419244, + "loss": 1.8963, + "step": 1505 + }, + { + "epoch": 0.19089871973634173, + "grad_norm": 0.89453125, + "learning_rate": 0.00011507181351312389, + "loss": 1.9281, + "step": 1506 + }, + { + "epoch": 0.19102547851438711, + "grad_norm": 0.8828125, + "learning_rate": 0.00011506273465457555, + "loss": 1.7779, + "step": 1507 + }, + { + "epoch": 0.1911522372924325, + "grad_norm": 0.91015625, + "learning_rate": 0.00011505364779986583, + "loss": 2.4982, + "step": 1508 + }, + { + "epoch": 0.19127899607047788, + "grad_norm": 0.94140625, + "learning_rate": 0.0001150445529503143, + "loss": 1.731, + "step": 1509 + }, + { + "epoch": 0.19140575484852326, + "grad_norm": 0.97265625, + "learning_rate": 0.00011503545010724173, + "loss": 2.2086, + "step": 1510 + }, + { + "epoch": 0.19153251362656865, + "grad_norm": 0.9453125, + "learning_rate": 0.00011502633927197002, + "loss": 2.1011, + "step": 1511 + }, + { + "epoch": 0.19165927240461403, + "grad_norm": 0.9765625, + "learning_rate": 0.00011501722044582224, + "loss": 2.2627, + "step": 1512 + }, + { + "epoch": 0.1917860311826594, + "grad_norm": 0.96875, + "learning_rate": 0.00011500809363012261, + "loss": 1.9455, + "step": 1513 + }, + { + "epoch": 0.19191278996070477, + "grad_norm": 0.87109375, + "learning_rate": 0.00011499895882619653, + "loss": 2.2059, + "step": 1514 + }, + { + "epoch": 0.19203954873875015, + "grad_norm": 1.25, + "learning_rate": 0.00011498981603537054, + "loss": 1.7699, + "step": 1515 + }, + { + "epoch": 0.19216630751679553, + "grad_norm": 0.9453125, + "learning_rate": 0.00011498066525897234, + "loss": 1.5404, + "step": 1516 + }, + { + "epoch": 0.19229306629484091, + "grad_norm": 0.9921875, + "learning_rate": 0.00011497150649833082, + "loss": 2.1719, + "step": 1517 + }, + { + "epoch": 0.1924198250728863, + "grad_norm": 0.96875, + "learning_rate": 0.00011496233975477598, + "loss": 2.3721, + "step": 1518 + }, + { + "epoch": 0.19254658385093168, + "grad_norm": 0.9140625, + "learning_rate": 0.00011495316502963902, + "loss": 1.9466, + "step": 1519 + }, + { + "epoch": 0.19267334262897706, + "grad_norm": 0.94921875, + "learning_rate": 0.0001149439823242523, + "loss": 2.0047, + "step": 1520 + }, + { + "epoch": 0.19280010140702245, + "grad_norm": 1.1171875, + "learning_rate": 0.0001149347916399493, + "loss": 2.1183, + "step": 1521 + }, + { + "epoch": 0.19292686018506783, + "grad_norm": 1.046875, + "learning_rate": 0.00011492559297806468, + "loss": 1.6834, + "step": 1522 + }, + { + "epoch": 0.19305361896311318, + "grad_norm": 0.90234375, + "learning_rate": 0.00011491638633993428, + "loss": 1.7123, + "step": 1523 + }, + { + "epoch": 0.19318037774115857, + "grad_norm": 0.9375, + "learning_rate": 0.00011490717172689509, + "loss": 2.0399, + "step": 1524 + }, + { + "epoch": 0.19330713651920395, + "grad_norm": 1.2734375, + "learning_rate": 0.00011489794914028521, + "loss": 2.8563, + "step": 1525 + }, + { + "epoch": 0.19343389529724933, + "grad_norm": 0.97265625, + "learning_rate": 0.00011488871858144398, + "loss": 1.9132, + "step": 1526 + }, + { + "epoch": 0.1935606540752947, + "grad_norm": 0.953125, + "learning_rate": 0.00011487948005171181, + "loss": 1.7047, + "step": 1527 + }, + { + "epoch": 0.1936874128533401, + "grad_norm": 0.8359375, + "learning_rate": 0.00011487023355243036, + "loss": 2.7341, + "step": 1528 + }, + { + "epoch": 0.19381417163138548, + "grad_norm": 0.99609375, + "learning_rate": 0.00011486097908494233, + "loss": 2.0278, + "step": 1529 + }, + { + "epoch": 0.19394093040943086, + "grad_norm": 0.96875, + "learning_rate": 0.00011485171665059171, + "loss": 2.1076, + "step": 1530 + }, + { + "epoch": 0.19406768918747624, + "grad_norm": 0.91015625, + "learning_rate": 0.00011484244625072356, + "loss": 2.0605, + "step": 1531 + }, + { + "epoch": 0.1941944479655216, + "grad_norm": 0.89453125, + "learning_rate": 0.00011483316788668412, + "loss": 2.3451, + "step": 1532 + }, + { + "epoch": 0.19432120674356698, + "grad_norm": 0.8984375, + "learning_rate": 0.00011482388155982077, + "loss": 2.3693, + "step": 1533 + }, + { + "epoch": 0.19444796552161236, + "grad_norm": 0.90234375, + "learning_rate": 0.00011481458727148208, + "loss": 2.3917, + "step": 1534 + }, + { + "epoch": 0.19457472429965775, + "grad_norm": 0.89453125, + "learning_rate": 0.00011480528502301776, + "loss": 2.214, + "step": 1535 + }, + { + "epoch": 0.19470148307770313, + "grad_norm": 0.8984375, + "learning_rate": 0.00011479597481577866, + "loss": 2.3455, + "step": 1536 + }, + { + "epoch": 0.1948282418557485, + "grad_norm": 1.0078125, + "learning_rate": 0.00011478665665111682, + "loss": 1.9683, + "step": 1537 + }, + { + "epoch": 0.1949550006337939, + "grad_norm": 0.91796875, + "learning_rate": 0.0001147773305303854, + "loss": 1.5155, + "step": 1538 + }, + { + "epoch": 0.19508175941183928, + "grad_norm": 0.890625, + "learning_rate": 0.00011476799645493872, + "loss": 1.5717, + "step": 1539 + }, + { + "epoch": 0.19520851818988466, + "grad_norm": 0.9140625, + "learning_rate": 0.0001147586544261323, + "loss": 1.7679, + "step": 1540 + }, + { + "epoch": 0.19533527696793002, + "grad_norm": 0.8359375, + "learning_rate": 0.00011474930444532275, + "loss": 2.1292, + "step": 1541 + }, + { + "epoch": 0.1954620357459754, + "grad_norm": 0.93359375, + "learning_rate": 0.00011473994651386788, + "loss": 2.2694, + "step": 1542 + }, + { + "epoch": 0.19558879452402078, + "grad_norm": 0.97265625, + "learning_rate": 0.00011473058063312664, + "loss": 1.8529, + "step": 1543 + }, + { + "epoch": 0.19571555330206616, + "grad_norm": 0.9921875, + "learning_rate": 0.00011472120680445914, + "loss": 1.6976, + "step": 1544 + }, + { + "epoch": 0.19584231208011155, + "grad_norm": 1.125, + "learning_rate": 0.00011471182502922664, + "loss": 2.3781, + "step": 1545 + }, + { + "epoch": 0.19596907085815693, + "grad_norm": 0.8984375, + "learning_rate": 0.00011470243530879152, + "loss": 2.1917, + "step": 1546 + }, + { + "epoch": 0.1960958296362023, + "grad_norm": 1.109375, + "learning_rate": 0.00011469303764451738, + "loss": 2.7729, + "step": 1547 + }, + { + "epoch": 0.1962225884142477, + "grad_norm": 0.89453125, + "learning_rate": 0.00011468363203776895, + "loss": 1.8288, + "step": 1548 + }, + { + "epoch": 0.19634934719229308, + "grad_norm": 0.8515625, + "learning_rate": 0.00011467421848991207, + "loss": 2.2218, + "step": 1549 + }, + { + "epoch": 0.19647610597033843, + "grad_norm": 1.0, + "learning_rate": 0.00011466479700231378, + "loss": 2.0362, + "step": 1550 + }, + { + "epoch": 0.19660286474838382, + "grad_norm": 0.9296875, + "learning_rate": 0.00011465536757634227, + "loss": 2.1633, + "step": 1551 + }, + { + "epoch": 0.1967296235264292, + "grad_norm": 0.88671875, + "learning_rate": 0.00011464593021336686, + "loss": 1.5545, + "step": 1552 + }, + { + "epoch": 0.19685638230447458, + "grad_norm": 2.296875, + "learning_rate": 0.00011463648491475807, + "loss": 1.7115, + "step": 1553 + }, + { + "epoch": 0.19698314108251996, + "grad_norm": 0.97265625, + "learning_rate": 0.00011462703168188747, + "loss": 1.7945, + "step": 1554 + }, + { + "epoch": 0.19710989986056535, + "grad_norm": 0.86328125, + "learning_rate": 0.00011461757051612792, + "loss": 2.0922, + "step": 1555 + }, + { + "epoch": 0.19723665863861073, + "grad_norm": 0.9765625, + "learning_rate": 0.00011460810141885331, + "loss": 1.7561, + "step": 1556 + }, + { + "epoch": 0.1973634174166561, + "grad_norm": 0.921875, + "learning_rate": 0.00011459862439143877, + "loss": 2.034, + "step": 1557 + }, + { + "epoch": 0.1974901761947015, + "grad_norm": 13.75, + "learning_rate": 0.00011458913943526053, + "loss": 1.5943, + "step": 1558 + }, + { + "epoch": 0.19761693497274685, + "grad_norm": 1.046875, + "learning_rate": 0.00011457964655169599, + "loss": 2.0073, + "step": 1559 + }, + { + "epoch": 0.19774369375079223, + "grad_norm": 0.8125, + "learning_rate": 0.00011457014574212369, + "loss": 1.6193, + "step": 1560 + }, + { + "epoch": 0.19787045252883761, + "grad_norm": 0.96875, + "learning_rate": 0.00011456063700792333, + "loss": 2.6768, + "step": 1561 + }, + { + "epoch": 0.197997211306883, + "grad_norm": 0.96484375, + "learning_rate": 0.00011455112035047578, + "loss": 2.4781, + "step": 1562 + }, + { + "epoch": 0.19812397008492838, + "grad_norm": 0.8671875, + "learning_rate": 0.00011454159577116302, + "loss": 1.853, + "step": 1563 + }, + { + "epoch": 0.19825072886297376, + "grad_norm": 1.0390625, + "learning_rate": 0.00011453206327136821, + "loss": 2.5363, + "step": 1564 + }, + { + "epoch": 0.19837748764101915, + "grad_norm": 0.9140625, + "learning_rate": 0.00011452252285247564, + "loss": 1.9284, + "step": 1565 + }, + { + "epoch": 0.19850424641906453, + "grad_norm": 0.86328125, + "learning_rate": 0.00011451297451587077, + "loss": 1.5369, + "step": 1566 + }, + { + "epoch": 0.1986310051971099, + "grad_norm": 0.8671875, + "learning_rate": 0.00011450341826294022, + "loss": 1.8416, + "step": 1567 + }, + { + "epoch": 0.19875776397515527, + "grad_norm": 0.89453125, + "learning_rate": 0.0001144938540950717, + "loss": 2.1704, + "step": 1568 + }, + { + "epoch": 0.19888452275320065, + "grad_norm": 0.99609375, + "learning_rate": 0.00011448428201365414, + "loss": 1.6665, + "step": 1569 + }, + { + "epoch": 0.19901128153124603, + "grad_norm": 0.88671875, + "learning_rate": 0.00011447470202007757, + "loss": 2.2956, + "step": 1570 + }, + { + "epoch": 0.19913804030929141, + "grad_norm": 0.8828125, + "learning_rate": 0.0001144651141157332, + "loss": 1.7257, + "step": 1571 + }, + { + "epoch": 0.1992647990873368, + "grad_norm": 1.0078125, + "learning_rate": 0.00011445551830201338, + "loss": 2.2848, + "step": 1572 + }, + { + "epoch": 0.19939155786538218, + "grad_norm": 1.078125, + "learning_rate": 0.0001144459145803116, + "loss": 2.0587, + "step": 1573 + }, + { + "epoch": 0.19951831664342756, + "grad_norm": 0.953125, + "learning_rate": 0.0001144363029520225, + "loss": 1.9333, + "step": 1574 + }, + { + "epoch": 0.19964507542147295, + "grad_norm": 0.96484375, + "learning_rate": 0.00011442668341854188, + "loss": 1.9749, + "step": 1575 + }, + { + "epoch": 0.19977183419951833, + "grad_norm": 0.89453125, + "learning_rate": 0.00011441705598126667, + "loss": 2.2388, + "step": 1576 + }, + { + "epoch": 0.19989859297756368, + "grad_norm": 1.0078125, + "learning_rate": 0.00011440742064159498, + "loss": 1.9092, + "step": 1577 + }, + { + "epoch": 0.20002535175560907, + "grad_norm": 0.9296875, + "learning_rate": 0.00011439777740092602, + "loss": 2.3095, + "step": 1578 + }, + { + "epoch": 0.20015211053365445, + "grad_norm": 0.84765625, + "learning_rate": 0.00011438812626066017, + "loss": 2.1366, + "step": 1579 + }, + { + "epoch": 0.20027886931169983, + "grad_norm": 1.0078125, + "learning_rate": 0.00011437846722219901, + "loss": 2.3277, + "step": 1580 + }, + { + "epoch": 0.2004056280897452, + "grad_norm": 0.921875, + "learning_rate": 0.00011436880028694515, + "loss": 1.8407, + "step": 1581 + }, + { + "epoch": 0.2005323868677906, + "grad_norm": 1.109375, + "learning_rate": 0.00011435912545630244, + "loss": 2.1008, + "step": 1582 + }, + { + "epoch": 0.20065914564583598, + "grad_norm": 0.8984375, + "learning_rate": 0.00011434944273167588, + "loss": 2.2148, + "step": 1583 + }, + { + "epoch": 0.20078590442388136, + "grad_norm": 0.8671875, + "learning_rate": 0.00011433975211447154, + "loss": 1.9835, + "step": 1584 + }, + { + "epoch": 0.20091266320192674, + "grad_norm": 0.890625, + "learning_rate": 0.0001143300536060967, + "loss": 2.3527, + "step": 1585 + }, + { + "epoch": 0.2010394219799721, + "grad_norm": 0.96484375, + "learning_rate": 0.00011432034720795979, + "loss": 1.6061, + "step": 1586 + }, + { + "epoch": 0.20116618075801748, + "grad_norm": 1.0234375, + "learning_rate": 0.00011431063292147033, + "loss": 2.3286, + "step": 1587 + }, + { + "epoch": 0.20129293953606286, + "grad_norm": 1.234375, + "learning_rate": 0.00011430091074803904, + "loss": 1.8174, + "step": 1588 + }, + { + "epoch": 0.20141969831410825, + "grad_norm": 0.8671875, + "learning_rate": 0.00011429118068907776, + "loss": 1.5983, + "step": 1589 + }, + { + "epoch": 0.20154645709215363, + "grad_norm": 1.0078125, + "learning_rate": 0.00011428144274599949, + "loss": 2.337, + "step": 1590 + }, + { + "epoch": 0.201673215870199, + "grad_norm": 0.9765625, + "learning_rate": 0.00011427169692021836, + "loss": 1.8694, + "step": 1591 + }, + { + "epoch": 0.2017999746482444, + "grad_norm": 0.87109375, + "learning_rate": 0.00011426194321314964, + "loss": 1.8436, + "step": 1592 + }, + { + "epoch": 0.20192673342628978, + "grad_norm": 0.91796875, + "learning_rate": 0.00011425218162620978, + "loss": 1.8827, + "step": 1593 + }, + { + "epoch": 0.20205349220433516, + "grad_norm": 0.921875, + "learning_rate": 0.00011424241216081631, + "loss": 1.6589, + "step": 1594 + }, + { + "epoch": 0.20218025098238052, + "grad_norm": 0.9921875, + "learning_rate": 0.00011423263481838797, + "loss": 2.0369, + "step": 1595 + }, + { + "epoch": 0.2023070097604259, + "grad_norm": 0.8828125, + "learning_rate": 0.00011422284960034462, + "loss": 1.694, + "step": 1596 + }, + { + "epoch": 0.20243376853847128, + "grad_norm": 1.125, + "learning_rate": 0.00011421305650810725, + "loss": 1.9539, + "step": 1597 + }, + { + "epoch": 0.20256052731651666, + "grad_norm": 1.0, + "learning_rate": 0.000114203255543098, + "loss": 1.9343, + "step": 1598 + }, + { + "epoch": 0.20268728609456205, + "grad_norm": 0.8984375, + "learning_rate": 0.00011419344670674017, + "loss": 2.5027, + "step": 1599 + }, + { + "epoch": 0.20281404487260743, + "grad_norm": 0.8515625, + "learning_rate": 0.00011418363000045817, + "loss": 1.9408, + "step": 1600 + }, + { + "epoch": 0.2029408036506528, + "grad_norm": 0.86328125, + "learning_rate": 0.0001141738054256776, + "loss": 2.428, + "step": 1601 + }, + { + "epoch": 0.2030675624286982, + "grad_norm": 0.90625, + "learning_rate": 0.00011416397298382518, + "loss": 2.2939, + "step": 1602 + }, + { + "epoch": 0.20319432120674358, + "grad_norm": 1.0234375, + "learning_rate": 0.00011415413267632874, + "loss": 2.4483, + "step": 1603 + }, + { + "epoch": 0.20332107998478896, + "grad_norm": 0.9296875, + "learning_rate": 0.00011414428450461729, + "loss": 1.6756, + "step": 1604 + }, + { + "epoch": 0.20344783876283432, + "grad_norm": 0.83984375, + "learning_rate": 0.00011413442847012098, + "loss": 1.9263, + "step": 1605 + }, + { + "epoch": 0.2035745975408797, + "grad_norm": 0.97265625, + "learning_rate": 0.00011412456457427108, + "loss": 1.4446, + "step": 1606 + }, + { + "epoch": 0.20370135631892508, + "grad_norm": 0.96875, + "learning_rate": 0.00011411469281850004, + "loss": 1.5815, + "step": 1607 + }, + { + "epoch": 0.20382811509697046, + "grad_norm": 0.86328125, + "learning_rate": 0.0001141048132042414, + "loss": 2.1134, + "step": 1608 + }, + { + "epoch": 0.20395487387501585, + "grad_norm": 0.85546875, + "learning_rate": 0.00011409492573292988, + "loss": 1.6897, + "step": 1609 + }, + { + "epoch": 0.20408163265306123, + "grad_norm": 0.8125, + "learning_rate": 0.00011408503040600136, + "loss": 2.0431, + "step": 1610 + }, + { + "epoch": 0.2042083914311066, + "grad_norm": 0.86328125, + "learning_rate": 0.00011407512722489278, + "loss": 2.1309, + "step": 1611 + }, + { + "epoch": 0.204335150209152, + "grad_norm": 0.93359375, + "learning_rate": 0.0001140652161910423, + "loss": 1.8774, + "step": 1612 + }, + { + "epoch": 0.20446190898719738, + "grad_norm": 1.0234375, + "learning_rate": 0.00011405529730588918, + "loss": 1.9765, + "step": 1613 + }, + { + "epoch": 0.20458866776524273, + "grad_norm": 1.3203125, + "learning_rate": 0.00011404537057087386, + "loss": 2.7709, + "step": 1614 + }, + { + "epoch": 0.20471542654328811, + "grad_norm": 0.984375, + "learning_rate": 0.00011403543598743785, + "loss": 1.9313, + "step": 1615 + }, + { + "epoch": 0.2048421853213335, + "grad_norm": 0.9296875, + "learning_rate": 0.00011402549355702387, + "loss": 2.0697, + "step": 1616 + }, + { + "epoch": 0.20496894409937888, + "grad_norm": 1.0234375, + "learning_rate": 0.00011401554328107571, + "loss": 2.0153, + "step": 1617 + }, + { + "epoch": 0.20509570287742426, + "grad_norm": 0.9921875, + "learning_rate": 0.00011400558516103842, + "loss": 1.8693, + "step": 1618 + }, + { + "epoch": 0.20522246165546965, + "grad_norm": 0.78125, + "learning_rate": 0.00011399561919835803, + "loss": 2.1455, + "step": 1619 + }, + { + "epoch": 0.20534922043351503, + "grad_norm": 1.109375, + "learning_rate": 0.00011398564539448184, + "loss": 2.1132, + "step": 1620 + }, + { + "epoch": 0.2054759792115604, + "grad_norm": 0.921875, + "learning_rate": 0.00011397566375085821, + "loss": 1.6704, + "step": 1621 + }, + { + "epoch": 0.2056027379896058, + "grad_norm": 0.921875, + "learning_rate": 0.00011396567426893666, + "loss": 2.2182, + "step": 1622 + }, + { + "epoch": 0.20572949676765115, + "grad_norm": 1.03125, + "learning_rate": 0.00011395567695016787, + "loss": 1.7067, + "step": 1623 + }, + { + "epoch": 0.20585625554569653, + "grad_norm": 0.98828125, + "learning_rate": 0.00011394567179600364, + "loss": 2.0757, + "step": 1624 + }, + { + "epoch": 0.20598301432374191, + "grad_norm": 0.9296875, + "learning_rate": 0.00011393565880789691, + "loss": 1.8869, + "step": 1625 + }, + { + "epoch": 0.2061097731017873, + "grad_norm": 0.921875, + "learning_rate": 0.00011392563798730175, + "loss": 2.1026, + "step": 1626 + }, + { + "epoch": 0.20623653187983268, + "grad_norm": 0.8515625, + "learning_rate": 0.00011391560933567339, + "loss": 1.4794, + "step": 1627 + }, + { + "epoch": 0.20636329065787806, + "grad_norm": 0.9453125, + "learning_rate": 0.00011390557285446814, + "loss": 2.0066, + "step": 1628 + }, + { + "epoch": 0.20649004943592345, + "grad_norm": 0.9296875, + "learning_rate": 0.00011389552854514354, + "loss": 1.9564, + "step": 1629 + }, + { + "epoch": 0.20661680821396883, + "grad_norm": 1.0625, + "learning_rate": 0.0001138854764091582, + "loss": 2.2551, + "step": 1630 + }, + { + "epoch": 0.2067435669920142, + "grad_norm": 1.0390625, + "learning_rate": 0.00011387541644797187, + "loss": 2.1688, + "step": 1631 + }, + { + "epoch": 0.20687032577005957, + "grad_norm": 0.98046875, + "learning_rate": 0.00011386534866304547, + "loss": 2.0605, + "step": 1632 + }, + { + "epoch": 0.20699708454810495, + "grad_norm": 0.8671875, + "learning_rate": 0.00011385527305584101, + "loss": 1.798, + "step": 1633 + }, + { + "epoch": 0.20712384332615033, + "grad_norm": 0.8359375, + "learning_rate": 0.00011384518962782168, + "loss": 1.927, + "step": 1634 + }, + { + "epoch": 0.2072506021041957, + "grad_norm": 0.97265625, + "learning_rate": 0.00011383509838045177, + "loss": 1.6926, + "step": 1635 + }, + { + "epoch": 0.2073773608822411, + "grad_norm": 0.93359375, + "learning_rate": 0.00011382499931519671, + "loss": 1.6868, + "step": 1636 + }, + { + "epoch": 0.20750411966028648, + "grad_norm": 0.83203125, + "learning_rate": 0.00011381489243352312, + "loss": 2.2065, + "step": 1637 + }, + { + "epoch": 0.20763087843833186, + "grad_norm": 0.85546875, + "learning_rate": 0.00011380477773689868, + "loss": 2.0217, + "step": 1638 + }, + { + "epoch": 0.20775763721637724, + "grad_norm": 0.921875, + "learning_rate": 0.00011379465522679227, + "loss": 2.1434, + "step": 1639 + }, + { + "epoch": 0.20788439599442263, + "grad_norm": 0.9453125, + "learning_rate": 0.00011378452490467382, + "loss": 1.7276, + "step": 1640 + }, + { + "epoch": 0.20801115477246798, + "grad_norm": 0.83984375, + "learning_rate": 0.00011377438677201449, + "loss": 1.9917, + "step": 1641 + }, + { + "epoch": 0.20813791355051336, + "grad_norm": 0.85546875, + "learning_rate": 0.00011376424083028652, + "loss": 1.5198, + "step": 1642 + }, + { + "epoch": 0.20826467232855875, + "grad_norm": 0.9453125, + "learning_rate": 0.00011375408708096327, + "loss": 1.389, + "step": 1643 + }, + { + "epoch": 0.20839143110660413, + "grad_norm": 0.8046875, + "learning_rate": 0.00011374392552551931, + "loss": 1.9101, + "step": 1644 + }, + { + "epoch": 0.2085181898846495, + "grad_norm": 0.94140625, + "learning_rate": 0.00011373375616543023, + "loss": 2.1235, + "step": 1645 + }, + { + "epoch": 0.2086449486626949, + "grad_norm": 0.8828125, + "learning_rate": 0.00011372357900217286, + "loss": 2.0348, + "step": 1646 + }, + { + "epoch": 0.20877170744074028, + "grad_norm": 0.9375, + "learning_rate": 0.00011371339403722512, + "loss": 1.723, + "step": 1647 + }, + { + "epoch": 0.20889846621878566, + "grad_norm": 0.875, + "learning_rate": 0.00011370320127206602, + "loss": 2.155, + "step": 1648 + }, + { + "epoch": 0.20902522499683104, + "grad_norm": 0.91015625, + "learning_rate": 0.00011369300070817578, + "loss": 1.8449, + "step": 1649 + }, + { + "epoch": 0.2091519837748764, + "grad_norm": 0.83203125, + "learning_rate": 0.00011368279234703571, + "loss": 2.4111, + "step": 1650 + }, + { + "epoch": 0.20927874255292178, + "grad_norm": 0.875, + "learning_rate": 0.00011367257619012826, + "loss": 2.1259, + "step": 1651 + }, + { + "epoch": 0.20940550133096716, + "grad_norm": 0.9609375, + "learning_rate": 0.000113662352238937, + "loss": 2.2397, + "step": 1652 + }, + { + "epoch": 0.20953226010901255, + "grad_norm": 1.0078125, + "learning_rate": 0.00011365212049494665, + "loss": 2.3617, + "step": 1653 + }, + { + "epoch": 0.20965901888705793, + "grad_norm": 0.921875, + "learning_rate": 0.00011364188095964306, + "loss": 2.0238, + "step": 1654 + }, + { + "epoch": 0.2097857776651033, + "grad_norm": 0.8984375, + "learning_rate": 0.00011363163363451321, + "loss": 2.0522, + "step": 1655 + }, + { + "epoch": 0.2099125364431487, + "grad_norm": 0.98046875, + "learning_rate": 0.0001136213785210452, + "loss": 2.2515, + "step": 1656 + }, + { + "epoch": 0.21003929522119408, + "grad_norm": 0.90234375, + "learning_rate": 0.00011361111562072825, + "loss": 2.1909, + "step": 1657 + }, + { + "epoch": 0.21016605399923946, + "grad_norm": 1.0546875, + "learning_rate": 0.00011360084493505276, + "loss": 2.1522, + "step": 1658 + }, + { + "epoch": 0.21029281277728482, + "grad_norm": 0.88671875, + "learning_rate": 0.0001135905664655102, + "loss": 2.0019, + "step": 1659 + }, + { + "epoch": 0.2104195715553302, + "grad_norm": 0.83984375, + "learning_rate": 0.00011358028021359325, + "loss": 2.1471, + "step": 1660 + }, + { + "epoch": 0.21054633033337558, + "grad_norm": 1.3671875, + "learning_rate": 0.0001135699861807956, + "loss": 2.2812, + "step": 1661 + }, + { + "epoch": 0.21067308911142096, + "grad_norm": 1.0234375, + "learning_rate": 0.00011355968436861222, + "loss": 2.0823, + "step": 1662 + }, + { + "epoch": 0.21079984788946635, + "grad_norm": 0.81640625, + "learning_rate": 0.00011354937477853906, + "loss": 1.8609, + "step": 1663 + }, + { + "epoch": 0.21092660666751173, + "grad_norm": 0.86328125, + "learning_rate": 0.0001135390574120733, + "loss": 1.9208, + "step": 1664 + }, + { + "epoch": 0.2110533654455571, + "grad_norm": 1.0234375, + "learning_rate": 0.00011352873227071321, + "loss": 1.7732, + "step": 1665 + }, + { + "epoch": 0.2111801242236025, + "grad_norm": 0.94921875, + "learning_rate": 0.00011351839935595823, + "loss": 1.785, + "step": 1666 + }, + { + "epoch": 0.21130688300164788, + "grad_norm": 0.98046875, + "learning_rate": 0.00011350805866930884, + "loss": 1.8062, + "step": 1667 + }, + { + "epoch": 0.21143364177969323, + "grad_norm": 0.95703125, + "learning_rate": 0.00011349771021226676, + "loss": 1.9102, + "step": 1668 + }, + { + "epoch": 0.21156040055773861, + "grad_norm": 0.9609375, + "learning_rate": 0.00011348735398633474, + "loss": 1.7619, + "step": 1669 + }, + { + "epoch": 0.211687159335784, + "grad_norm": 0.90625, + "learning_rate": 0.00011347698999301675, + "loss": 2.0346, + "step": 1670 + }, + { + "epoch": 0.21181391811382938, + "grad_norm": 0.88671875, + "learning_rate": 0.00011346661823381778, + "loss": 2.471, + "step": 1671 + }, + { + "epoch": 0.21194067689187476, + "grad_norm": 0.9921875, + "learning_rate": 0.00011345623871024406, + "loss": 1.9056, + "step": 1672 + }, + { + "epoch": 0.21206743566992015, + "grad_norm": 1.03125, + "learning_rate": 0.00011344585142380286, + "loss": 2.5207, + "step": 1673 + }, + { + "epoch": 0.21219419444796553, + "grad_norm": 0.96875, + "learning_rate": 0.00011343545637600264, + "loss": 1.7837, + "step": 1674 + }, + { + "epoch": 0.2123209532260109, + "grad_norm": 1.078125, + "learning_rate": 0.00011342505356835294, + "loss": 2.2347, + "step": 1675 + }, + { + "epoch": 0.2124477120040563, + "grad_norm": 0.87890625, + "learning_rate": 0.00011341464300236444, + "loss": 2.0306, + "step": 1676 + }, + { + "epoch": 0.21257447078210165, + "grad_norm": 0.91796875, + "learning_rate": 0.00011340422467954899, + "loss": 1.9878, + "step": 1677 + }, + { + "epoch": 0.21270122956014703, + "grad_norm": 0.97265625, + "learning_rate": 0.0001133937986014195, + "loss": 2.069, + "step": 1678 + }, + { + "epoch": 0.21282798833819241, + "grad_norm": 0.84375, + "learning_rate": 0.00011338336476949003, + "loss": 2.3773, + "step": 1679 + }, + { + "epoch": 0.2129547471162378, + "grad_norm": 0.8984375, + "learning_rate": 0.00011337292318527578, + "loss": 1.6187, + "step": 1680 + }, + { + "epoch": 0.21308150589428318, + "grad_norm": 0.875, + "learning_rate": 0.00011336247385029309, + "loss": 2.2853, + "step": 1681 + }, + { + "epoch": 0.21320826467232856, + "grad_norm": 0.90625, + "learning_rate": 0.00011335201676605939, + "loss": 1.738, + "step": 1682 + }, + { + "epoch": 0.21333502345037395, + "grad_norm": 0.82421875, + "learning_rate": 0.00011334155193409322, + "loss": 1.663, + "step": 1683 + }, + { + "epoch": 0.21346178222841933, + "grad_norm": 0.9375, + "learning_rate": 0.0001133310793559143, + "loss": 1.7941, + "step": 1684 + }, + { + "epoch": 0.2135885410064647, + "grad_norm": 0.9296875, + "learning_rate": 0.00011332059903304346, + "loss": 2.1791, + "step": 1685 + }, + { + "epoch": 0.21371529978451007, + "grad_norm": 0.91015625, + "learning_rate": 0.0001133101109670026, + "loss": 1.8736, + "step": 1686 + }, + { + "epoch": 0.21384205856255545, + "grad_norm": 1.296875, + "learning_rate": 0.00011329961515931483, + "loss": 2.4165, + "step": 1687 + }, + { + "epoch": 0.21396881734060083, + "grad_norm": 0.9453125, + "learning_rate": 0.00011328911161150433, + "loss": 1.7651, + "step": 1688 + }, + { + "epoch": 0.2140955761186462, + "grad_norm": 0.890625, + "learning_rate": 0.00011327860032509642, + "loss": 2.1969, + "step": 1689 + }, + { + "epoch": 0.2142223348966916, + "grad_norm": 0.9140625, + "learning_rate": 0.00011326808130161752, + "loss": 1.6884, + "step": 1690 + }, + { + "epoch": 0.21434909367473698, + "grad_norm": 0.8515625, + "learning_rate": 0.00011325755454259522, + "loss": 1.9444, + "step": 1691 + }, + { + "epoch": 0.21447585245278236, + "grad_norm": 0.98046875, + "learning_rate": 0.0001132470200495582, + "loss": 1.7434, + "step": 1692 + }, + { + "epoch": 0.21460261123082774, + "grad_norm": 0.91015625, + "learning_rate": 0.00011323647782403624, + "loss": 2.0652, + "step": 1693 + }, + { + "epoch": 0.21472937000887313, + "grad_norm": 1.046875, + "learning_rate": 0.00011322592786756029, + "loss": 2.2021, + "step": 1694 + }, + { + "epoch": 0.21485612878691848, + "grad_norm": 1.078125, + "learning_rate": 0.00011321537018166243, + "loss": 2.3423, + "step": 1695 + }, + { + "epoch": 0.21498288756496386, + "grad_norm": 0.92578125, + "learning_rate": 0.00011320480476787582, + "loss": 1.959, + "step": 1696 + }, + { + "epoch": 0.21510964634300925, + "grad_norm": 0.91796875, + "learning_rate": 0.00011319423162773475, + "loss": 2.2949, + "step": 1697 + }, + { + "epoch": 0.21523640512105463, + "grad_norm": 0.8984375, + "learning_rate": 0.00011318365076277463, + "loss": 2.2568, + "step": 1698 + }, + { + "epoch": 0.2153631638991, + "grad_norm": 0.98828125, + "learning_rate": 0.00011317306217453204, + "loss": 1.9458, + "step": 1699 + }, + { + "epoch": 0.2154899226771454, + "grad_norm": 0.94140625, + "learning_rate": 0.00011316246586454464, + "loss": 2.1855, + "step": 1700 + }, + { + "epoch": 0.21561668145519078, + "grad_norm": 1.359375, + "learning_rate": 0.0001131518618343512, + "loss": 2.1287, + "step": 1701 + }, + { + "epoch": 0.21574344023323616, + "grad_norm": 0.91796875, + "learning_rate": 0.00011314125008549163, + "loss": 1.9651, + "step": 1702 + }, + { + "epoch": 0.21587019901128154, + "grad_norm": 0.9765625, + "learning_rate": 0.00011313063061950695, + "loss": 2.084, + "step": 1703 + }, + { + "epoch": 0.2159969577893269, + "grad_norm": 1.03125, + "learning_rate": 0.00011312000343793935, + "loss": 2.3129, + "step": 1704 + }, + { + "epoch": 0.21612371656737228, + "grad_norm": 0.9453125, + "learning_rate": 0.00011310936854233206, + "loss": 1.8155, + "step": 1705 + }, + { + "epoch": 0.21625047534541766, + "grad_norm": 1.2734375, + "learning_rate": 0.00011309872593422948, + "loss": 2.1351, + "step": 1706 + }, + { + "epoch": 0.21637723412346305, + "grad_norm": 0.890625, + "learning_rate": 0.00011308807561517712, + "loss": 1.6095, + "step": 1707 + }, + { + "epoch": 0.21650399290150843, + "grad_norm": 0.890625, + "learning_rate": 0.00011307741758672162, + "loss": 1.5984, + "step": 1708 + }, + { + "epoch": 0.2166307516795538, + "grad_norm": 0.89453125, + "learning_rate": 0.00011306675185041071, + "loss": 1.7248, + "step": 1709 + }, + { + "epoch": 0.2167575104575992, + "grad_norm": 1.359375, + "learning_rate": 0.00011305607840779326, + "loss": 1.7754, + "step": 1710 + }, + { + "epoch": 0.21688426923564458, + "grad_norm": 0.97265625, + "learning_rate": 0.0001130453972604193, + "loss": 1.8595, + "step": 1711 + }, + { + "epoch": 0.21701102801368996, + "grad_norm": 0.9453125, + "learning_rate": 0.00011303470840983989, + "loss": 1.9527, + "step": 1712 + }, + { + "epoch": 0.21713778679173532, + "grad_norm": 0.87109375, + "learning_rate": 0.00011302401185760727, + "loss": 1.8134, + "step": 1713 + }, + { + "epoch": 0.2172645455697807, + "grad_norm": 0.9765625, + "learning_rate": 0.00011301330760527477, + "loss": 1.413, + "step": 1714 + }, + { + "epoch": 0.21739130434782608, + "grad_norm": 1.0703125, + "learning_rate": 0.00011300259565439689, + "loss": 1.6031, + "step": 1715 + }, + { + "epoch": 0.21751806312587146, + "grad_norm": 2.140625, + "learning_rate": 0.00011299187600652916, + "loss": 1.8899, + "step": 1716 + }, + { + "epoch": 0.21764482190391685, + "grad_norm": 0.84375, + "learning_rate": 0.00011298114866322833, + "loss": 2.4563, + "step": 1717 + }, + { + "epoch": 0.21777158068196223, + "grad_norm": 0.9140625, + "learning_rate": 0.00011297041362605218, + "loss": 2.176, + "step": 1718 + }, + { + "epoch": 0.2178983394600076, + "grad_norm": 1.078125, + "learning_rate": 0.00011295967089655963, + "loss": 1.972, + "step": 1719 + }, + { + "epoch": 0.218025098238053, + "grad_norm": 0.9375, + "learning_rate": 0.00011294892047631078, + "loss": 1.934, + "step": 1720 + }, + { + "epoch": 0.21815185701609838, + "grad_norm": 0.99609375, + "learning_rate": 0.00011293816236686675, + "loss": 2.1529, + "step": 1721 + }, + { + "epoch": 0.21827861579414373, + "grad_norm": 0.96484375, + "learning_rate": 0.00011292739656978984, + "loss": 1.9657, + "step": 1722 + }, + { + "epoch": 0.21840537457218911, + "grad_norm": 0.97265625, + "learning_rate": 0.00011291662308664347, + "loss": 1.852, + "step": 1723 + }, + { + "epoch": 0.2185321333502345, + "grad_norm": 0.84375, + "learning_rate": 0.00011290584191899211, + "loss": 1.8048, + "step": 1724 + }, + { + "epoch": 0.21865889212827988, + "grad_norm": 0.90234375, + "learning_rate": 0.00011289505306840142, + "loss": 2.0214, + "step": 1725 + }, + { + "epoch": 0.21878565090632526, + "grad_norm": 0.984375, + "learning_rate": 0.00011288425653643815, + "loss": 2.2263, + "step": 1726 + }, + { + "epoch": 0.21891240968437065, + "grad_norm": 0.94921875, + "learning_rate": 0.00011287345232467017, + "loss": 2.1432, + "step": 1727 + }, + { + "epoch": 0.21903916846241603, + "grad_norm": 0.88671875, + "learning_rate": 0.00011286264043466643, + "loss": 2.4881, + "step": 1728 + }, + { + "epoch": 0.2191659272404614, + "grad_norm": 1.03125, + "learning_rate": 0.00011285182086799705, + "loss": 2.3732, + "step": 1729 + }, + { + "epoch": 0.2192926860185068, + "grad_norm": 0.97265625, + "learning_rate": 0.00011284099362623322, + "loss": 1.6829, + "step": 1730 + }, + { + "epoch": 0.21941944479655215, + "grad_norm": 1.3671875, + "learning_rate": 0.00011283015871094727, + "loss": 1.1974, + "step": 1731 + }, + { + "epoch": 0.21954620357459753, + "grad_norm": 0.8828125, + "learning_rate": 0.00011281931612371263, + "loss": 1.457, + "step": 1732 + }, + { + "epoch": 0.21967296235264291, + "grad_norm": 0.890625, + "learning_rate": 0.00011280846586610387, + "loss": 2.1439, + "step": 1733 + }, + { + "epoch": 0.2197997211306883, + "grad_norm": 0.98828125, + "learning_rate": 0.00011279760793969663, + "loss": 1.6494, + "step": 1734 + }, + { + "epoch": 0.21992647990873368, + "grad_norm": 0.93359375, + "learning_rate": 0.0001127867423460677, + "loss": 1.8797, + "step": 1735 + }, + { + "epoch": 0.22005323868677906, + "grad_norm": 0.9453125, + "learning_rate": 0.00011277586908679498, + "loss": 2.0296, + "step": 1736 + }, + { + "epoch": 0.22017999746482445, + "grad_norm": 0.890625, + "learning_rate": 0.00011276498816345745, + "loss": 2.1084, + "step": 1737 + }, + { + "epoch": 0.22030675624286983, + "grad_norm": 1.03125, + "learning_rate": 0.00011275409957763525, + "loss": 1.8608, + "step": 1738 + }, + { + "epoch": 0.2204335150209152, + "grad_norm": 0.9609375, + "learning_rate": 0.0001127432033309096, + "loss": 2.0492, + "step": 1739 + }, + { + "epoch": 0.22056027379896057, + "grad_norm": 0.8984375, + "learning_rate": 0.00011273229942486287, + "loss": 1.9129, + "step": 1740 + }, + { + "epoch": 0.22068703257700595, + "grad_norm": 0.94140625, + "learning_rate": 0.00011272138786107848, + "loss": 2.1094, + "step": 1741 + }, + { + "epoch": 0.22081379135505133, + "grad_norm": 1.078125, + "learning_rate": 0.00011271046864114101, + "loss": 2.0271, + "step": 1742 + }, + { + "epoch": 0.2209405501330967, + "grad_norm": 0.9765625, + "learning_rate": 0.00011269954176663614, + "loss": 1.8049, + "step": 1743 + }, + { + "epoch": 0.2210673089111421, + "grad_norm": 0.92578125, + "learning_rate": 0.00011268860723915066, + "loss": 1.8328, + "step": 1744 + }, + { + "epoch": 0.22119406768918748, + "grad_norm": 0.94140625, + "learning_rate": 0.0001126776650602725, + "loss": 2.2105, + "step": 1745 + }, + { + "epoch": 0.22132082646723286, + "grad_norm": 1.0625, + "learning_rate": 0.00011266671523159062, + "loss": 2.2894, + "step": 1746 + }, + { + "epoch": 0.22144758524527824, + "grad_norm": 0.90625, + "learning_rate": 0.00011265575775469518, + "loss": 1.9724, + "step": 1747 + }, + { + "epoch": 0.22157434402332363, + "grad_norm": 0.859375, + "learning_rate": 0.0001126447926311774, + "loss": 2.0877, + "step": 1748 + }, + { + "epoch": 0.22170110280136898, + "grad_norm": 0.94140625, + "learning_rate": 0.00011263381986262965, + "loss": 1.4707, + "step": 1749 + }, + { + "epoch": 0.22182786157941436, + "grad_norm": 0.9609375, + "learning_rate": 0.00011262283945064534, + "loss": 2.0119, + "step": 1750 + }, + { + "epoch": 0.22195462035745975, + "grad_norm": 0.90625, + "learning_rate": 0.00011261185139681909, + "loss": 2.0175, + "step": 1751 + }, + { + "epoch": 0.22208137913550513, + "grad_norm": 0.98046875, + "learning_rate": 0.00011260085570274655, + "loss": 1.8443, + "step": 1752 + }, + { + "epoch": 0.2222081379135505, + "grad_norm": 0.9765625, + "learning_rate": 0.0001125898523700245, + "loss": 1.6954, + "step": 1753 + }, + { + "epoch": 0.2223348966915959, + "grad_norm": 1.0078125, + "learning_rate": 0.00011257884140025083, + "loss": 2.309, + "step": 1754 + }, + { + "epoch": 0.22246165546964128, + "grad_norm": 0.90625, + "learning_rate": 0.00011256782279502456, + "loss": 2.0581, + "step": 1755 + }, + { + "epoch": 0.22258841424768666, + "grad_norm": 0.9140625, + "learning_rate": 0.00011255679655594578, + "loss": 1.6548, + "step": 1756 + }, + { + "epoch": 0.22271517302573204, + "grad_norm": 0.984375, + "learning_rate": 0.00011254576268461574, + "loss": 1.8822, + "step": 1757 + }, + { + "epoch": 0.2228419318037774, + "grad_norm": 0.9140625, + "learning_rate": 0.00011253472118263674, + "loss": 1.6084, + "step": 1758 + }, + { + "epoch": 0.22296869058182278, + "grad_norm": 0.9453125, + "learning_rate": 0.00011252367205161224, + "loss": 2.2172, + "step": 1759 + }, + { + "epoch": 0.22309544935986816, + "grad_norm": 0.89453125, + "learning_rate": 0.00011251261529314678, + "loss": 2.0781, + "step": 1760 + }, + { + "epoch": 0.22322220813791355, + "grad_norm": 1.0078125, + "learning_rate": 0.00011250155090884601, + "loss": 1.52, + "step": 1761 + }, + { + "epoch": 0.22334896691595893, + "grad_norm": 0.84765625, + "learning_rate": 0.00011249047890031668, + "loss": 2.3089, + "step": 1762 + }, + { + "epoch": 0.2234757256940043, + "grad_norm": 0.83203125, + "learning_rate": 0.00011247939926916666, + "loss": 2.0268, + "step": 1763 + }, + { + "epoch": 0.2236024844720497, + "grad_norm": 1.09375, + "learning_rate": 0.00011246831201700493, + "loss": 2.0295, + "step": 1764 + }, + { + "epoch": 0.22372924325009508, + "grad_norm": 0.98828125, + "learning_rate": 0.00011245721714544159, + "loss": 1.8774, + "step": 1765 + }, + { + "epoch": 0.22385600202814046, + "grad_norm": 0.95703125, + "learning_rate": 0.00011244611465608779, + "loss": 1.9199, + "step": 1766 + }, + { + "epoch": 0.22398276080618582, + "grad_norm": 0.97265625, + "learning_rate": 0.00011243500455055586, + "loss": 1.6384, + "step": 1767 + }, + { + "epoch": 0.2241095195842312, + "grad_norm": 1.0859375, + "learning_rate": 0.00011242388683045916, + "loss": 2.2951, + "step": 1768 + }, + { + "epoch": 0.22423627836227658, + "grad_norm": 0.91015625, + "learning_rate": 0.00011241276149741223, + "loss": 2.0019, + "step": 1769 + }, + { + "epoch": 0.22436303714032196, + "grad_norm": 0.89453125, + "learning_rate": 0.00011240162855303065, + "loss": 1.5862, + "step": 1770 + }, + { + "epoch": 0.22448979591836735, + "grad_norm": 0.93359375, + "learning_rate": 0.00011239048799893118, + "loss": 2.2524, + "step": 1771 + }, + { + "epoch": 0.22461655469641273, + "grad_norm": 0.953125, + "learning_rate": 0.00011237933983673161, + "loss": 2.118, + "step": 1772 + }, + { + "epoch": 0.2247433134744581, + "grad_norm": 0.90234375, + "learning_rate": 0.00011236818406805086, + "loss": 2.0593, + "step": 1773 + }, + { + "epoch": 0.2248700722525035, + "grad_norm": 1.046875, + "learning_rate": 0.00011235702069450897, + "loss": 2.1958, + "step": 1774 + }, + { + "epoch": 0.22499683103054888, + "grad_norm": 1.109375, + "learning_rate": 0.00011234584971772708, + "loss": 1.7664, + "step": 1775 + }, + { + "epoch": 0.22512358980859423, + "grad_norm": 0.875, + "learning_rate": 0.00011233467113932743, + "loss": 1.7337, + "step": 1776 + }, + { + "epoch": 0.22525034858663961, + "grad_norm": 1.125, + "learning_rate": 0.00011232348496093338, + "loss": 1.9444, + "step": 1777 + }, + { + "epoch": 0.225377107364685, + "grad_norm": 0.94140625, + "learning_rate": 0.00011231229118416934, + "loss": 1.7405, + "step": 1778 + }, + { + "epoch": 0.22550386614273038, + "grad_norm": 0.96875, + "learning_rate": 0.00011230108981066086, + "loss": 1.5865, + "step": 1779 + }, + { + "epoch": 0.22563062492077576, + "grad_norm": 1.03125, + "learning_rate": 0.00011228988084203463, + "loss": 1.9092, + "step": 1780 + }, + { + "epoch": 0.22575738369882115, + "grad_norm": 1.0703125, + "learning_rate": 0.0001122786642799184, + "loss": 2.2845, + "step": 1781 + }, + { + "epoch": 0.22588414247686653, + "grad_norm": 0.87890625, + "learning_rate": 0.00011226744012594098, + "loss": 1.8124, + "step": 1782 + }, + { + "epoch": 0.2260109012549119, + "grad_norm": 0.8671875, + "learning_rate": 0.00011225620838173239, + "loss": 1.6382, + "step": 1783 + }, + { + "epoch": 0.2261376600329573, + "grad_norm": 0.98046875, + "learning_rate": 0.00011224496904892367, + "loss": 1.8672, + "step": 1784 + }, + { + "epoch": 0.22626441881100265, + "grad_norm": 0.88671875, + "learning_rate": 0.000112233722129147, + "loss": 2.5175, + "step": 1785 + }, + { + "epoch": 0.22639117758904803, + "grad_norm": 0.93359375, + "learning_rate": 0.00011222246762403561, + "loss": 1.8184, + "step": 1786 + }, + { + "epoch": 0.22651793636709341, + "grad_norm": 0.9375, + "learning_rate": 0.0001122112055352239, + "loss": 1.4041, + "step": 1787 + }, + { + "epoch": 0.2266446951451388, + "grad_norm": 0.89453125, + "learning_rate": 0.00011219993586434735, + "loss": 1.6064, + "step": 1788 + }, + { + "epoch": 0.22677145392318418, + "grad_norm": 0.8515625, + "learning_rate": 0.00011218865861304251, + "loss": 2.008, + "step": 1789 + }, + { + "epoch": 0.22689821270122956, + "grad_norm": 0.9296875, + "learning_rate": 0.00011217737378294707, + "loss": 1.6627, + "step": 1790 + }, + { + "epoch": 0.22702497147927495, + "grad_norm": 0.87109375, + "learning_rate": 0.00011216608137569978, + "loss": 2.0856, + "step": 1791 + }, + { + "epoch": 0.22715173025732033, + "grad_norm": 0.9609375, + "learning_rate": 0.00011215478139294054, + "loss": 1.4823, + "step": 1792 + }, + { + "epoch": 0.2272784890353657, + "grad_norm": 1.03125, + "learning_rate": 0.00011214347383631029, + "loss": 1.8086, + "step": 1793 + }, + { + "epoch": 0.22740524781341107, + "grad_norm": 0.87109375, + "learning_rate": 0.00011213215870745113, + "loss": 1.7201, + "step": 1794 + }, + { + "epoch": 0.22753200659145645, + "grad_norm": 0.94140625, + "learning_rate": 0.00011212083600800622, + "loss": 1.9477, + "step": 1795 + }, + { + "epoch": 0.22765876536950183, + "grad_norm": 0.95703125, + "learning_rate": 0.00011210950573961985, + "loss": 2.072, + "step": 1796 + }, + { + "epoch": 0.2277855241475472, + "grad_norm": 0.8828125, + "learning_rate": 0.00011209816790393737, + "loss": 1.9112, + "step": 1797 + }, + { + "epoch": 0.2279122829255926, + "grad_norm": 0.98828125, + "learning_rate": 0.00011208682250260526, + "loss": 2.2568, + "step": 1798 + }, + { + "epoch": 0.22803904170363798, + "grad_norm": 0.91796875, + "learning_rate": 0.0001120754695372711, + "loss": 2.1276, + "step": 1799 + }, + { + "epoch": 0.22816580048168336, + "grad_norm": 0.92578125, + "learning_rate": 0.00011206410900958355, + "loss": 1.9848, + "step": 1800 + }, + { + "epoch": 0.22829255925972874, + "grad_norm": 0.84765625, + "learning_rate": 0.00011205274092119235, + "loss": 2.4462, + "step": 1801 + }, + { + "epoch": 0.22841931803777413, + "grad_norm": 0.953125, + "learning_rate": 0.00011204136527374839, + "loss": 1.9101, + "step": 1802 + }, + { + "epoch": 0.22854607681581948, + "grad_norm": 0.93359375, + "learning_rate": 0.00011202998206890364, + "loss": 1.7892, + "step": 1803 + }, + { + "epoch": 0.22867283559386486, + "grad_norm": 0.953125, + "learning_rate": 0.00011201859130831114, + "loss": 2.0911, + "step": 1804 + }, + { + "epoch": 0.22879959437191025, + "grad_norm": 0.9765625, + "learning_rate": 0.00011200719299362506, + "loss": 2.2664, + "step": 1805 + }, + { + "epoch": 0.22892635314995563, + "grad_norm": 0.9609375, + "learning_rate": 0.00011199578712650062, + "loss": 2.2006, + "step": 1806 + }, + { + "epoch": 0.229053111928001, + "grad_norm": 1.046875, + "learning_rate": 0.00011198437370859421, + "loss": 1.9552, + "step": 1807 + }, + { + "epoch": 0.2291798707060464, + "grad_norm": 0.9609375, + "learning_rate": 0.00011197295274156326, + "loss": 1.9655, + "step": 1808 + }, + { + "epoch": 0.22930662948409178, + "grad_norm": 1.078125, + "learning_rate": 0.00011196152422706633, + "loss": 2.3358, + "step": 1809 + }, + { + "epoch": 0.22943338826213716, + "grad_norm": 0.9453125, + "learning_rate": 0.00011195008816676302, + "loss": 2.0809, + "step": 1810 + }, + { + "epoch": 0.22956014704018254, + "grad_norm": 1.09375, + "learning_rate": 0.00011193864456231411, + "loss": 1.9587, + "step": 1811 + }, + { + "epoch": 0.2296869058182279, + "grad_norm": 0.921875, + "learning_rate": 0.00011192719341538138, + "loss": 2.4708, + "step": 1812 + }, + { + "epoch": 0.22981366459627328, + "grad_norm": 0.91796875, + "learning_rate": 0.0001119157347276278, + "loss": 2.2584, + "step": 1813 + }, + { + "epoch": 0.22994042337431866, + "grad_norm": 0.91015625, + "learning_rate": 0.00011190426850071738, + "loss": 1.6815, + "step": 1814 + }, + { + "epoch": 0.23006718215236405, + "grad_norm": 0.796875, + "learning_rate": 0.00011189279473631521, + "loss": 1.2917, + "step": 1815 + }, + { + "epoch": 0.23019394093040943, + "grad_norm": 0.84375, + "learning_rate": 0.00011188131343608753, + "loss": 1.8444, + "step": 1816 + }, + { + "epoch": 0.2303206997084548, + "grad_norm": 0.9609375, + "learning_rate": 0.00011186982460170164, + "loss": 2.0416, + "step": 1817 + }, + { + "epoch": 0.2304474584865002, + "grad_norm": 0.97265625, + "learning_rate": 0.00011185832823482593, + "loss": 2.1699, + "step": 1818 + }, + { + "epoch": 0.23057421726454558, + "grad_norm": 0.9453125, + "learning_rate": 0.00011184682433712987, + "loss": 2.0034, + "step": 1819 + }, + { + "epoch": 0.23070097604259096, + "grad_norm": 0.984375, + "learning_rate": 0.0001118353129102841, + "loss": 1.6573, + "step": 1820 + }, + { + "epoch": 0.23082773482063632, + "grad_norm": 0.87109375, + "learning_rate": 0.00011182379395596025, + "loss": 2.3483, + "step": 1821 + }, + { + "epoch": 0.2309544935986817, + "grad_norm": 0.9765625, + "learning_rate": 0.00011181226747583111, + "loss": 1.7456, + "step": 1822 + }, + { + "epoch": 0.23108125237672708, + "grad_norm": 0.8828125, + "learning_rate": 0.00011180073347157054, + "loss": 2.1377, + "step": 1823 + }, + { + "epoch": 0.23120801115477246, + "grad_norm": 0.83984375, + "learning_rate": 0.00011178919194485352, + "loss": 1.8963, + "step": 1824 + }, + { + "epoch": 0.23133476993281785, + "grad_norm": 1.109375, + "learning_rate": 0.00011177764289735608, + "loss": 2.2232, + "step": 1825 + }, + { + "epoch": 0.23146152871086323, + "grad_norm": 0.94140625, + "learning_rate": 0.00011176608633075536, + "loss": 2.1092, + "step": 1826 + }, + { + "epoch": 0.2315882874889086, + "grad_norm": 0.92578125, + "learning_rate": 0.00011175452224672961, + "loss": 2.3346, + "step": 1827 + }, + { + "epoch": 0.231715046266954, + "grad_norm": 0.95703125, + "learning_rate": 0.00011174295064695814, + "loss": 1.6763, + "step": 1828 + }, + { + "epoch": 0.23184180504499938, + "grad_norm": 0.88671875, + "learning_rate": 0.00011173137153312137, + "loss": 1.6134, + "step": 1829 + }, + { + "epoch": 0.23196856382304473, + "grad_norm": 0.86328125, + "learning_rate": 0.00011171978490690082, + "loss": 1.9884, + "step": 1830 + }, + { + "epoch": 0.23209532260109011, + "grad_norm": 0.98828125, + "learning_rate": 0.00011170819076997907, + "loss": 2.1096, + "step": 1831 + }, + { + "epoch": 0.2322220813791355, + "grad_norm": 0.953125, + "learning_rate": 0.00011169658912403984, + "loss": 1.6547, + "step": 1832 + }, + { + "epoch": 0.23234884015718088, + "grad_norm": 1.0625, + "learning_rate": 0.00011168497997076789, + "loss": 1.6564, + "step": 1833 + }, + { + "epoch": 0.23247559893522626, + "grad_norm": 1.0234375, + "learning_rate": 0.0001116733633118491, + "loss": 2.0879, + "step": 1834 + }, + { + "epoch": 0.23260235771327165, + "grad_norm": 0.8828125, + "learning_rate": 0.0001116617391489704, + "loss": 2.3098, + "step": 1835 + }, + { + "epoch": 0.23272911649131703, + "grad_norm": 0.9296875, + "learning_rate": 0.00011165010748381988, + "loss": 2.2079, + "step": 1836 + }, + { + "epoch": 0.2328558752693624, + "grad_norm": 1.0859375, + "learning_rate": 0.00011163846831808667, + "loss": 1.9217, + "step": 1837 + }, + { + "epoch": 0.2329826340474078, + "grad_norm": 1.0078125, + "learning_rate": 0.00011162682165346099, + "loss": 2.422, + "step": 1838 + }, + { + "epoch": 0.23310939282545315, + "grad_norm": 0.90234375, + "learning_rate": 0.00011161516749163416, + "loss": 1.949, + "step": 1839 + }, + { + "epoch": 0.23323615160349853, + "grad_norm": 0.94140625, + "learning_rate": 0.0001116035058342986, + "loss": 1.7733, + "step": 1840 + }, + { + "epoch": 0.23336291038154391, + "grad_norm": 0.98828125, + "learning_rate": 0.0001115918366831478, + "loss": 1.9932, + "step": 1841 + }, + { + "epoch": 0.2334896691595893, + "grad_norm": 0.9609375, + "learning_rate": 0.00011158016003987632, + "loss": 1.722, + "step": 1842 + }, + { + "epoch": 0.23361642793763468, + "grad_norm": 0.94921875, + "learning_rate": 0.00011156847590617985, + "loss": 1.8859, + "step": 1843 + }, + { + "epoch": 0.23374318671568006, + "grad_norm": 0.98828125, + "learning_rate": 0.00011155678428375517, + "loss": 1.756, + "step": 1844 + }, + { + "epoch": 0.23386994549372545, + "grad_norm": 1.078125, + "learning_rate": 0.0001115450851743001, + "loss": 1.9091, + "step": 1845 + }, + { + "epoch": 0.23399670427177083, + "grad_norm": 0.96875, + "learning_rate": 0.00011153337857951357, + "loss": 2.2108, + "step": 1846 + }, + { + "epoch": 0.2341234630498162, + "grad_norm": 0.9921875, + "learning_rate": 0.00011152166450109562, + "loss": 1.6843, + "step": 1847 + }, + { + "epoch": 0.23425022182786157, + "grad_norm": 0.85546875, + "learning_rate": 0.00011150994294074737, + "loss": 2.2695, + "step": 1848 + }, + { + "epoch": 0.23437698060590695, + "grad_norm": 0.9296875, + "learning_rate": 0.00011149821390017098, + "loss": 2.1799, + "step": 1849 + }, + { + "epoch": 0.23450373938395233, + "grad_norm": 0.97265625, + "learning_rate": 0.00011148647738106973, + "loss": 2.1236, + "step": 1850 + }, + { + "epoch": 0.2346304981619977, + "grad_norm": 0.953125, + "learning_rate": 0.00011147473338514804, + "loss": 2.1795, + "step": 1851 + }, + { + "epoch": 0.2347572569400431, + "grad_norm": 0.95703125, + "learning_rate": 0.0001114629819141113, + "loss": 2.0387, + "step": 1852 + }, + { + "epoch": 0.23488401571808848, + "grad_norm": 0.8515625, + "learning_rate": 0.0001114512229696661, + "loss": 1.5641, + "step": 1853 + }, + { + "epoch": 0.23501077449613386, + "grad_norm": 0.91796875, + "learning_rate": 0.00011143945655352001, + "loss": 2.0649, + "step": 1854 + }, + { + "epoch": 0.23513753327417924, + "grad_norm": 0.8828125, + "learning_rate": 0.00011142768266738177, + "loss": 1.3388, + "step": 1855 + }, + { + "epoch": 0.23526429205222463, + "grad_norm": 1.015625, + "learning_rate": 0.00011141590131296119, + "loss": 2.2702, + "step": 1856 + }, + { + "epoch": 0.23539105083027, + "grad_norm": 0.91796875, + "learning_rate": 0.00011140411249196911, + "loss": 1.8902, + "step": 1857 + }, + { + "epoch": 0.23551780960831536, + "grad_norm": 0.8203125, + "learning_rate": 0.00011139231620611752, + "loss": 1.7341, + "step": 1858 + }, + { + "epoch": 0.23564456838636075, + "grad_norm": 0.83203125, + "learning_rate": 0.00011138051245711944, + "loss": 2.2396, + "step": 1859 + }, + { + "epoch": 0.23577132716440613, + "grad_norm": 0.96875, + "learning_rate": 0.00011136870124668903, + "loss": 1.6936, + "step": 1860 + }, + { + "epoch": 0.2358980859424515, + "grad_norm": 1.0234375, + "learning_rate": 0.00011135688257654148, + "loss": 2.4677, + "step": 1861 + }, + { + "epoch": 0.2360248447204969, + "grad_norm": 1.0, + "learning_rate": 0.0001113450564483931, + "loss": 2.1463, + "step": 1862 + }, + { + "epoch": 0.23615160349854228, + "grad_norm": 0.94921875, + "learning_rate": 0.00011133322286396127, + "loss": 1.9601, + "step": 1863 + }, + { + "epoch": 0.23627836227658766, + "grad_norm": 0.99609375, + "learning_rate": 0.00011132138182496445, + "loss": 1.9763, + "step": 1864 + }, + { + "epoch": 0.23640512105463304, + "grad_norm": 1.03125, + "learning_rate": 0.00011130953333312217, + "loss": 1.8936, + "step": 1865 + }, + { + "epoch": 0.23653187983267843, + "grad_norm": 0.95703125, + "learning_rate": 0.00011129767739015509, + "loss": 1.906, + "step": 1866 + }, + { + "epoch": 0.23665863861072378, + "grad_norm": 0.96484375, + "learning_rate": 0.0001112858139977849, + "loss": 2.3812, + "step": 1867 + }, + { + "epoch": 0.23678539738876916, + "grad_norm": 1.0, + "learning_rate": 0.00011127394315773438, + "loss": 1.6732, + "step": 1868 + }, + { + "epoch": 0.23691215616681455, + "grad_norm": 1.2421875, + "learning_rate": 0.00011126206487172741, + "loss": 2.1126, + "step": 1869 + }, + { + "epoch": 0.23703891494485993, + "grad_norm": 0.859375, + "learning_rate": 0.00011125017914148895, + "loss": 2.0343, + "step": 1870 + }, + { + "epoch": 0.2371656737229053, + "grad_norm": 0.8046875, + "learning_rate": 0.00011123828596874504, + "loss": 2.0351, + "step": 1871 + }, + { + "epoch": 0.2372924325009507, + "grad_norm": 1.109375, + "learning_rate": 0.0001112263853552228, + "loss": 2.495, + "step": 1872 + }, + { + "epoch": 0.23741919127899608, + "grad_norm": 1.1328125, + "learning_rate": 0.00011121447730265039, + "loss": 2.3829, + "step": 1873 + }, + { + "epoch": 0.23754595005704146, + "grad_norm": 0.88671875, + "learning_rate": 0.00011120256181275715, + "loss": 1.9801, + "step": 1874 + }, + { + "epoch": 0.23767270883508684, + "grad_norm": 0.8828125, + "learning_rate": 0.00011119063888727336, + "loss": 2.0665, + "step": 1875 + }, + { + "epoch": 0.2377994676131322, + "grad_norm": 0.94921875, + "learning_rate": 0.00011117870852793051, + "loss": 2.025, + "step": 1876 + }, + { + "epoch": 0.23792622639117758, + "grad_norm": 0.89453125, + "learning_rate": 0.00011116677073646113, + "loss": 1.9129, + "step": 1877 + }, + { + "epoch": 0.23805298516922296, + "grad_norm": 0.99609375, + "learning_rate": 0.00011115482551459876, + "loss": 2.3149, + "step": 1878 + }, + { + "epoch": 0.23817974394726835, + "grad_norm": 1.015625, + "learning_rate": 0.00011114287286407811, + "loss": 1.7753, + "step": 1879 + }, + { + "epoch": 0.23830650272531373, + "grad_norm": 0.99609375, + "learning_rate": 0.00011113091278663492, + "loss": 1.9654, + "step": 1880 + }, + { + "epoch": 0.2384332615033591, + "grad_norm": 0.91796875, + "learning_rate": 0.00011111894528400603, + "loss": 2.5581, + "step": 1881 + }, + { + "epoch": 0.2385600202814045, + "grad_norm": 0.8671875, + "learning_rate": 0.00011110697035792936, + "loss": 2.056, + "step": 1882 + }, + { + "epoch": 0.23868677905944988, + "grad_norm": 0.921875, + "learning_rate": 0.00011109498801014388, + "loss": 1.9327, + "step": 1883 + }, + { + "epoch": 0.23881353783749526, + "grad_norm": 1.0078125, + "learning_rate": 0.00011108299824238967, + "loss": 2.0871, + "step": 1884 + }, + { + "epoch": 0.23894029661554061, + "grad_norm": 0.9375, + "learning_rate": 0.00011107100105640786, + "loss": 1.9444, + "step": 1885 + }, + { + "epoch": 0.239067055393586, + "grad_norm": 0.92578125, + "learning_rate": 0.00011105899645394066, + "loss": 1.6214, + "step": 1886 + }, + { + "epoch": 0.23919381417163138, + "grad_norm": 0.9453125, + "learning_rate": 0.0001110469844367314, + "loss": 1.7278, + "step": 1887 + }, + { + "epoch": 0.23932057294967676, + "grad_norm": 0.9140625, + "learning_rate": 0.00011103496500652444, + "loss": 2.4886, + "step": 1888 + }, + { + "epoch": 0.23944733172772215, + "grad_norm": 0.87890625, + "learning_rate": 0.00011102293816506523, + "loss": 1.8917, + "step": 1889 + }, + { + "epoch": 0.23957409050576753, + "grad_norm": 0.86328125, + "learning_rate": 0.00011101090391410028, + "loss": 1.7682, + "step": 1890 + }, + { + "epoch": 0.2397008492838129, + "grad_norm": 0.93359375, + "learning_rate": 0.00011099886225537723, + "loss": 1.8318, + "step": 1891 + }, + { + "epoch": 0.2398276080618583, + "grad_norm": 0.9375, + "learning_rate": 0.00011098681319064473, + "loss": 1.8688, + "step": 1892 + }, + { + "epoch": 0.23995436683990368, + "grad_norm": 0.953125, + "learning_rate": 0.00011097475672165254, + "loss": 1.766, + "step": 1893 + }, + { + "epoch": 0.24008112561794903, + "grad_norm": 0.9765625, + "learning_rate": 0.00011096269285015149, + "loss": 2.0014, + "step": 1894 + }, + { + "epoch": 0.24020788439599441, + "grad_norm": 1.0546875, + "learning_rate": 0.0001109506215778935, + "loss": 2.0121, + "step": 1895 + }, + { + "epoch": 0.2403346431740398, + "grad_norm": 0.9921875, + "learning_rate": 0.00011093854290663152, + "loss": 2.0996, + "step": 1896 + }, + { + "epoch": 0.24046140195208518, + "grad_norm": 0.9765625, + "learning_rate": 0.00011092645683811964, + "loss": 1.8513, + "step": 1897 + }, + { + "epoch": 0.24058816073013056, + "grad_norm": 1.0078125, + "learning_rate": 0.00011091436337411296, + "loss": 2.5915, + "step": 1898 + }, + { + "epoch": 0.24071491950817595, + "grad_norm": 0.84765625, + "learning_rate": 0.00011090226251636768, + "loss": 2.4251, + "step": 1899 + }, + { + "epoch": 0.24084167828622133, + "grad_norm": 0.88671875, + "learning_rate": 0.0001108901542666411, + "loss": 1.9666, + "step": 1900 + }, + { + "epoch": 0.2409684370642667, + "grad_norm": 0.73828125, + "learning_rate": 0.00011087803862669156, + "loss": 2.2832, + "step": 1901 + }, + { + "epoch": 0.2410951958423121, + "grad_norm": 0.9140625, + "learning_rate": 0.00011086591559827847, + "loss": 2.1511, + "step": 1902 + }, + { + "epoch": 0.24122195462035745, + "grad_norm": 0.84765625, + "learning_rate": 0.00011085378518316236, + "loss": 2.0961, + "step": 1903 + }, + { + "epoch": 0.24134871339840283, + "grad_norm": 0.890625, + "learning_rate": 0.00011084164738310474, + "loss": 2.2358, + "step": 1904 + }, + { + "epoch": 0.2414754721764482, + "grad_norm": 0.9296875, + "learning_rate": 0.00011082950219986828, + "loss": 1.6418, + "step": 1905 + }, + { + "epoch": 0.2416022309544936, + "grad_norm": 0.97265625, + "learning_rate": 0.00011081734963521673, + "loss": 1.4844, + "step": 1906 + }, + { + "epoch": 0.24172898973253898, + "grad_norm": 0.84375, + "learning_rate": 0.00011080518969091481, + "loss": 2.1625, + "step": 1907 + }, + { + "epoch": 0.24185574851058436, + "grad_norm": 0.921875, + "learning_rate": 0.00011079302236872842, + "loss": 2.0725, + "step": 1908 + }, + { + "epoch": 0.24198250728862974, + "grad_norm": 0.8984375, + "learning_rate": 0.00011078084767042447, + "loss": 2.3077, + "step": 1909 + }, + { + "epoch": 0.24210926606667513, + "grad_norm": 0.984375, + "learning_rate": 0.00011076866559777096, + "loss": 1.6039, + "step": 1910 + }, + { + "epoch": 0.2422360248447205, + "grad_norm": 0.88671875, + "learning_rate": 0.00011075647615253696, + "loss": 1.7217, + "step": 1911 + }, + { + "epoch": 0.24236278362276586, + "grad_norm": 0.84765625, + "learning_rate": 0.00011074427933649261, + "loss": 1.8374, + "step": 1912 + }, + { + "epoch": 0.24248954240081125, + "grad_norm": 0.83203125, + "learning_rate": 0.00011073207515140915, + "loss": 1.5506, + "step": 1913 + }, + { + "epoch": 0.24261630117885663, + "grad_norm": 0.92578125, + "learning_rate": 0.00011071986359905881, + "loss": 2.2718, + "step": 1914 + }, + { + "epoch": 0.242743059956902, + "grad_norm": 0.83203125, + "learning_rate": 0.00011070764468121498, + "loss": 1.898, + "step": 1915 + }, + { + "epoch": 0.2428698187349474, + "grad_norm": 1.015625, + "learning_rate": 0.00011069541839965205, + "loss": 1.7358, + "step": 1916 + }, + { + "epoch": 0.24299657751299278, + "grad_norm": 0.92578125, + "learning_rate": 0.00011068318475614553, + "loss": 1.8377, + "step": 1917 + }, + { + "epoch": 0.24312333629103816, + "grad_norm": 1.1640625, + "learning_rate": 0.00011067094375247199, + "loss": 2.1218, + "step": 1918 + }, + { + "epoch": 0.24325009506908354, + "grad_norm": 1.0, + "learning_rate": 0.00011065869539040902, + "loss": 1.8304, + "step": 1919 + }, + { + "epoch": 0.24337685384712893, + "grad_norm": 0.87109375, + "learning_rate": 0.00011064643967173536, + "loss": 2.0517, + "step": 1920 + }, + { + "epoch": 0.24350361262517428, + "grad_norm": 0.9296875, + "learning_rate": 0.00011063417659823075, + "loss": 1.7884, + "step": 1921 + }, + { + "epoch": 0.24363037140321966, + "grad_norm": 0.9609375, + "learning_rate": 0.00011062190617167602, + "loss": 2.3266, + "step": 1922 + }, + { + "epoch": 0.24375713018126505, + "grad_norm": 0.9296875, + "learning_rate": 0.00011060962839385306, + "loss": 2.4072, + "step": 1923 + }, + { + "epoch": 0.24388388895931043, + "grad_norm": 0.859375, + "learning_rate": 0.00011059734326654488, + "loss": 1.6475, + "step": 1924 + }, + { + "epoch": 0.2440106477373558, + "grad_norm": 0.97265625, + "learning_rate": 0.00011058505079153546, + "loss": 1.6104, + "step": 1925 + }, + { + "epoch": 0.2441374065154012, + "grad_norm": 0.9921875, + "learning_rate": 0.00011057275097060997, + "loss": 1.9628, + "step": 1926 + }, + { + "epoch": 0.24426416529344658, + "grad_norm": 0.89453125, + "learning_rate": 0.0001105604438055545, + "loss": 1.8682, + "step": 1927 + }, + { + "epoch": 0.24439092407149196, + "grad_norm": 0.89453125, + "learning_rate": 0.00011054812929815636, + "loss": 2.0616, + "step": 1928 + }, + { + "epoch": 0.24451768284953734, + "grad_norm": 0.9765625, + "learning_rate": 0.00011053580745020381, + "loss": 2.1254, + "step": 1929 + }, + { + "epoch": 0.2446444416275827, + "grad_norm": 0.97265625, + "learning_rate": 0.00011052347826348621, + "loss": 2.4461, + "step": 1930 + }, + { + "epoch": 0.24477120040562808, + "grad_norm": 0.921875, + "learning_rate": 0.00011051114173979403, + "loss": 2.045, + "step": 1931 + }, + { + "epoch": 0.24489795918367346, + "grad_norm": 0.99609375, + "learning_rate": 0.00011049879788091874, + "loss": 2.0409, + "step": 1932 + }, + { + "epoch": 0.24502471796171885, + "grad_norm": 1.5859375, + "learning_rate": 0.0001104864466886529, + "loss": 2.2023, + "step": 1933 + }, + { + "epoch": 0.24515147673976423, + "grad_norm": 1.078125, + "learning_rate": 0.00011047408816479017, + "loss": 2.2773, + "step": 1934 + }, + { + "epoch": 0.2452782355178096, + "grad_norm": 0.9609375, + "learning_rate": 0.00011046172231112523, + "loss": 1.8321, + "step": 1935 + }, + { + "epoch": 0.245404994295855, + "grad_norm": 0.84375, + "learning_rate": 0.00011044934912945382, + "loss": 2.1143, + "step": 1936 + }, + { + "epoch": 0.24553175307390038, + "grad_norm": 0.91796875, + "learning_rate": 0.00011043696862157279, + "loss": 1.5387, + "step": 1937 + }, + { + "epoch": 0.24565851185194576, + "grad_norm": 1.2734375, + "learning_rate": 0.00011042458078927999, + "loss": 2.0486, + "step": 1938 + }, + { + "epoch": 0.24578527062999111, + "grad_norm": 1.0859375, + "learning_rate": 0.0001104121856343744, + "loss": 1.777, + "step": 1939 + }, + { + "epoch": 0.2459120294080365, + "grad_norm": 0.9921875, + "learning_rate": 0.00011039978315865603, + "loss": 1.4646, + "step": 1940 + }, + { + "epoch": 0.24603878818608188, + "grad_norm": 0.97265625, + "learning_rate": 0.00011038737336392596, + "loss": 2.4365, + "step": 1941 + }, + { + "epoch": 0.24616554696412726, + "grad_norm": 0.9921875, + "learning_rate": 0.00011037495625198631, + "loss": 2.4839, + "step": 1942 + }, + { + "epoch": 0.24629230574217265, + "grad_norm": 1.1875, + "learning_rate": 0.00011036253182464031, + "loss": 2.3482, + "step": 1943 + }, + { + "epoch": 0.24641906452021803, + "grad_norm": 0.7578125, + "learning_rate": 0.00011035010008369219, + "loss": 1.9241, + "step": 1944 + }, + { + "epoch": 0.2465458232982634, + "grad_norm": 1.1953125, + "learning_rate": 0.0001103376610309473, + "loss": 2.4444, + "step": 1945 + }, + { + "epoch": 0.2466725820763088, + "grad_norm": 0.84375, + "learning_rate": 0.00011032521466821204, + "loss": 1.9348, + "step": 1946 + }, + { + "epoch": 0.24679934085435418, + "grad_norm": 0.890625, + "learning_rate": 0.00011031276099729382, + "loss": 1.4657, + "step": 1947 + }, + { + "epoch": 0.24692609963239953, + "grad_norm": 1.0234375, + "learning_rate": 0.00011030030002000118, + "loss": 2.4435, + "step": 1948 + }, + { + "epoch": 0.24705285841044491, + "grad_norm": 1.078125, + "learning_rate": 0.00011028783173814369, + "loss": 1.8672, + "step": 1949 + }, + { + "epoch": 0.2471796171884903, + "grad_norm": 0.87109375, + "learning_rate": 0.00011027535615353197, + "loss": 2.0986, + "step": 1950 + }, + { + "epoch": 0.24730637596653568, + "grad_norm": 0.84765625, + "learning_rate": 0.00011026287326797774, + "loss": 1.6975, + "step": 1951 + }, + { + "epoch": 0.24743313474458106, + "grad_norm": 0.83984375, + "learning_rate": 0.00011025038308329372, + "loss": 1.9396, + "step": 1952 + }, + { + "epoch": 0.24755989352262645, + "grad_norm": 1.03125, + "learning_rate": 0.00011023788560129374, + "loss": 1.6226, + "step": 1953 + }, + { + "epoch": 0.24768665230067183, + "grad_norm": 0.8828125, + "learning_rate": 0.00011022538082379268, + "loss": 2.2463, + "step": 1954 + }, + { + "epoch": 0.2478134110787172, + "grad_norm": 0.94921875, + "learning_rate": 0.00011021286875260646, + "loss": 2.2083, + "step": 1955 + }, + { + "epoch": 0.2479401698567626, + "grad_norm": 0.89453125, + "learning_rate": 0.00011020034938955205, + "loss": 1.9937, + "step": 1956 + }, + { + "epoch": 0.24806692863480795, + "grad_norm": 0.734375, + "learning_rate": 0.00011018782273644757, + "loss": 1.8878, + "step": 1957 + }, + { + "epoch": 0.24819368741285333, + "grad_norm": 1.3046875, + "learning_rate": 0.00011017528879511206, + "loss": 2.2198, + "step": 1958 + }, + { + "epoch": 0.2483204461908987, + "grad_norm": 0.86328125, + "learning_rate": 0.00011016274756736572, + "loss": 1.9249, + "step": 1959 + }, + { + "epoch": 0.2484472049689441, + "grad_norm": 0.921875, + "learning_rate": 0.00011015019905502979, + "loss": 2.5262, + "step": 1960 + }, + { + "epoch": 0.24857396374698948, + "grad_norm": 0.953125, + "learning_rate": 0.00011013764325992652, + "loss": 1.3796, + "step": 1961 + }, + { + "epoch": 0.24870072252503486, + "grad_norm": 0.97265625, + "learning_rate": 0.00011012508018387925, + "loss": 1.7887, + "step": 1962 + }, + { + "epoch": 0.24882748130308024, + "grad_norm": 1.1640625, + "learning_rate": 0.00011011250982871242, + "loss": 2.5132, + "step": 1963 + }, + { + "epoch": 0.24895424008112563, + "grad_norm": 0.96875, + "learning_rate": 0.00011009993219625144, + "loss": 2.3026, + "step": 1964 + }, + { + "epoch": 0.249080998859171, + "grad_norm": 1.0, + "learning_rate": 0.00011008734728832286, + "loss": 1.6692, + "step": 1965 + }, + { + "epoch": 0.24920775763721636, + "grad_norm": 1.0859375, + "learning_rate": 0.00011007475510675421, + "loss": 2.0657, + "step": 1966 + }, + { + "epoch": 0.24933451641526175, + "grad_norm": 0.9609375, + "learning_rate": 0.00011006215565337416, + "loss": 1.8933, + "step": 1967 + }, + { + "epoch": 0.24946127519330713, + "grad_norm": 0.96484375, + "learning_rate": 0.00011004954893001237, + "loss": 1.6081, + "step": 1968 + }, + { + "epoch": 0.2495880339713525, + "grad_norm": 0.87109375, + "learning_rate": 0.00011003693493849956, + "loss": 1.7577, + "step": 1969 + }, + { + "epoch": 0.2497147927493979, + "grad_norm": 1.0078125, + "learning_rate": 0.00011002431368066754, + "loss": 2.6812, + "step": 1970 + }, + { + "epoch": 0.24984155152744328, + "grad_norm": 1.1953125, + "learning_rate": 0.00011001168515834915, + "loss": 2.5858, + "step": 1971 + }, + { + "epoch": 0.24996831030548866, + "grad_norm": 0.8359375, + "learning_rate": 0.00010999904937337833, + "loss": 2.075, + "step": 1972 + }, + { + "epoch": 0.250095069083534, + "grad_norm": 1.03125, + "learning_rate": 0.00010998640632758997, + "loss": 2.0207, + "step": 1973 + }, + { + "epoch": 0.2502218278615794, + "grad_norm": 0.953125, + "learning_rate": 0.00010997375602282014, + "loss": 1.7996, + "step": 1974 + }, + { + "epoch": 0.2503485866396248, + "grad_norm": 1.0546875, + "learning_rate": 0.00010996109846090588, + "loss": 2.0745, + "step": 1975 + }, + { + "epoch": 0.2504753454176702, + "grad_norm": 0.8359375, + "learning_rate": 0.00010994843364368533, + "loss": 1.9539, + "step": 1976 + }, + { + "epoch": 0.25060210419571555, + "grad_norm": 0.89453125, + "learning_rate": 0.00010993576157299763, + "loss": 1.8362, + "step": 1977 + }, + { + "epoch": 0.25072886297376096, + "grad_norm": 0.80859375, + "learning_rate": 0.00010992308225068303, + "loss": 1.6796, + "step": 1978 + }, + { + "epoch": 0.2508556217518063, + "grad_norm": 0.78515625, + "learning_rate": 0.00010991039567858279, + "loss": 2.194, + "step": 1979 + }, + { + "epoch": 0.25098238052985167, + "grad_norm": 0.9140625, + "learning_rate": 0.00010989770185853926, + "loss": 2.204, + "step": 1980 + }, + { + "epoch": 0.2511091393078971, + "grad_norm": 0.921875, + "learning_rate": 0.00010988500079239585, + "loss": 1.9912, + "step": 1981 + }, + { + "epoch": 0.25123589808594243, + "grad_norm": 0.87109375, + "learning_rate": 0.00010987229248199694, + "loss": 1.9439, + "step": 1982 + }, + { + "epoch": 0.25136265686398784, + "grad_norm": 0.94140625, + "learning_rate": 0.00010985957692918806, + "loss": 1.9681, + "step": 1983 + }, + { + "epoch": 0.2514894156420332, + "grad_norm": 0.9453125, + "learning_rate": 0.00010984685413581575, + "loss": 2.3029, + "step": 1984 + }, + { + "epoch": 0.2516161744200786, + "grad_norm": 1.0, + "learning_rate": 0.00010983412410372759, + "loss": 2.1463, + "step": 1985 + }, + { + "epoch": 0.25174293319812396, + "grad_norm": 0.90625, + "learning_rate": 0.00010982138683477223, + "loss": 1.6559, + "step": 1986 + }, + { + "epoch": 0.2518696919761694, + "grad_norm": 0.94921875, + "learning_rate": 0.00010980864233079935, + "loss": 2.3137, + "step": 1987 + }, + { + "epoch": 0.25199645075421473, + "grad_norm": 0.9765625, + "learning_rate": 0.00010979589059365972, + "loss": 1.5084, + "step": 1988 + }, + { + "epoch": 0.2521232095322601, + "grad_norm": 0.9296875, + "learning_rate": 0.00010978313162520511, + "loss": 1.7619, + "step": 1989 + }, + { + "epoch": 0.2522499683103055, + "grad_norm": 0.953125, + "learning_rate": 0.0001097703654272884, + "loss": 1.6992, + "step": 1990 + }, + { + "epoch": 0.25237672708835085, + "grad_norm": 0.859375, + "learning_rate": 0.00010975759200176346, + "loss": 2.2644, + "step": 1991 + }, + { + "epoch": 0.25250348586639626, + "grad_norm": 0.96484375, + "learning_rate": 0.00010974481135048524, + "loss": 1.8911, + "step": 1992 + }, + { + "epoch": 0.2526302446444416, + "grad_norm": 0.9765625, + "learning_rate": 0.00010973202347530973, + "loss": 1.855, + "step": 1993 + }, + { + "epoch": 0.252757003422487, + "grad_norm": 0.9140625, + "learning_rate": 0.000109719228378094, + "loss": 1.9577, + "step": 1994 + }, + { + "epoch": 0.2528837622005324, + "grad_norm": 0.953125, + "learning_rate": 0.00010970642606069609, + "loss": 2.1349, + "step": 1995 + }, + { + "epoch": 0.2530105209785778, + "grad_norm": 0.94921875, + "learning_rate": 0.00010969361652497519, + "loss": 2.3517, + "step": 1996 + }, + { + "epoch": 0.25313727975662315, + "grad_norm": 0.8671875, + "learning_rate": 0.00010968079977279148, + "loss": 2.1919, + "step": 1997 + }, + { + "epoch": 0.2532640385346685, + "grad_norm": 0.98046875, + "learning_rate": 0.00010966797580600617, + "loss": 2.1692, + "step": 1998 + }, + { + "epoch": 0.2533907973127139, + "grad_norm": 0.78125, + "learning_rate": 0.00010965514462648159, + "loss": 1.4257, + "step": 1999 + }, + { + "epoch": 0.25351755609075927, + "grad_norm": 0.84375, + "learning_rate": 0.00010964230623608104, + "loss": 2.3177, + "step": 2000 + }, + { + "epoch": 0.2536443148688047, + "grad_norm": 1.0, + "learning_rate": 0.00010962946063666892, + "loss": 1.971, + "step": 2001 + }, + { + "epoch": 0.25377107364685003, + "grad_norm": 1.1796875, + "learning_rate": 0.00010961660783011061, + "loss": 2.0185, + "step": 2002 + }, + { + "epoch": 0.25389783242489544, + "grad_norm": 0.8984375, + "learning_rate": 0.00010960374781827267, + "loss": 2.1336, + "step": 2003 + }, + { + "epoch": 0.2540245912029408, + "grad_norm": 0.91796875, + "learning_rate": 0.00010959088060302252, + "loss": 1.8294, + "step": 2004 + }, + { + "epoch": 0.2541513499809862, + "grad_norm": 1.2265625, + "learning_rate": 0.0001095780061862288, + "loss": 1.9698, + "step": 2005 + }, + { + "epoch": 0.25427810875903156, + "grad_norm": 0.96484375, + "learning_rate": 0.00010956512456976108, + "loss": 2.2703, + "step": 2006 + }, + { + "epoch": 0.2544048675370769, + "grad_norm": 0.8984375, + "learning_rate": 0.00010955223575549002, + "loss": 1.6022, + "step": 2007 + }, + { + "epoch": 0.25453162631512233, + "grad_norm": 0.8984375, + "learning_rate": 0.00010953933974528736, + "loss": 1.6724, + "step": 2008 + }, + { + "epoch": 0.2546583850931677, + "grad_norm": 0.80859375, + "learning_rate": 0.0001095264365410258, + "loss": 1.9459, + "step": 2009 + }, + { + "epoch": 0.2547851438712131, + "grad_norm": 1.0390625, + "learning_rate": 0.00010951352614457916, + "loss": 1.6412, + "step": 2010 + }, + { + "epoch": 0.25491190264925845, + "grad_norm": 1.0234375, + "learning_rate": 0.00010950060855782228, + "loss": 1.8433, + "step": 2011 + }, + { + "epoch": 0.25503866142730386, + "grad_norm": 0.796875, + "learning_rate": 0.00010948768378263101, + "loss": 1.879, + "step": 2012 + }, + { + "epoch": 0.2551654202053492, + "grad_norm": 0.97265625, + "learning_rate": 0.0001094747518208823, + "loss": 1.7705, + "step": 2013 + }, + { + "epoch": 0.2552921789833946, + "grad_norm": 0.92578125, + "learning_rate": 0.00010946181267445411, + "loss": 2.1051, + "step": 2014 + }, + { + "epoch": 0.25541893776144, + "grad_norm": 0.9375, + "learning_rate": 0.00010944886634522546, + "loss": 1.8151, + "step": 2015 + }, + { + "epoch": 0.25554569653948533, + "grad_norm": 0.859375, + "learning_rate": 0.00010943591283507639, + "loss": 2.1353, + "step": 2016 + }, + { + "epoch": 0.25567245531753074, + "grad_norm": 0.96484375, + "learning_rate": 0.00010942295214588801, + "loss": 1.8492, + "step": 2017 + }, + { + "epoch": 0.2557992140955761, + "grad_norm": 0.90625, + "learning_rate": 0.00010940998427954244, + "loss": 1.6428, + "step": 2018 + }, + { + "epoch": 0.2559259728736215, + "grad_norm": 0.86328125, + "learning_rate": 0.00010939700923792288, + "loss": 2.0904, + "step": 2019 + }, + { + "epoch": 0.25605273165166686, + "grad_norm": 1.0234375, + "learning_rate": 0.00010938402702291358, + "loss": 2.3051, + "step": 2020 + }, + { + "epoch": 0.2561794904297123, + "grad_norm": 0.859375, + "learning_rate": 0.00010937103763639975, + "loss": 1.9981, + "step": 2021 + }, + { + "epoch": 0.25630624920775763, + "grad_norm": 0.96875, + "learning_rate": 0.0001093580410802677, + "loss": 2.4197, + "step": 2022 + }, + { + "epoch": 0.25643300798580304, + "grad_norm": 0.94921875, + "learning_rate": 0.00010934503735640484, + "loss": 2.0751, + "step": 2023 + }, + { + "epoch": 0.2565597667638484, + "grad_norm": 0.93359375, + "learning_rate": 0.00010933202646669951, + "loss": 1.8955, + "step": 2024 + }, + { + "epoch": 0.25668652554189375, + "grad_norm": 1.0390625, + "learning_rate": 0.00010931900841304114, + "loss": 2.0562, + "step": 2025 + }, + { + "epoch": 0.25681328431993916, + "grad_norm": 0.9296875, + "learning_rate": 0.00010930598319732021, + "loss": 1.7824, + "step": 2026 + }, + { + "epoch": 0.2569400430979845, + "grad_norm": 0.83984375, + "learning_rate": 0.00010929295082142825, + "loss": 1.6555, + "step": 2027 + }, + { + "epoch": 0.2570668018760299, + "grad_norm": 0.9375, + "learning_rate": 0.00010927991128725778, + "loss": 1.6762, + "step": 2028 + }, + { + "epoch": 0.2571935606540753, + "grad_norm": 0.98046875, + "learning_rate": 0.00010926686459670239, + "loss": 2.1007, + "step": 2029 + }, + { + "epoch": 0.2573203194321207, + "grad_norm": 0.92578125, + "learning_rate": 0.00010925381075165673, + "loss": 2.1472, + "step": 2030 + }, + { + "epoch": 0.25744707821016605, + "grad_norm": 1.0, + "learning_rate": 0.00010924074975401646, + "loss": 2.3564, + "step": 2031 + }, + { + "epoch": 0.25757383698821146, + "grad_norm": 0.9296875, + "learning_rate": 0.00010922768160567829, + "loss": 1.9732, + "step": 2032 + }, + { + "epoch": 0.2577005957662568, + "grad_norm": 0.875, + "learning_rate": 0.00010921460630853994, + "loss": 2.1771, + "step": 2033 + }, + { + "epoch": 0.25782735454430217, + "grad_norm": 0.82421875, + "learning_rate": 0.0001092015238645002, + "loss": 1.7126, + "step": 2034 + }, + { + "epoch": 0.2579541133223476, + "grad_norm": 0.98828125, + "learning_rate": 0.00010918843427545892, + "loss": 2.0775, + "step": 2035 + }, + { + "epoch": 0.25808087210039293, + "grad_norm": 0.796875, + "learning_rate": 0.00010917533754331694, + "loss": 1.7782, + "step": 2036 + }, + { + "epoch": 0.25820763087843834, + "grad_norm": 0.89453125, + "learning_rate": 0.00010916223366997616, + "loss": 1.8822, + "step": 2037 + }, + { + "epoch": 0.2583343896564837, + "grad_norm": 0.81640625, + "learning_rate": 0.00010914912265733952, + "loss": 2.0884, + "step": 2038 + }, + { + "epoch": 0.2584611484345291, + "grad_norm": 0.953125, + "learning_rate": 0.00010913600450731095, + "loss": 2.1335, + "step": 2039 + }, + { + "epoch": 0.25858790721257446, + "grad_norm": 0.90625, + "learning_rate": 0.00010912287922179551, + "loss": 2.2726, + "step": 2040 + }, + { + "epoch": 0.2587146659906199, + "grad_norm": 0.86328125, + "learning_rate": 0.0001091097468026992, + "loss": 2.0254, + "step": 2041 + }, + { + "epoch": 0.25884142476866523, + "grad_norm": 0.94140625, + "learning_rate": 0.00010909660725192912, + "loss": 1.694, + "step": 2042 + }, + { + "epoch": 0.2589681835467106, + "grad_norm": 1.0625, + "learning_rate": 0.0001090834605713934, + "loss": 1.6937, + "step": 2043 + }, + { + "epoch": 0.259094942324756, + "grad_norm": 0.90625, + "learning_rate": 0.00010907030676300115, + "loss": 1.6514, + "step": 2044 + }, + { + "epoch": 0.25922170110280135, + "grad_norm": 0.95703125, + "learning_rate": 0.00010905714582866256, + "loss": 1.8929, + "step": 2045 + }, + { + "epoch": 0.25934845988084676, + "grad_norm": 0.9453125, + "learning_rate": 0.00010904397777028887, + "loss": 2.0018, + "step": 2046 + }, + { + "epoch": 0.2594752186588921, + "grad_norm": 0.84375, + "learning_rate": 0.00010903080258979233, + "loss": 1.289, + "step": 2047 + }, + { + "epoch": 0.2596019774369375, + "grad_norm": 0.96484375, + "learning_rate": 0.00010901762028908623, + "loss": 1.8984, + "step": 2048 + }, + { + "epoch": 0.2597287362149829, + "grad_norm": 0.8515625, + "learning_rate": 0.00010900443087008488, + "loss": 1.8592, + "step": 2049 + }, + { + "epoch": 0.2598554949930283, + "grad_norm": 0.96875, + "learning_rate": 0.00010899123433470365, + "loss": 2.3645, + "step": 2050 + }, + { + "epoch": 0.25998225377107365, + "grad_norm": 0.875, + "learning_rate": 0.0001089780306848589, + "loss": 1.7831, + "step": 2051 + }, + { + "epoch": 0.260109012549119, + "grad_norm": 1.078125, + "learning_rate": 0.0001089648199224681, + "loss": 2.2172, + "step": 2052 + }, + { + "epoch": 0.2602357713271644, + "grad_norm": 0.875, + "learning_rate": 0.00010895160204944966, + "loss": 1.7063, + "step": 2053 + }, + { + "epoch": 0.26036253010520977, + "grad_norm": 0.94140625, + "learning_rate": 0.0001089383770677231, + "loss": 1.6328, + "step": 2054 + }, + { + "epoch": 0.2604892888832552, + "grad_norm": 0.9375, + "learning_rate": 0.00010892514497920891, + "loss": 1.8716, + "step": 2055 + }, + { + "epoch": 0.26061604766130053, + "grad_norm": 0.87890625, + "learning_rate": 0.00010891190578582867, + "loss": 1.4233, + "step": 2056 + }, + { + "epoch": 0.26074280643934594, + "grad_norm": 0.90625, + "learning_rate": 0.00010889865948950494, + "loss": 2.43, + "step": 2057 + }, + { + "epoch": 0.2608695652173913, + "grad_norm": 0.98828125, + "learning_rate": 0.00010888540609216136, + "loss": 2.1284, + "step": 2058 + }, + { + "epoch": 0.2609963239954367, + "grad_norm": 0.9765625, + "learning_rate": 0.00010887214559572255, + "loss": 2.1232, + "step": 2059 + }, + { + "epoch": 0.26112308277348206, + "grad_norm": 0.8828125, + "learning_rate": 0.0001088588780021142, + "loss": 1.8737, + "step": 2060 + }, + { + "epoch": 0.2612498415515274, + "grad_norm": 0.94140625, + "learning_rate": 0.00010884560331326304, + "loss": 1.8746, + "step": 2061 + }, + { + "epoch": 0.26137660032957283, + "grad_norm": 0.93359375, + "learning_rate": 0.00010883232153109678, + "loss": 2.2546, + "step": 2062 + }, + { + "epoch": 0.2615033591076182, + "grad_norm": 0.94140625, + "learning_rate": 0.00010881903265754419, + "loss": 1.6832, + "step": 2063 + }, + { + "epoch": 0.2616301178856636, + "grad_norm": 0.8984375, + "learning_rate": 0.0001088057366945351, + "loss": 2.0841, + "step": 2064 + }, + { + "epoch": 0.26175687666370895, + "grad_norm": 0.9921875, + "learning_rate": 0.00010879243364400028, + "loss": 2.0414, + "step": 2065 + }, + { + "epoch": 0.26188363544175436, + "grad_norm": 0.9375, + "learning_rate": 0.00010877912350787164, + "loss": 1.5754, + "step": 2066 + }, + { + "epoch": 0.2620103942197997, + "grad_norm": 1.203125, + "learning_rate": 0.00010876580628808203, + "loss": 2.9128, + "step": 2067 + }, + { + "epoch": 0.2621371529978451, + "grad_norm": 0.84765625, + "learning_rate": 0.0001087524819865654, + "loss": 2.2536, + "step": 2068 + }, + { + "epoch": 0.2622639117758905, + "grad_norm": 0.91015625, + "learning_rate": 0.00010873915060525666, + "loss": 2.1688, + "step": 2069 + }, + { + "epoch": 0.26239067055393583, + "grad_norm": 0.92578125, + "learning_rate": 0.0001087258121460918, + "loss": 2.2801, + "step": 2070 + }, + { + "epoch": 0.26251742933198124, + "grad_norm": 0.84765625, + "learning_rate": 0.00010871246661100782, + "loss": 2.2957, + "step": 2071 + }, + { + "epoch": 0.2626441881100266, + "grad_norm": 1.0859375, + "learning_rate": 0.00010869911400194273, + "loss": 2.5681, + "step": 2072 + }, + { + "epoch": 0.262770946888072, + "grad_norm": 1.0390625, + "learning_rate": 0.0001086857543208356, + "loss": 1.9326, + "step": 2073 + }, + { + "epoch": 0.26289770566611737, + "grad_norm": 0.84765625, + "learning_rate": 0.00010867238756962652, + "loss": 2.1175, + "step": 2074 + }, + { + "epoch": 0.2630244644441628, + "grad_norm": 0.90625, + "learning_rate": 0.00010865901375025658, + "loss": 1.4167, + "step": 2075 + }, + { + "epoch": 0.26315122322220813, + "grad_norm": 0.84765625, + "learning_rate": 0.00010864563286466791, + "loss": 1.8575, + "step": 2076 + }, + { + "epoch": 0.26327798200025354, + "grad_norm": 0.94921875, + "learning_rate": 0.00010863224491480369, + "loss": 1.8406, + "step": 2077 + }, + { + "epoch": 0.2634047407782989, + "grad_norm": 0.97265625, + "learning_rate": 0.00010861884990260809, + "loss": 1.9168, + "step": 2078 + }, + { + "epoch": 0.26353149955634425, + "grad_norm": 0.875, + "learning_rate": 0.00010860544783002633, + "loss": 2.0365, + "step": 2079 + }, + { + "epoch": 0.26365825833438966, + "grad_norm": 0.88671875, + "learning_rate": 0.00010859203869900462, + "loss": 1.5835, + "step": 2080 + }, + { + "epoch": 0.263785017112435, + "grad_norm": 0.83203125, + "learning_rate": 0.00010857862251149028, + "loss": 1.8031, + "step": 2081 + }, + { + "epoch": 0.2639117758904804, + "grad_norm": 1.9765625, + "learning_rate": 0.00010856519926943155, + "loss": 3.324, + "step": 2082 + }, + { + "epoch": 0.2640385346685258, + "grad_norm": 2.90625, + "learning_rate": 0.00010855176897477775, + "loss": 3.0361, + "step": 2083 + }, + { + "epoch": 0.2641652934465712, + "grad_norm": 0.84765625, + "learning_rate": 0.0001085383316294792, + "loss": 2.0123, + "step": 2084 + }, + { + "epoch": 0.26429205222461655, + "grad_norm": 0.85546875, + "learning_rate": 0.0001085248872354873, + "loss": 1.7503, + "step": 2085 + }, + { + "epoch": 0.26441881100266196, + "grad_norm": 1.109375, + "learning_rate": 0.00010851143579475443, + "loss": 2.4202, + "step": 2086 + }, + { + "epoch": 0.2645455697807073, + "grad_norm": 0.84765625, + "learning_rate": 0.00010849797730923396, + "loss": 1.9654, + "step": 2087 + }, + { + "epoch": 0.26467232855875267, + "grad_norm": 0.85546875, + "learning_rate": 0.00010848451178088033, + "loss": 2.1493, + "step": 2088 + }, + { + "epoch": 0.2647990873367981, + "grad_norm": 0.890625, + "learning_rate": 0.00010847103921164902, + "loss": 1.9769, + "step": 2089 + }, + { + "epoch": 0.26492584611484343, + "grad_norm": 0.89453125, + "learning_rate": 0.00010845755960349647, + "loss": 1.6657, + "step": 2090 + }, + { + "epoch": 0.26505260489288884, + "grad_norm": 0.84765625, + "learning_rate": 0.00010844407295838019, + "loss": 2.1509, + "step": 2091 + }, + { + "epoch": 0.2651793636709342, + "grad_norm": 1.09375, + "learning_rate": 0.0001084305792782587, + "loss": 2.333, + "step": 2092 + }, + { + "epoch": 0.2653061224489796, + "grad_norm": 1.046875, + "learning_rate": 0.00010841707856509154, + "loss": 2.2181, + "step": 2093 + }, + { + "epoch": 0.26543288122702496, + "grad_norm": 0.90625, + "learning_rate": 0.00010840357082083928, + "loss": 1.6493, + "step": 2094 + }, + { + "epoch": 0.2655596400050704, + "grad_norm": 6.0, + "learning_rate": 0.00010839005604746349, + "loss": 2.9562, + "step": 2095 + }, + { + "epoch": 0.26568639878311573, + "grad_norm": 0.953125, + "learning_rate": 0.00010837653424692677, + "loss": 2.206, + "step": 2096 + }, + { + "epoch": 0.2658131575611611, + "grad_norm": 0.96484375, + "learning_rate": 0.00010836300542119276, + "loss": 2.0237, + "step": 2097 + }, + { + "epoch": 0.2659399163392065, + "grad_norm": 0.828125, + "learning_rate": 0.00010834946957222608, + "loss": 2.1401, + "step": 2098 + }, + { + "epoch": 0.26606667511725185, + "grad_norm": 0.9453125, + "learning_rate": 0.00010833592670199243, + "loss": 2.0903, + "step": 2099 + }, + { + "epoch": 0.26619343389529726, + "grad_norm": 0.83984375, + "learning_rate": 0.00010832237681245846, + "loss": 1.4192, + "step": 2100 + }, + { + "epoch": 0.2663201926733426, + "grad_norm": 0.87109375, + "learning_rate": 0.00010830881990559189, + "loss": 2.2133, + "step": 2101 + }, + { + "epoch": 0.266446951451388, + "grad_norm": 0.890625, + "learning_rate": 0.00010829525598336144, + "loss": 2.3092, + "step": 2102 + }, + { + "epoch": 0.2665737102294334, + "grad_norm": 0.91015625, + "learning_rate": 0.00010828168504773683, + "loss": 2.3911, + "step": 2103 + }, + { + "epoch": 0.2667004690074788, + "grad_norm": 0.9375, + "learning_rate": 0.00010826810710068886, + "loss": 1.4397, + "step": 2104 + }, + { + "epoch": 0.26682722778552415, + "grad_norm": 0.89453125, + "learning_rate": 0.00010825452214418928, + "loss": 2.2206, + "step": 2105 + }, + { + "epoch": 0.2669539865635695, + "grad_norm": 0.796875, + "learning_rate": 0.00010824093018021087, + "loss": 2.1534, + "step": 2106 + }, + { + "epoch": 0.2670807453416149, + "grad_norm": 0.9765625, + "learning_rate": 0.00010822733121072747, + "loss": 1.9457, + "step": 2107 + }, + { + "epoch": 0.26720750411966027, + "grad_norm": 0.8671875, + "learning_rate": 0.00010821372523771392, + "loss": 1.9259, + "step": 2108 + }, + { + "epoch": 0.2673342628977057, + "grad_norm": 0.859375, + "learning_rate": 0.00010820011226314606, + "loss": 2.0859, + "step": 2109 + }, + { + "epoch": 0.26746102167575103, + "grad_norm": 0.9453125, + "learning_rate": 0.00010818649228900073, + "loss": 1.9406, + "step": 2110 + }, + { + "epoch": 0.26758778045379644, + "grad_norm": 0.9296875, + "learning_rate": 0.0001081728653172558, + "loss": 1.9499, + "step": 2111 + }, + { + "epoch": 0.2677145392318418, + "grad_norm": 0.8828125, + "learning_rate": 0.00010815923134989023, + "loss": 2.0755, + "step": 2112 + }, + { + "epoch": 0.2678412980098872, + "grad_norm": 1.1328125, + "learning_rate": 0.00010814559038888387, + "loss": 2.1358, + "step": 2113 + }, + { + "epoch": 0.26796805678793256, + "grad_norm": 0.96875, + "learning_rate": 0.00010813194243621767, + "loss": 2.4974, + "step": 2114 + }, + { + "epoch": 0.2680948155659779, + "grad_norm": 0.953125, + "learning_rate": 0.00010811828749387361, + "loss": 1.8863, + "step": 2115 + }, + { + "epoch": 0.26822157434402333, + "grad_norm": 0.9375, + "learning_rate": 0.00010810462556383459, + "loss": 2.3304, + "step": 2116 + }, + { + "epoch": 0.2683483331220687, + "grad_norm": 0.94921875, + "learning_rate": 0.0001080909566480846, + "loss": 2.0889, + "step": 2117 + }, + { + "epoch": 0.2684750919001141, + "grad_norm": 1.015625, + "learning_rate": 0.00010807728074860866, + "loss": 2.0821, + "step": 2118 + }, + { + "epoch": 0.26860185067815945, + "grad_norm": 0.9921875, + "learning_rate": 0.00010806359786739273, + "loss": 1.9428, + "step": 2119 + }, + { + "epoch": 0.26872860945620486, + "grad_norm": 0.99609375, + "learning_rate": 0.00010804990800642386, + "loss": 1.7947, + "step": 2120 + }, + { + "epoch": 0.2688553682342502, + "grad_norm": 0.9609375, + "learning_rate": 0.00010803621116769004, + "loss": 2.5499, + "step": 2121 + }, + { + "epoch": 0.2689821270122956, + "grad_norm": 1.0, + "learning_rate": 0.00010802250735318035, + "loss": 1.9677, + "step": 2122 + }, + { + "epoch": 0.269108885790341, + "grad_norm": 0.90234375, + "learning_rate": 0.00010800879656488483, + "loss": 2.2482, + "step": 2123 + }, + { + "epoch": 0.26923564456838633, + "grad_norm": 0.890625, + "learning_rate": 0.00010799507880479456, + "loss": 2.3249, + "step": 2124 + }, + { + "epoch": 0.26936240334643174, + "grad_norm": 0.9453125, + "learning_rate": 0.00010798135407490158, + "loss": 1.7045, + "step": 2125 + }, + { + "epoch": 0.2694891621244771, + "grad_norm": 0.8828125, + "learning_rate": 0.00010796762237719904, + "loss": 1.5704, + "step": 2126 + }, + { + "epoch": 0.2696159209025225, + "grad_norm": 1.0546875, + "learning_rate": 0.00010795388371368104, + "loss": 2.1678, + "step": 2127 + }, + { + "epoch": 0.26974267968056787, + "grad_norm": 0.92578125, + "learning_rate": 0.00010794013808634264, + "loss": 1.8206, + "step": 2128 + }, + { + "epoch": 0.2698694384586133, + "grad_norm": 1.421875, + "learning_rate": 0.00010792638549718002, + "loss": 1.9321, + "step": 2129 + }, + { + "epoch": 0.26999619723665863, + "grad_norm": 0.96484375, + "learning_rate": 0.0001079126259481903, + "loss": 2.0523, + "step": 2130 + }, + { + "epoch": 0.27012295601470404, + "grad_norm": 0.87109375, + "learning_rate": 0.00010789885944137162, + "loss": 2.2796, + "step": 2131 + }, + { + "epoch": 0.2702497147927494, + "grad_norm": 0.98828125, + "learning_rate": 0.00010788508597872317, + "loss": 2.4429, + "step": 2132 + }, + { + "epoch": 0.27037647357079475, + "grad_norm": 0.96484375, + "learning_rate": 0.00010787130556224507, + "loss": 1.6398, + "step": 2133 + }, + { + "epoch": 0.27050323234884016, + "grad_norm": 0.98828125, + "learning_rate": 0.00010785751819393857, + "loss": 1.6361, + "step": 2134 + }, + { + "epoch": 0.2706299911268855, + "grad_norm": 0.86328125, + "learning_rate": 0.0001078437238758058, + "loss": 1.9116, + "step": 2135 + }, + { + "epoch": 0.2707567499049309, + "grad_norm": 0.9453125, + "learning_rate": 0.00010782992260984998, + "loss": 1.5352, + "step": 2136 + }, + { + "epoch": 0.2708835086829763, + "grad_norm": 0.84375, + "learning_rate": 0.00010781611439807534, + "loss": 1.8568, + "step": 2137 + }, + { + "epoch": 0.2710102674610217, + "grad_norm": 0.93359375, + "learning_rate": 0.00010780229924248705, + "loss": 1.891, + "step": 2138 + }, + { + "epoch": 0.27113702623906705, + "grad_norm": 0.890625, + "learning_rate": 0.00010778847714509136, + "loss": 1.99, + "step": 2139 + }, + { + "epoch": 0.27126378501711246, + "grad_norm": 1.2734375, + "learning_rate": 0.0001077746481078955, + "loss": 2.6804, + "step": 2140 + }, + { + "epoch": 0.2713905437951578, + "grad_norm": 0.9453125, + "learning_rate": 0.00010776081213290772, + "loss": 1.7029, + "step": 2141 + }, + { + "epoch": 0.27151730257320317, + "grad_norm": 0.93359375, + "learning_rate": 0.00010774696922213725, + "loss": 1.8984, + "step": 2142 + }, + { + "epoch": 0.2716440613512486, + "grad_norm": 0.9296875, + "learning_rate": 0.00010773311937759436, + "loss": 2.014, + "step": 2143 + }, + { + "epoch": 0.27177082012929393, + "grad_norm": 0.83203125, + "learning_rate": 0.0001077192626012903, + "loss": 2.3618, + "step": 2144 + }, + { + "epoch": 0.27189757890733934, + "grad_norm": 1.0859375, + "learning_rate": 0.00010770539889523736, + "loss": 2.218, + "step": 2145 + }, + { + "epoch": 0.2720243376853847, + "grad_norm": 0.87890625, + "learning_rate": 0.00010769152826144878, + "loss": 1.9896, + "step": 2146 + }, + { + "epoch": 0.2721510964634301, + "grad_norm": 0.78515625, + "learning_rate": 0.00010767765070193887, + "loss": 2.1676, + "step": 2147 + }, + { + "epoch": 0.27227785524147546, + "grad_norm": 1.1015625, + "learning_rate": 0.00010766376621872291, + "loss": 2.7233, + "step": 2148 + }, + { + "epoch": 0.2724046140195209, + "grad_norm": 0.87109375, + "learning_rate": 0.00010764987481381718, + "loss": 1.8668, + "step": 2149 + }, + { + "epoch": 0.27253137279756623, + "grad_norm": 0.921875, + "learning_rate": 0.00010763597648923902, + "loss": 1.8857, + "step": 2150 + }, + { + "epoch": 0.2726581315756116, + "grad_norm": 0.9921875, + "learning_rate": 0.00010762207124700666, + "loss": 2.1943, + "step": 2151 + }, + { + "epoch": 0.272784890353657, + "grad_norm": 0.88671875, + "learning_rate": 0.00010760815908913948, + "loss": 1.7987, + "step": 2152 + }, + { + "epoch": 0.27291164913170235, + "grad_norm": 0.98046875, + "learning_rate": 0.00010759424001765774, + "loss": 2.397, + "step": 2153 + }, + { + "epoch": 0.27303840790974776, + "grad_norm": 1.0625, + "learning_rate": 0.00010758031403458277, + "loss": 1.833, + "step": 2154 + }, + { + "epoch": 0.2731651666877931, + "grad_norm": 0.92578125, + "learning_rate": 0.00010756638114193689, + "loss": 1.8482, + "step": 2155 + }, + { + "epoch": 0.2732919254658385, + "grad_norm": 0.94921875, + "learning_rate": 0.00010755244134174344, + "loss": 1.7616, + "step": 2156 + }, + { + "epoch": 0.2734186842438839, + "grad_norm": 0.8515625, + "learning_rate": 0.00010753849463602673, + "loss": 1.8068, + "step": 2157 + }, + { + "epoch": 0.2735454430219293, + "grad_norm": 0.93359375, + "learning_rate": 0.00010752454102681209, + "loss": 2.4359, + "step": 2158 + }, + { + "epoch": 0.27367220179997465, + "grad_norm": 0.9453125, + "learning_rate": 0.00010751058051612584, + "loss": 2.1111, + "step": 2159 + }, + { + "epoch": 0.27379896057802, + "grad_norm": 0.98828125, + "learning_rate": 0.00010749661310599532, + "loss": 2.3023, + "step": 2160 + }, + { + "epoch": 0.2739257193560654, + "grad_norm": 0.86328125, + "learning_rate": 0.00010748263879844887, + "loss": 2.2653, + "step": 2161 + }, + { + "epoch": 0.27405247813411077, + "grad_norm": 0.88671875, + "learning_rate": 0.00010746865759551582, + "loss": 1.5266, + "step": 2162 + }, + { + "epoch": 0.2741792369121562, + "grad_norm": 0.9375, + "learning_rate": 0.00010745466949922653, + "loss": 2.2371, + "step": 2163 + }, + { + "epoch": 0.27430599569020153, + "grad_norm": 0.953125, + "learning_rate": 0.0001074406745116123, + "loss": 2.403, + "step": 2164 + }, + { + "epoch": 0.27443275446824694, + "grad_norm": 0.8359375, + "learning_rate": 0.0001074266726347055, + "loss": 1.9814, + "step": 2165 + }, + { + "epoch": 0.2745595132462923, + "grad_norm": 0.94140625, + "learning_rate": 0.00010741266387053945, + "loss": 1.6237, + "step": 2166 + }, + { + "epoch": 0.2746862720243377, + "grad_norm": 0.8984375, + "learning_rate": 0.00010739864822114852, + "loss": 1.8948, + "step": 2167 + }, + { + "epoch": 0.27481303080238306, + "grad_norm": 0.97265625, + "learning_rate": 0.000107384625688568, + "loss": 2.7144, + "step": 2168 + }, + { + "epoch": 0.2749397895804284, + "grad_norm": 0.96875, + "learning_rate": 0.00010737059627483427, + "loss": 2.0845, + "step": 2169 + }, + { + "epoch": 0.27506654835847383, + "grad_norm": 0.8984375, + "learning_rate": 0.00010735655998198467, + "loss": 1.9002, + "step": 2170 + }, + { + "epoch": 0.27506654835847383, + "eval_loss": 1.9766894578933716, + "eval_runtime": 46.4841, + "eval_samples_per_second": 55.417, + "eval_steps_per_second": 13.854, + "step": 2170 + } + ], + "logging_steps": 1, + "max_steps": 8678, + "num_input_tokens_seen": 0, + "num_train_epochs": 2, + "save_steps": 2170, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 5.385495432973517e+16, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +}